In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Build the cleaned news frame in one pass:
#   1. read the raw CSV,
#   2. drop the stray 'Unnamed: 0' column (when present) and exact duplicate rows,
#   3. parse 'Date' (unparseable values become NaT) and discard those rows,
#   4. order the rows by date and render 'Date' as a YYYY-MM-DD string,
#   5. de-duplicate again, because rows may have become identical once 'Date' is normalised.
df_news = (
    pd.read_csv('/Users/macbook/Documents/PhD_Documents/embedding_methods/news_data/news_total_file.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop_duplicates()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .dropna(subset=['Date'])
    .sort_values(by='Date')
    .assign(Date=lambda frame: frame['Date'].dt.strftime('%Y-%m-%d'))
    .drop_duplicates()
)

# Show the cleaned frame
print(df_news)

  df_news['Date'] = pd.to_datetime(df_news['Date'], errors='coerce')


              Date                                               Text
1328    2014-01-01  railcar north dakota crude train crash older l...
1329    2014-01-01  skorea dec crude oil import 84 pct yy prelimin...
1330    2014-01-01  updat 1iraq oil export averag 2341 mln bpd dec...
1331    2014-01-01  iraq oil export averag 2341 mln bpd decemb min...
1332    2014-01-01  brazil petrobra start product roncador field p...
...            ...                                                ...
592597  2024-01-31  us senat committe energi natur resourc full co...
592596  2024-01-31  new jersey natur ga compani new jersey natur g...
592595  2024-01-31  oil lower us crude stock build score first mon...
592665  2024-01-31                    oil price goe brent 8219 barrel
592857  2024-01-31  russia novak current oil price adequ reflect m...

[592858 rows x 2 columns]


In [2]:
# Cut df_news into six consecutive row slices (optional step).
# The first five slices hold `split_point` rows each; the sixth runs to the end
# of the frame, so it also receives any remainder rows.
split_point = len(df_news) // 6
split_edges = [part * split_point for part in range(6)] + [None]
df_news_1, df_news_2, df_news_3, df_news_4, df_news_5, df_news_6 = (
    df_news.iloc[start:stop] for start, stop in zip(split_edges, split_edges[1:])
)

In [2]:
# Choose the compute device: CUDA when torch reports it as available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained 'bert-base-uncased' tokenizer and encoder, placing the encoder on that device
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased').to(device)

# Define a function to get embeddings with different pooling strategies
def get_embedding(text, strategy="cls"):
    """Encode text with the module-level model and pool the token embeddings.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: A single string, or a list of strings to encode as one padded
            batch (inputs are truncated to 512 tokens).
        strategy: Pooling strategy applied over the token axis:
            ``"cls"``  - embedding of the first token,
            ``"mean"`` - attention-mask-weighted mean of the token embeddings,
            ``"max"``  - element-wise maximum over non-padding tokens,
            ``"sum"``  - element-wise sum over non-padding tokens.

    Returns:
        For a single string, a flat list of floats (one embedding vector).
        For a list of strings, a list of such vectors, one per input, even
        when the list holds exactly one string.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names. This is
            checked before tokenizing, so no forward pass is wasted.
    """
    if strategy not in ("cls", "mean", "max", "sum"):
        raise ValueError(f"Unknown pooling strategy: {strategy}")

    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**tokens)
    token_embeddings = outputs.last_hidden_state

    if strategy == "cls":
        pooled = token_embeddings[:, 0, :]
    else:
        # Broadcast the attention mask over the hidden dimension so padding
        # positions can be excluded from the pooling.
        mask_expanded = tokens['attention_mask'].unsqueeze(-1).expand(token_embeddings.size())
        is_padding = mask_expanded == 0
        if strategy == "max":
            # Fill padding with the lowest finite value of the tensor's dtype;
            # a literal -1e9 is not representable in float16.
            lowest = torch.finfo(token_embeddings.dtype).min
            pooled = torch.max(token_embeddings.masked_fill(is_padding, lowest), dim=1).values
        else:
            pooled = torch.sum(token_embeddings.masked_fill(is_padding, 0), dim=1)
            if strategy == "mean":
                # clamp guards against a division by zero for an all-padding row
                pooled = pooled / mask_expanded.sum(dim=1).clamp(min=1)

    # Only strip the batch axis for a single string; a bare squeeze() would
    # also flatten a one-element list and make the return shape inconsistent.
    if isinstance(text, str):
        pooled = pooled.squeeze(0)
    return pooled.tolist()

# Embed every row of df_news['Text'], one text per call, with a progress bar,
# and store the resulting vectors in a new 'embeddings' column.
pooling_strategy = "mean"  # alternatives: "cls", "max", "sum"
embedding_rows = []
for text in tqdm(df_news['Text'], desc="Processing Text"):
    embedding_rows.append(get_embedding(text, strategy=pooling_strategy))
df_news['embeddings'] = embedding_rows

  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/macbook/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  warn(
Processing Text: 100%|████████████████| 592858/592858 [7:12:31<00:00, 22.85it/s]


In [4]:
# Inspect df_news now that it carries the 'embeddings' column
df_news

Unnamed: 0,Date,Text,embeddings
1328,2014-01-01,railcar north dakota crude train crash older l...,"[0.28228703141212463, -0.3216705620288849, -0...."
1329,2014-01-01,skorea dec crude oil import 84 pct yy prelimin...,"[-0.11626404523849487, -0.01936078630387783, 0..."
1330,2014-01-01,updat 1iraq oil export averag 2341 mln bpd dec...,"[-0.24160714447498322, -0.2372460812330246, 0...."
1331,2014-01-01,iraq oil export averag 2341 mln bpd decemb min...,"[-0.19026753306388855, -0.043890636414289474, ..."
1332,2014-01-01,brazil petrobra start product roncador field p...,"[0.03664577379822731, -0.08797629922628403, 0...."
...,...,...,...
592597,2024-01-31,us senat committe energi natur resourc full co...,"[-0.090078204870224, -0.0732942745089531, 0.33..."
592596,2024-01-31,new jersey natur ga compani new jersey natur g...,"[-0.03526687994599342, -0.4973338842391968, 0...."
592595,2024-01-31,oil lower us crude stock build score first mon...,"[0.0017135110683739185, -0.4250740110874176, 0..."
592665,2024-01-31,oil price goe brent 8219 barrel,"[-0.07852102816104889, -0.4520896077156067, 0...."


In [5]:
# Turn each embedding vector into its own feature_<i> columns (same row index
# as df_news), then place them next to the remaining df_news columns.
feature_frame = pd.DataFrame(
    df_news['embeddings'].tolist(), index=df_news.index
).add_prefix('feature_')
article_frame = df_news.drop(columns=['embeddings'])
final_df = pd.concat([article_frame, feature_frame], axis=1)

# Show the widened frame
print(final_df)

              Date                                               Text  \
1328    2014-01-01  railcar north dakota crude train crash older l...   
1329    2014-01-01  skorea dec crude oil import 84 pct yy prelimin...   
1330    2014-01-01  updat 1iraq oil export averag 2341 mln bpd dec...   
1331    2014-01-01  iraq oil export averag 2341 mln bpd decemb min...   
1332    2014-01-01  brazil petrobra start product roncador field p...   
...            ...                                                ...   
592597  2024-01-31  us senat committe energi natur resourc full co...   
592596  2024-01-31  new jersey natur ga compani new jersey natur g...   
592595  2024-01-31  oil lower us crude stock build score first mon...   
592665  2024-01-31                    oil price goe brent 8219 barrel   
592857  2024-01-31  russia novak current oil price adequ reflect m...   

        feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
1328     0.282287  -0.321671  -0.167749   0.193

In [6]:
# Write the per-article frame (Date, Text and the feature_* columns) to a CSV
# file in the current working directory. No `index` argument is given, so the
# row index is written as well (pandas default).
final_df.to_csv('Bert_total_embedding.csv')

In [7]:
# Drop the 'Text' column so only 'Date' and the feature_* columns remain.
# errors='ignore' keeps this cell re-runnable: final_df is rebound here, so on
# a second execution 'Text' is already gone and a plain drop would raise KeyError.
final_df = final_df.drop(columns=['Text'], errors='ignore')

# Average each feature over all rows that share the same 'Date'
mean_grouped_df = final_df.groupby('Date').mean()

# Display the resulting dataframe
print(mean_grouped_df)

            feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
Date                                                                           
2014-01-01  -0.043475  -0.164988   0.183225   0.056277   0.375607  -0.029788   
2014-01-02  -0.035107  -0.170385   0.150354   0.019579   0.319369  -0.060770   
2014-01-03  -0.047644  -0.170381   0.165911   0.066791   0.340221  -0.021383   
2014-01-04   0.032417  -0.075082  -0.047487  -0.078541   0.318565  -0.087398   
2014-01-05   0.007307  -0.161989   0.147542  -0.184680   0.218439  -0.140416   
...               ...        ...        ...        ...        ...        ...   
2024-01-27  -0.084876  -0.236796   0.185873  -0.025501   0.356431  -0.021305   
2024-01-28   0.009375  -0.175774   0.186066  -0.032545   0.344828  -0.078430   
2024-01-29  -0.046906  -0.213831   0.225373   0.012302   0.337351  -0.050423   
2024-01-30  -0.036904  -0.209402   0.223687   0.039722   0.343290  -0.059763   
2024-01-31  -0.085764  -0.225207   0.244

In [8]:
# Write the per-date mean embeddings to a CSV file in the current working
# directory; the 'Date' group keys form the index and are written with it.
mean_grouped_df.to_csv('BERT_mean_embedding.csv')