In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [2]:
df_news = pd.read_csv('/Users/macbook/Documents/PhD_Documents/embedding_methods/news_data/news_total_file.csv')
df_news

Unnamed: 0,Date,Text
0,8/16/21,China's daily crude steel output down 4.4 pct ...
1,8/16/21,MONETARIO - Cosa succede oggi lunedì 16 agosto...
2,8/16/21,India's GSPC seeks 18 LNG cargoes for 2022-202...
3,8/16/21,BUZZ-COMMENT-Norges Bank could change the NOK'...
4,8/16/21,MERCADOS GLOBALES-Acciones mundiales caen por ...
...,...,...
592853,1/31/24,govern samoa februari 2024 retail fuel price
592854,1/31/24,quadris fuel intern lonqfi share price pass 20...
592855,1/31/24,hapaglloyd ceo freight rate rise q1 2024 compa...
592856,1/31/24,buzza justifi volatil premium eurusd fx option


In [4]:
import pandas as pd

# Example DataFrame (assuming you have already read your data into df_news)
# df_news = pd.read_csv('your_file.csv')

# Calculate the split point
split_point = len(df_news) // 6

# Create the DataFrames
df_news_1 = df_news.iloc[:split_point]
df_news_2 = df_news.iloc[split_point:2*split_point]
df_news_3 = df_news.iloc[2*split_point:3*split_point]
df_news_4 = df_news.iloc[3*split_point:4*split_point]
df_news_5 = df_news.iloc[4*split_point:5*split_point]
df_news_6 = df_news.iloc[5*split_point:]

# Display the resulting DataFrames
print("DataFrame 1")
print(df_news_1)
print("\nDataFrame 2")
print(df_news_2)
print("\nDataFrame 3")
print(df_news_3)
print("\nDataFrame 4")
print(df_news_4)
print("\nDataFrame 5")
print(df_news_5)
print("\nDataFrame 6")
print(df_news_6)

DataFrame 1
                                                     text date_column
0       railcar north dakota crude train crash older l...  2014-01-01
1       skorea dec crude oil import 84 pct yy prelimin...  2014-01-01
2       updat 1iraq oil export averag 2341 mln bpd dec...  2014-01-01
3       iraq oil export averag 2341 mln bpd decemb min...  2014-01-01
4       brazil petrobra start product roncador field p...  2014-01-01
...                                                   ...         ...
105664                   buzzvalero refin due report week  2016-10-24
105665  updat 1u natga fall near 7week low light heat ...  2016-10-24
105666              enterpris restart seaway twin pipelin  2016-10-24
105667    updat 1alberta mull set green electr target law  2016-10-24
105668  updat 9oil dip buzzard restart iraq us crude t...  2016-10-24

[105669 rows x 2 columns]

DataFrame 2
                                                     text date_column
105669                   skoreamarket 

In [3]:
# Step 2: Load the FinBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = AutoModel.from_pretrained('yiyanghkust/finbert-tone')

# Check if GPU is available and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 3: Define a function to get embeddings with different pooling strategies
def get_embedding(text, strategy="cls"):
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**tokens)
    token_embeddings = outputs.last_hidden_state
    attention_mask = tokens['attention_mask']
    
    if strategy == "cls":
        # [CLS] token embedding
        cls_embedding = token_embeddings[:, 0, :]
        return cls_embedding.squeeze().tolist()
    elif strategy == "mean":
        # Mean pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        token_embeddings_masked = token_embeddings.masked_fill(mask_expanded == 0, 0)
        sum_embeddings = torch.sum(token_embeddings_masked, dim=1)
        sum_mask = mask_expanded.sum(dim=1)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings.squeeze().tolist()
    elif strategy == "max":
        # Max pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        token_embeddings_masked = token_embeddings.masked_fill(mask_expanded == 0, -1e9)
        max_embeddings = torch.max(token_embeddings_masked, dim=1).values
        return max_embeddings.squeeze().tolist()
    elif strategy == "sum":
        # Sum pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        token_embeddings_masked = token_embeddings.masked_fill(mask_expanded == 0, 0)
        sum_embeddings = torch.sum(token_embeddings_masked, dim=1)
        return sum_embeddings.squeeze().tolist()
    else:
        raise ValueError(f"Unknown pooling strategy: {strategy}")

# Step 4: Apply the function to your dataframe and format the embeddings
pooling_strategy = "mean"  # Change this to "mean", "max", or "sum" for other strategies
df_news['embeddings'] = [get_embedding(text, strategy=pooling_strategy) for text in tqdm(df_news['Text'], desc="Processing Text")]

  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/macbook/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  warn(
Processing Text: 100%|████████████████| 592858/592858 [6:12:55<00:00, 26.50it/s]


In [4]:
# Split the embeddings into separate columns
embeddings_df = pd.DataFrame(df_news['embeddings'].tolist(), index=df_news.index)
embeddings_df.columns = [f'feature_{i}' for i in range(embeddings_df.shape[1])]

# Concatenate the original dataframe with the embeddings dataframe
final_df = pd.concat([df_news.drop(columns=['embeddings']), embeddings_df], axis=1)

# Display the final dataframe
print(final_df)

           Date                                               Text  feature_0  \
0       8/16/21  China's daily crude steel output down 4.4 pct ...  -0.478040   
1       8/16/21  MONETARIO - Cosa succede oggi lunedì 16 agosto...  -0.458494   
2       8/16/21  India's GSPC seeks 18 LNG cargoes for 2022-202...  -0.906667   
3       8/16/21  BUZZ-COMMENT-Norges Bank could change the NOK'...  -0.730406   
4       8/16/21  MERCADOS GLOBALES-Acciones mundiales caen por ...  -0.473542   
...         ...                                                ...        ...   
592853  1/31/24       govern samoa februari 2024 retail fuel price  -0.756381   
592854  1/31/24  quadris fuel intern lonqfi share price pass 20...   0.053709   
592855  1/31/24  hapaglloyd ceo freight rate rise q1 2024 compa...   0.093769   
592856  1/31/24     buzza justifi volatil premium eurusd fx option  -0.184825   
592857  1/31/24  russia novak current oil price adequ reflect m...  -1.103157   

        feature_1  feature_

In [7]:
final_df = final_df.drop_duplicates()
final_df

Unnamed: 0,Date,headlines_text,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
0,8/16/21,China's daily crude steel output down 4.4 pct ...,-0.478040,-0.206937,-0.237913,0.625718,1.206705,-0.319898,0.181293,0.294610,...,-0.179894,0.129155,0.463228,-0.469056,0.201067,0.246235,0.490512,-0.063636,-0.243678,0.440305
1,8/16/21,MONETARIO - Cosa succede oggi lunedì 16 agosto...,-0.458494,-0.911227,-0.262384,0.902497,0.707238,-0.443270,0.065684,0.066894,...,0.589834,0.348195,0.562928,-0.216789,0.565013,0.239647,0.163024,0.124609,-0.413840,0.637806
2,8/16/21,India's GSPC seeks 18 LNG cargoes for 2022-202...,-0.906667,-0.404089,0.071872,0.701861,1.508246,-0.985673,0.359188,0.393654,...,0.165708,0.216833,0.622468,-0.505837,0.061912,0.306290,-0.069731,-0.212861,-0.314667,0.435713
3,8/16/21,BUZZ-COMMENT-Norges Bank could change the NOK'...,-0.730406,-0.364585,-0.198608,0.763500,1.315975,-0.822187,0.379544,0.399916,...,-0.039284,0.273881,0.334114,-0.665618,0.081358,0.044733,0.111654,-0.218350,-0.198458,0.620108
4,8/16/21,MERCADOS GLOBALES-Acciones mundiales caen por ...,-0.473542,-0.403810,-0.411597,0.240100,0.685240,-0.490537,-0.273956,0.075465,...,0.035327,0.214341,0.462394,-0.354211,0.586436,-0.387977,0.069431,-0.108758,-0.060990,0.272730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,12/31/21,Ecuador reinicia bombeo en oleoducto estatal S...,-0.348012,-0.401753,-0.225240,0.627184,0.678730,-0.552752,-0.329053,-0.216153,...,0.153721,-0.056699,0.437953,-0.388083,0.256831,0.152160,-0.186199,-0.097916,-0.142432,0.769660
1324,12/31/21,Briefing.com: Economic Calendar - 11:00 ET Bri...,-0.339153,-0.574426,-0.351661,0.710120,0.538819,-0.767029,-0.097468,-0.300237,...,0.092711,-0.203939,0.334416,-0.247252,0.342119,0.201984,0.077758,-0.128292,-0.148337,0.729418
1325,12/31/21,TABLE-U.S. oils and fats - Dec 31 Briefing.com...,-0.014145,-0.653742,-0.101783,0.423280,0.794195,-0.449417,-0.228989,-0.117952,...,0.214811,-0.069311,0.518744,-0.225329,0.124772,0.382300,-0.049314,0.129456,-0.315121,0.953014
1326,12/31/21,"GRAPHIC-Global Markets in 2021: Recoveries, re...",0.016812,-0.628152,-0.013515,0.539754,0.692239,-0.373455,-0.111073,-0.380987,...,0.459806,-0.028528,0.678263,-0.278288,-0.022508,0.598848,0.017901,-0.049136,-0.394390,0.626624


In [5]:
# Assuming df_news already exists and contains the 'Date' column
# Convert 'Date' column to datetime format with y-m-d format
final_df['Date'] = pd.to_datetime(final_df['Date'], format='%m/%d/%y')

# Sort the dataframe by the 'Date' column
final_df = final_df.sort_values(by='Date')
final_df

Unnamed: 0,Date,Text,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
1328,2014-01-01,railcar north dakota crude train crash older l...,-0.117663,-0.648179,-0.380194,1.074427,0.850166,-0.361610,-0.377292,-0.144328,...,-0.334138,0.713372,0.627144,-0.208887,0.030082,0.104941,-0.874622,0.294833,-0.188096,0.177770
1329,2014-01-01,skorea dec crude oil import 84 pct yy prelimin...,-0.415424,-1.390731,-0.441689,0.577448,0.769293,-0.586521,0.166580,0.147339,...,-0.244302,-0.105316,-0.288068,0.072274,0.764244,0.039801,0.403113,0.027078,0.007685,0.461774
1330,2014-01-01,updat 1iraq oil export averag 2341 mln bpd dec...,-0.353768,-0.755624,-0.989849,0.753098,0.748832,-0.606757,-0.084129,-0.124299,...,-0.442467,0.547077,0.046707,-0.088854,0.948733,-0.436672,-0.082338,-0.147943,-0.455893,0.659583
1331,2014-01-01,iraq oil export averag 2341 mln bpd decemb min...,-0.092134,-0.648626,-0.721545,0.836451,0.364822,-0.739531,-0.017598,0.057717,...,-0.421450,0.841109,0.011853,-0.219272,1.304576,-0.307614,-0.027879,0.027619,-0.463990,0.630149
1332,2014-01-01,brazil petrobra start product roncador field p...,-0.481013,-0.381337,-0.403017,1.413309,0.714893,-0.474234,-0.088604,-0.079588,...,0.682829,0.424708,0.092385,-0.331078,0.533973,-0.092779,0.364675,0.754015,-0.314879,0.320257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592597,2024-01-31,us senat committe energi natur resourc full co...,0.105376,-0.464036,-0.178585,0.703063,0.845907,-0.355575,-0.013659,-0.267397,...,0.214506,0.619455,0.109946,-0.107634,0.870061,-0.483972,-0.177516,0.136487,0.059178,0.603373
592596,2024-01-31,new jersey natur ga compani new jersey natur g...,-0.072483,-0.353813,0.117226,0.457536,0.652683,-1.039679,0.366653,-0.890763,...,0.384625,-0.591737,0.551671,-0.432489,0.765853,0.037079,0.022143,-0.372637,0.456148,-0.073149
592595,2024-01-31,oil lower us crude stock build score first mon...,-0.346903,-0.874956,0.172102,1.079383,0.324581,-0.366034,-0.604182,0.531443,...,0.733527,0.355386,0.439683,-0.527491,0.556126,0.458873,-0.169520,0.073896,-0.593351,0.042039
592665,2024-01-31,oil price goe brent 8219 barrel,-0.876426,-1.584615,-0.571842,0.812784,0.794509,-1.155014,-0.360731,0.520648,...,-0.497745,-0.306834,-0.030179,0.310890,0.692007,0.764428,-0.135121,0.086822,-0.082296,-0.128443


In [6]:
final_df.to_csv('FinBERT_total_embedding.csv')

In [7]:
# Drop the 'Text' column
final_df = final_df.drop(columns=['Text'])

# Compute the mean of features grouped by the 'Date' column
mean_grouped_df = final_df.groupby('Date').mean()

# Display the resulting dataframe
print(mean_grouped_df)

            feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
Date                                                                           
2014-01-01  -0.292474  -0.656689  -0.647653   0.869449   0.602404  -0.648666   
2014-01-02  -0.214517  -0.479093  -0.393853   0.894579   0.608936  -0.564948   
2014-01-03  -0.191357  -0.570357  -0.379774   0.855505   0.669799  -0.647717   
2014-01-04  -0.222828  -0.552208  -0.273698   1.352581   0.651369  -0.530675   
2014-01-05  -0.254776  -0.367818  -0.648310   1.049372   0.622345  -0.788005   
...               ...        ...        ...        ...        ...        ...   
2024-01-27  -0.217303  -0.314629  -0.389391   0.938252   0.680919  -0.733525   
2024-01-28  -0.117719  -0.398255  -0.346025   0.842325   0.801929  -0.584134   
2024-01-29  -0.217840  -0.464755  -0.270208   0.838246   0.663836  -0.572238   
2024-01-30  -0.170901  -0.519105  -0.306044   0.840018   0.673540  -0.544370   
2024-01-31  -0.157458  -0.554885  -0.288

In [8]:
mean_grouped_df.to_csv('FinBERT_mean_embedding.csv')