In [2]:
import pandas as pd
import fasttext
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar

In [3]:
# Read the raw news CSV, discard the leftover 'Unnamed: 0' column when it is
# present, and remove exact duplicate rows
df_news = (
    pd.read_csv('/Users/macbook/Documents/PhD_Documents/embedding_methods/news_data/news_total_file.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop_duplicates()
)

# Parse 'Date' (values that cannot be parsed become NaT), keep only the rows
# with a valid date, and order the frame chronologically
df_news = (
    df_news
    .assign(Date=pd.to_datetime(df_news['Date'], errors='coerce'))
    .dropna(subset=['Date'])
    .sort_values(by='Date')
)

# Store the dates as 'YYYY-MM-DD' strings, then de-duplicate once more: rows
# that only differed in the part of the date removed by the formatting (if
# any) are identical from here on
df_news = (
    df_news
    .assign(Date=df_news['Date'].dt.strftime('%Y-%m-%d'))
    .drop_duplicates()
)

# Show the cleaned frame
print(df_news)

  df_news['Date'] = pd.to_datetime(df_news['Date'], errors='coerce')


              Date                                               Text
1328    2014-01-01  railcar north dakota crude train crash older l...
1329    2014-01-01  skorea dec crude oil import 84 pct yy prelimin...
1330    2014-01-01  updat 1iraq oil export averag 2341 mln bpd dec...
1331    2014-01-01  iraq oil export averag 2341 mln bpd decemb min...
1332    2014-01-01  brazil petrobra start product roncador field p...
...            ...                                                ...
592597  2024-01-31  us senat committe energi natur resourc full co...
592596  2024-01-31  new jersey natur ga compani new jersey natur g...
592595  2024-01-31  oil lower us crude stock build score first mon...
592665  2024-01-31                    oil price goe brent 8219 barrel
592857  2024-01-31  russia novak current oil price adequ reflect m...

[592858 rows x 2 columns]


In [4]:
# Load the pre-trained FastText model from a local binary file.
# NOTE(review): the file name (cc.en.300.bin) suggests the 300-dimensional
# English vectors — confirm against the download source.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on
# any machine where the file does not live at exactly this location.
model = fasttext.load_model('/Users/macbook/Documents/PhD_Documents/Second_paper/Fasttext/cc.en.300.bin')

# Function to embed text
def embed_text(text, model):
    """Return the sentence vector of ``text`` computed by ``model``.

    Parameters
    ----------
    text : object
        Value to embed. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) are embedded as the empty
        string rather than as the literal words "None" / "nan".
    model : object
        Any object exposing ``get_sentence_vector(str)``, e.g. a loaded
        fastText model.

    Returns
    -------
    Whatever ``model.get_sentence_vector`` returns for the cleaned text.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing an extra import just to detect missing cells.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    else:
        # fastText's get_sentence_vector rejects input that contains a
        # newline, so fold line breaks into spaces before calling it.
        text = str(text).replace('\n', ' ')
    return model.get_sentence_vector(text)

# Embed the 'Text' column of df_news, one vector per row, in row order.
# Iterating over the column directly avoids DataFrame.iterrows(), which builds
# a full Series object for every row even though only 'Text' is needed.
# tqdm wraps the iteration to show a progress bar.
embeddings = [
    embed_text(text, model)
    for text in tqdm(df_news['Text'], total=len(df_news), desc="Embedding Progress")
]

# Stack the per-row vectors into a single 2-D numpy array (one row per article)
embeddings = np.vstack(embeddings)

# Store the vectors in a new 'embeddings' column of df_news; each cell holds
# one vector as a plain Python list
df_news['embeddings'] = embeddings.tolist()

# Print DataFrame and embeddings
print(df_news.head())
print(embeddings)

Embedding Progress: 100%|████████████| 592858/592858 [00:28<00:00, 21167.68it/s]


            Date                                               Text  \
1328  2014-01-01  railcar north dakota crude train crash older l...   
1329  2014-01-01  skorea dec crude oil import 84 pct yy prelimin...   
1330  2014-01-01  updat 1iraq oil export averag 2341 mln bpd dec...   
1331  2014-01-01  iraq oil export averag 2341 mln bpd decemb min...   
1332  2014-01-01  brazil petrobra start product roncador field p...   

                                             embeddings  
1328  [0.005304390098899603, 0.0010181422112509608, ...  
1329  [-0.014805530197918415, 0.037456776946783066, ...  
1330  [0.014550027437508106, 0.0012422610307112336, ...  
1331  [0.004683321807533503, -0.000922685896512121, ...  
1332  [-0.0034821629524230957, 0.021212242543697357,...  
[[ 0.00530439  0.00101814 -0.00991612 ...  0.0575906   0.0419199
   0.00623459]
 [-0.01480553  0.03745678 -0.02720275 ...  0.10508431  0.04475201
   0.01449827]
 [ 0.01455003  0.00124226 -0.04250472 ...  0.06986248  0.0287494

In [5]:
# Expand the list stored in each 'embeddings' cell into its own set of
# columns (feature_0, feature_1, ...), keeping df_news' row labels
embedding_rows = df_news['embeddings'].tolist()
embeddings_df = pd.DataFrame(embedding_rows, index=df_news.index).add_prefix('feature_')

# Put the remaining df_news columns and the feature columns side by side
news_without_vectors = df_news.drop(columns=['embeddings'])
final_df = pd.concat([news_without_vectors, embeddings_df], axis=1)

# Show the combined frame
print(final_df)

              Date                                               Text  \
1328    2014-01-01  railcar north dakota crude train crash older l...   
1329    2014-01-01  skorea dec crude oil import 84 pct yy prelimin...   
1330    2014-01-01  updat 1iraq oil export averag 2341 mln bpd dec...   
1331    2014-01-01  iraq oil export averag 2341 mln bpd decemb min...   
1332    2014-01-01  brazil petrobra start product roncador field p...   
...            ...                                                ...   
592597  2024-01-31  us senat committe energi natur resourc full co...   
592596  2024-01-31  new jersey natur ga compani new jersey natur g...   
592595  2024-01-31  oil lower us crude stock build score first mon...   
592665  2024-01-31                    oil price goe brent 8219 barrel   
592857  2024-01-31  russia novak current oil price adequ reflect m...   

        feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
1328     0.005304   0.001018  -0.009916   0.061

In [6]:
# Write the per-article table (the df_news columns plus the feature_* columns)
# to a CSV in the current working directory. No index=False is passed, so the
# DataFrame's row labels are written as the first, unnamed column.
final_df.to_csv('Fasttext_total_embedding.csv')

In [7]:
# Drop the 'Text' column so that only 'Date' and the feature columns remain.
# errors='ignore' keeps this cell re-runnable: final_df is overwritten here,
# so on a second run 'Text' is already gone and a plain drop would raise
# KeyError.
final_df = final_df.drop(columns=['Text'], errors='ignore')

# Average every feature column over all rows that share the same 'Date';
# the result is indexed by 'Date'
mean_grouped_df = final_df.groupby('Date').mean()

# Display the resulting dataframe
print(mean_grouped_df)

            feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
Date                                                                           
2014-01-01   0.002360   0.010469  -0.022791   0.041059  -0.024977   0.022706   
2014-01-02  -0.012337   0.006026  -0.019221   0.054330  -0.027131   0.021439   
2014-01-03  -0.012656   0.006681  -0.020059   0.058060  -0.020275   0.015391   
2014-01-04  -0.035902  -0.001695  -0.019933   0.052761  -0.035921   0.068589   
2014-01-05  -0.017930   0.006039  -0.008833   0.061439  -0.027526   0.035032   
...               ...        ...        ...        ...        ...        ...   
2024-01-27  -0.012292   0.002669  -0.009766   0.055353  -0.030776   0.023631   
2024-01-28  -0.016020   0.005003  -0.016132   0.051173  -0.020691   0.022844   
2024-01-29  -0.013944   0.003808  -0.019126   0.051217  -0.020539   0.024478   
2024-01-30  -0.014090   0.004389  -0.018588   0.050058  -0.022026   0.027804   
2024-01-31  -0.012852   0.004556  -0.023

In [8]:
# Write the per-date mean embeddings to a CSV in the current working
# directory. The 'Date' index produced by the groupby is written as the first
# column, since no index=False is passed.
mean_grouped_df.to_csv('Fasttext_mean_embedding.csv')