In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm  # For the progress bar

# Load the pretrained tokenizer and model from your local directory
tokenizer = AutoTokenizer.from_pretrained(r"C:\Users\rh23592\OneDrive - University of Essex\Documents\fintext\2023")
model = AutoModel.from_pretrained(r"C:\Users\rh23592\OneDrive - University of Essex\Documents\fintext\2023")

# Example DataFrame (replace with your actual DataFrame)
df = pd.read_csv(r"C:\Users\rh23592\OneDrive - University of Essex\Documents\news_data\news_data_2023.csv")

# Preprocess the text column (ensure there are no missing values)
df['Text'] = df['Text'].fillna("")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Initialize an empty list to store embeddings
embeddings = []

# Process data in batches
batch_size = 32
max_length = 512  # Maximum token length for most transformer models

for i in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
    # Select the batch
    batch_texts = df['Text'][i:i + batch_size].tolist()
    
    # Tokenize the text data with truncation and padding to max_length
    encoded_input = tokenizer(
        batch_texts, 
        padding=True, 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )
    encoded_input = {key: val.to(device) for key, val in encoded_input.items()}
    
    try:
        # Extract embeddings
        with torch.no_grad():
            outputs = model(**encoded_input)
        
        # Perform mean pooling on the last hidden state to get sentence embeddings
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    except RuntimeError as e:
        print(f"Error processing batch {i//batch_size + 1}: {e}")
        embeddings.extend([None] * len(batch_texts))  # Append None for failed batches

# Add embeddings back to the DataFrame
df['embedding'] = embeddings

# Save the DataFrame to a CSV file
#df.to_csv("text_with_embeddings.csv", index=False)

# Print the resulting DataFrame
print(df.head())

Some weights of RobertaModel were not initialized from the model checkpoint at C:\Users\rh23592\OneDrive - University of Essex\Documents\fintext\2023 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing batches: 100%|████████████████████████████████████████████████████████| 2814/2814 [1:12:10<00:00,  1.54s/it]

   Unnamed: 0        Date                                               Text  \
0      495440  2023-01-01  announc result deliveri procedur deliveri heat...   
1      495439  2023-01-01  hero 2022 bowel cancer campaign ukrainian girl...   
2      495438  2023-01-01  1 plugin hybrid gasolin suv type vehicl 5 seat...   
3      495437  2023-01-01  procur petroleum product fuel oil need zadar c...   
4      495436  2023-01-01  52022zp deliveri heat oil need local boiler ho...   

                                           embedding  
0  [-0.042821914, 0.2182332, -0.14395194, 0.64695...  
1  [0.15033486, 0.051737614, -0.12575993, 0.37026...  
2  [-0.40045, -0.38085246, -0.33560446, 0.3464156...  
3  [0.24712259, -0.14612733, -0.0019628839, 0.684...  
4  [0.16381882, 0.15227015, -0.5902, 0.5538616, 0...  





In [2]:
#df = df.drop("Unnamed: 0", axis = 1)
embeddings_df = pd.DataFrame(df["embedding"].tolist(), index = df.index)
embeddings_df.columns = [f"feature_{i}" for i in range(embeddings_df.shape[1])]

final_df = pd.concat([df.drop(columns=["embedding"]), embeddings_df], axis = 1)

print(final_df)

       Unnamed: 0        Date  \
0          495440  2023-01-01   
1          495439  2023-01-01   
2          495438  2023-01-01   
3          495437  2023-01-01   
4          495436  2023-01-01   
...           ...         ...   
90035      585375  2023-12-31   
90036      585388  2023-12-31   
90037      585374  2023-12-31   
90038      585373  2023-12-31   
90039      585379  2023-12-31   

                                                    Text  feature_0  \
0      announc result deliveri procedur deliveri heat...  -0.042822   
1      hero 2022 bowel cancer campaign ukrainian girl...   0.150335   
2      1 plugin hybrid gasolin suv type vehicl 5 seat...  -0.400450   
3      procur petroleum product fuel oil need zadar c...   0.247123   
4      52022zp deliveri heat oil need local boiler ho...   0.163819   
...                                                  ...        ...   
90035  china opec biggest wildcard could jolt oil mar...  -0.028447   
90036  suppli fuel oil 5w30 oxygen 

In [3]:
final_df = final_df.sort_values(by = "Date")
final_df

Unnamed: 0.1,Unnamed: 0,Date,Text,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
0,495440,2023-01-01,announc result deliveri procedur deliveri heat...,-0.042822,0.218233,-0.143952,0.646957,0.182093,0.472903,-0.043371,...,-0.373033,0.521988,-0.510916,-0.288155,-0.029999,0.422082,0.033328,0.262416,-0.527881,-0.048078
55,495421,2023-01-01,lot 1 suppli diesel fuel gasolin urea lpg rech...,-0.041690,-0.331223,-0.253159,0.439037,0.113962,0.147843,-0.325630,...,-0.133624,0.486727,-0.252291,-0.217611,-0.188324,0.236260,-0.202189,0.164668,-0.219073,-0.223311
54,495385,2023-01-01,russian domin india crude market continu yr ex...,0.098502,-0.035888,0.230188,0.561755,-0.106705,0.120313,0.022220,...,-0.504271,0.072504,-0.248995,-0.290049,0.143491,0.240238,0.229553,-0.357340,-0.185044,-0.100550
53,495386,2023-01-01,russia attack kyiv drone inscrib happi new yea...,-0.090596,0.387234,0.206693,0.856164,-0.024999,0.164579,0.034192,...,-0.539045,0.187224,-0.633054,-0.415563,-0.293463,0.469894,0.015257,-0.058353,-0.336245,0.000428
52,495387,2023-01-01,record high product china major shale ga field...,-0.022994,0.243742,0.131706,0.435460,0.128023,0.226159,-0.227964,...,-0.540802,0.228061,-0.651874,-0.549414,0.248199,0.250395,-0.011927,-0.013406,-0.405962,-0.333962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89989,585423,2023-12-31,hire 4 no jcb machin 12 no tractor trolley 15 ...,-0.027840,0.600322,-0.407891,0.786243,-0.013130,0.300346,-0.488286,...,-0.211848,0.285858,-0.319730,-0.139128,0.122624,0.253449,0.104553,-0.131112,-0.151079,0.169671
89988,585390,2023-12-31,erect polic camp bunker shed along scissor bar...,-0.085768,0.547083,-0.430179,0.671273,0.006877,0.283318,-0.130087,...,-0.264949,0.248298,-0.394134,-0.260852,0.135683,0.173504,0.125527,-0.125005,-0.297976,-0.015575
89987,585409,2023-12-31,suppli weapon rack 12 rifl q3 qti 2,0.149270,-0.008246,0.075660,0.711110,0.426408,0.383837,-0.160298,...,-0.369641,0.240133,-0.291004,-0.391263,0.125007,0.114450,0.073677,0.077537,-0.156153,-0.259038
89994,585408,2023-12-31,annual mainten contract tank farm manag system...,-0.165460,0.129518,-0.244681,0.688286,0.016587,0.550566,-0.299674,...,-0.518595,0.840247,-0.099561,-0.049415,0.085518,0.714390,0.150469,0.049922,-0.487970,-0.177669


In [4]:
final_df = final_df.drop_duplicates()
final_df

Unnamed: 0.1,Unnamed: 0,Date,Text,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
0,495440,2023-01-01,announc result deliveri procedur deliveri heat...,-0.042822,0.218233,-0.143952,0.646957,0.182093,0.472903,-0.043371,...,-0.373033,0.521988,-0.510916,-0.288155,-0.029999,0.422082,0.033328,0.262416,-0.527881,-0.048078
55,495421,2023-01-01,lot 1 suppli diesel fuel gasolin urea lpg rech...,-0.041690,-0.331223,-0.253159,0.439037,0.113962,0.147843,-0.325630,...,-0.133624,0.486727,-0.252291,-0.217611,-0.188324,0.236260,-0.202189,0.164668,-0.219073,-0.223311
54,495385,2023-01-01,russian domin india crude market continu yr ex...,0.098502,-0.035888,0.230188,0.561755,-0.106705,0.120313,0.022220,...,-0.504271,0.072504,-0.248995,-0.290049,0.143491,0.240238,0.229553,-0.357340,-0.185044,-0.100550
53,495386,2023-01-01,russia attack kyiv drone inscrib happi new yea...,-0.090596,0.387234,0.206693,0.856164,-0.024999,0.164579,0.034192,...,-0.539045,0.187224,-0.633054,-0.415563,-0.293463,0.469894,0.015257,-0.058353,-0.336245,0.000428
52,495387,2023-01-01,record high product china major shale ga field...,-0.022994,0.243742,0.131706,0.435460,0.128023,0.226159,-0.227964,...,-0.540802,0.228061,-0.651874,-0.549414,0.248199,0.250395,-0.011927,-0.013406,-0.405962,-0.333962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89989,585423,2023-12-31,hire 4 no jcb machin 12 no tractor trolley 15 ...,-0.027840,0.600322,-0.407891,0.786243,-0.013130,0.300346,-0.488286,...,-0.211848,0.285858,-0.319730,-0.139128,0.122624,0.253449,0.104553,-0.131112,-0.151079,0.169671
89988,585390,2023-12-31,erect polic camp bunker shed along scissor bar...,-0.085768,0.547083,-0.430179,0.671273,0.006877,0.283318,-0.130087,...,-0.264949,0.248298,-0.394134,-0.260852,0.135683,0.173504,0.125527,-0.125005,-0.297976,-0.015575
89987,585409,2023-12-31,suppli weapon rack 12 rifl q3 qti 2,0.149270,-0.008246,0.075660,0.711110,0.426408,0.383837,-0.160298,...,-0.369641,0.240133,-0.291004,-0.391263,0.125007,0.114450,0.073677,0.077537,-0.156153,-0.259038
89994,585408,2023-12-31,annual mainten contract tank farm manag system...,-0.165460,0.129518,-0.244681,0.688286,0.016587,0.550566,-0.299674,...,-0.518595,0.840247,-0.099561,-0.049415,0.085518,0.714390,0.150469,0.049922,-0.487970,-0.177669


In [5]:
final_df.to_csv("Fintext_2023.csv")