# In [10]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel

# Release unused cached GPU memory held by PyTorch before the model is loaded
# (presumably to reclaim memory left over from earlier notebook cells — confirm).
torch.cuda.empty_cache()

# Define the mean pooling function
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (assumed shape ``(batch, seq_len, hidden)``
            — confirm against the model in use).
        attention_mask: Mask with 1 for real tokens and 0 for padding
            (assumed shape ``(batch, seq_len)``).

    Returns:
        One embedding per sequence: the mask-weighted sum over dimension 1
        divided by the number of unmasked tokens.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight every feature
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * weights, 1)
    # Clamp the divisor so a fully masked sequence does not divide by zero
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return masked_total / token_counts

# Dataset wrapping the texts for the embedding loop's DataLoader
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves raw text items by position.

    Args:
        texts: Sized, indexable collection of texts, such as a list or a
            pandas Series. It is stored as-is on ``self.texts``.
    """

    def __init__(self, texts):
        self.texts = texts

    def __getitem__(self, index):
        """Return the text at position ``index`` (0-based)."""
        # A DataLoader asks for positions 0..len-1. Plain [] on a pandas
        # Series looks up by *label*, which fails or returns the wrong row
        # when the index is not 0..n-1 (e.g. after filtering rows), so use
        # positional access whenever the container offers it.
        positional = getattr(self.texts, 'iloc', None)
        if positional is not None:
            return positional[index]
        return self.texts[index]

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

# Load the tokenizer and encoder weights for the sentence-embedding model
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

df = pd.read_csv('Data/PreProcessedData.csv')

# Concatenate the 'Title' and 'Plot' columns into one text per row.
# Missing values are replaced with an empty string first: otherwise the
# concatenation yields NaN (a float) for that row, and the tokenizer only
# accepts strings.
texts = df['Title'].fillna('').astype(str) + ' ' + df['Plot'].fillna('').astype(str)

# Create a custom dataset and dataloader.
# shuffle=False keeps the batches in row order, so the embeddings line up
# with the rows of df when they are concatenated column-wise later on.
dataset = MyDataset(texts)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# Move the model to the GPU when one is available; otherwise stay on the CPU
# instead of crashing on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Inference only: make sure dropout and similar layers are disabled
# (a no-op when the model is already in evaluation mode).
model.eval()

# Compute the embeddings for the texts, one batch at a time
embeddings = []
with torch.no_grad():  # no gradients are needed, which saves memory
    for batch in dataloader:
        # Pad to the longest text in the batch and truncate over-long texts
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        model_output = model(**encoded_input)
        # Collapse the per-token embeddings into one vector per text
        embedding = mean_pooling(model_output, encoded_input['attention_mask'])
        embeddings.append(embedding)
# Stack the per-batch results into a single tensor with one row per text
embeddings = torch.cat(embeddings, dim=0)
# Scale every row to unit L2 norm
embeddings = F.normalize(embeddings, p=2, dim=1)

# Copy the embeddings to host memory and wrap them in a DataFrame
# (one row per text, one integer-labelled column per embedding dimension)
df_embeddings = pd.DataFrame(embeddings.cpu().numpy())

# The raw text columns are left out of the output file
df = df.drop(columns=['Plot', 'Title'])

# Put the remaining columns and the embedding columns side by side
df_concatenated = pd.concat([df, df_embeddings], axis='columns')

# Save the concatenated DataFrame to a CSV file
df_concatenated.to_csv('Data/PreProcessedData_with_HF_embeddings.csv', index=False)
