# PRECOMPUTING

## ROBERTA

In [None]:
import os
import torch
import re
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Name of the pretrained checkpoint shared by the tokenizer and the encoder
ROBERTA_MODEL = 'roberta-large'

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the tokenizer, then the encoder, and move the encoder to the device
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_MODEL)
roberta_model = RobertaModel.from_pretrained(ROBERTA_MODEL)
roberta_model = roberta_model.to(device)

def clean(comment):
    """Return *comment* lower-cased with line breaks turned into spaces.

    Every newline and every carriage-return character is replaced by one
    space each, so a ``\\r\\n`` pair becomes two spaces.
    """
    lowered = comment.lower()
    return lowered.replace("\n", " ").replace("\r", " ")



def precompute_embeddings(data_dir, mode, output_dir, batch_size=16):
    """Encode one data split with RoBERTa and save the embeddings to disk.

    Reads ``{mode}_x.csv`` from *data_dir*, cleans its ``string`` column
    with ``clean`` and runs the texts through the module-level
    ``roberta_model``. The last-layer hidden state of the first token of
    each text is kept and all of them are written, as one 2-D tensor with
    one row per text, to ``{mode}_embeddings.pt`` in *output_dir*. Unless
    *mode* is ``'test'``, the ``y`` column of ``{mode}_y.csv`` is also saved
    as ``{mode}_labels.pt``.

    Args:
        data_dir: Directory containing the ``{mode}_x.csv`` / ``{mode}_y.csv``
            files.
        mode: Name of the split; ``'test'`` means no label file is read.
        output_dir: Directory the ``.pt`` files are written to. It is
            created if it does not exist.
        batch_size: Number of texts encoded per forward pass.

    Raises:
        ValueError: If *batch_size* is smaller than 1, or if the label file
            does not have exactly one row per text.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Create the destination up front: otherwise a missing directory is only
    # discovered by torch.save after the whole split has been encoded.
    os.makedirs(output_dir, exist_ok=True)

    # Load data. Missing values become empty strings and everything is cast
    # to str so that clean() never receives a NaN or a number.
    data = pd.read_csv(os.path.join(data_dir, f'{mode}_x.csv'), index_col=0)
    texts = [clean(text) for text in data["string"].fillna("").astype(str)]

    # Load labels before the expensive encoding step so that a missing or
    # inconsistent label file fails fast.
    labels = None
    if mode != 'test':
        labels = pd.read_csv(os.path.join(data_dir, f'{mode}_y.csv'))
        if len(labels) != len(texts):
            raise ValueError(
                f"{mode}_y.csv has {len(labels)} rows but {mode}_x.csv has "
                f"{len(texts)}; labels and embeddings would be misaligned"
            )

    # Compute embeddings in mini-batches.
    roberta_model.eval()
    batch_embeddings = []
    with tqdm(total=len(texts), desc=f"Processing {mode}") as progress:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # Pad only to the longest text of the batch instead of always to
            # 512 tokens. Padded positions are excluded via attention_mask,
            # so this avoids wasted compute without changing which tokens
            # the model attends to.
            tokens = tokenizer(
                batch,
                padding="longest",
                truncation=True,
                max_length=512,
                return_tensors="pt"
            )
            input_ids = tokens["input_ids"].to(device)
            attention_mask = tokens["attention_mask"].to(device)

            with torch.no_grad():
                output = roberta_model(input_ids, attention_mask=attention_mask)

            # Keep the first-token vector of every sequence, on the CPU so
            # accelerator memory is released between batches.
            batch_embeddings.append(output.last_hidden_state[:, 0, :].cpu())
            progress.update(len(batch))

    # Save embeddings and labels
    if batch_embeddings:
        embeddings_tensor = torch.cat(batch_embeddings, dim=0)
    else:
        # An empty split still yields a well-formed (0, hidden_size) tensor.
        embeddings_tensor = torch.empty((0, roberta_model.config.hidden_size))
    torch.save(embeddings_tensor, os.path.join(output_dir, f"{mode}_embeddings.pt"))

    if labels is not None:
        labels_tensor = torch.tensor(labels['y'].values)
        torch.save(labels_tensor, os.path.join(output_dir, f"{mode}_labels.pt"))

# Input directory with the CSV files and output directory for the tensors
data_dir = "kaggle_data"
output_dir = "embeddings/roberta"
os.makedirs(output_dir, exist_ok=True)

# Precompute embeddings for each split in turn: train, val, then test
for split in ('train', 'val', 'test'):
    precompute_embeddings(data_dir, split, output_dir)