# **Sentiment analysis**

This notebook performs sentiment analysis on our `checkpoint1.parquet` data set ([download](https://T34278926.quickconnect.to/d/s/zpVAefWwFEYfIhTRTc0RfJ1h4rXzh6kJ/7VRz2eFaGxxjR11Xtygq65lAszhLPaIi-7LuAL9qlnQs)) using SiEBERT, a fine-tuned RoBERTa-large model ([model card](https://huggingface.co/siebert/sentiment-roberta-large-english)).

Note that this notebook was run on Kaggle. Some code, such as the loading of the data, will have to be changed if you run it outside of Kaggle.

## Preparation

In [None]:
# Import necessary packages
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np 
import pandas as pd 
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed

# Seed the random number generators so repeated runs give consistent results
set_seed(seed=42)

In [None]:
# Define file paths (Note: must be changed!)
# The data is processed as 4 separate split files, so both values below have to
# be adjusted for every run.
# `input_path` is used instead of `input` so that Python's builtin `input()`
# is not shadowed for the rest of the kernel session.
input_path = "kaggle_environment_input_path"  # change input (4 different split files in total)
output = "senti_split_11.parquet"  # change output (4 different split files in total)

# Load the data
df = pd.read_parquet(input_path)

## Load the Model

In [None]:
# Load the tokenizer and the sequence-classification model that belong to the
# same pretrained checkpoint (tokenizer first, then the model)
checkpoint = "siebert/sentiment-roberta-large-english"
tokenizer, model = (
    loader.from_pretrained(checkpoint)
    for loader in (AutoTokenizer, AutoModelForSequenceClassification)
)

## Define a dataset class

In [None]:
# Define dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Iterable of raw texts (e.g. a list or a pandas Series). It is
            materialised into a list so that items are always looked up by
            position, regardless of any index labels the container carries.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            truncation=True, padding='max_length', max_length=max_length)``.
        max_length: Length passed to the tokenizer for truncation and padding
            (default 512).
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # A pandas Series resolves ``series[i]`` by index *label*, while a
        # DataLoader requests positions 0..len-1. With a non-default index
        # (e.g. a split of a larger frame) that would raise KeyError or return
        # the wrong row, so store the texts as a positional list instead.
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return the tokenizer output.

        Every text is truncated/padded to ``max_length`` so that items can be
        stacked into a batch by the default DataLoader collation.
        NOTE(review): with ``return_tensors='pt'`` each tensor presumably has a
        leading dimension of size 1 for a single text -- confirm against the
        consumer of the batches.
        """
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        return inputs

## Run the sentiment analysis

In [None]:
# Move the model to the GPU (falls back to the CPU when CUDA is not available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)
model.eval() # Set the model to evaluation mode

# Create a Dataset and DataLoader
# Pass the texts as a plain list: the DataLoader asks the dataset for positions
# 0..len-1, whereas indexing a pandas Series with an integer is a *label*
# lookup, which fails or returns the wrong row when the DataFrame index is not
# 0..len-1.
texts = df['text'].tolist()
dataset = SentimentDataset(texts, tokenizer)
# shuffle=False keeps the predictions in the same order as the DataFrame rows
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

predictions = []

# Inference only, so gradient tracking is disabled
with torch.no_grad():
    # Wrap the dataloader with tqdm to track progress
    for batch in tqdm(dataloader, desc="Classifying"):
        # Drop the size-1 dimension at position 1 of every tensor and move it
        # to the model's device.
        # NOTE(review): presumably (batch, 1, max_length) -> (batch, max_length),
        # caused by each item being tokenized with return_tensors='pt' -- confirm.
        inputs = {key: val.squeeze(1).to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row
        batch_predictions = torch.argmax(logits, dim=1).tolist()
        predictions.extend(batch_predictions)

print("Classification complete.")

## Save the data

In [None]:
# Store the predicted class index of every row in a "label" column
df["label"] = predictions

# Write the labelled DataFrame to the configured output parquet file
df.to_parquet(path=output)