# Downloads

# First, as per usual, imports.

In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
import os
import pickle
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Extracting data.

Mount Google Drive.

In [None]:
# Colab-only step: mount Google Drive so later cells can read and write
# files under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Custom dataset class for sentiment analysis.

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    The text comes from the 'Processed Review' column and the class label
    from the 'Sentiment' column of ``data``.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) that supports
            ``data['Processed Review'].values`` and
            ``data['Sentiment'].values``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``
            and returning a mapping with 'input_ids' and 'attention_mask'.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length):
        self.reviews = data['Processed Review'].values
        self.labels = data['Sentiment'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the encoded review and its label for position ``idx``.

        Returns:
            dict with keys 'input_ids', 'attention_mask' (tokenizer output
            with the leading batch dimension removed) and 'label' (0-d
            ``torch.long`` tensor).
        """
        # str() guards against non-string cells (e.g. NaN read from a CSV).
        review = str(self.reviews[idx])
        label = int(self.labels[idx])

        # Tokenize and encode the review
        inputs = self.tokenizer(review,
                                max_length=self.max_length,
                                padding='max_length',
                                truncation=True,
                                return_tensors="pt")

        # The tokenizer is expected to return tensors shaped
        # (1, max_length). Drop only that leading batch dimension: a bare
        # squeeze() would also collapse the sequence dimension when
        # max_length == 1 and yield 0-d tensors.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

# Load model and tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "tabularisai/robust-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request a 2-label classification head. The checkpoint's head has 5 outputs
# (see the load log below), so ignore_mismatched_sizes allows loading to
# proceed with a newly initialized classifier instead of failing.
head_options = {"num_labels": 2, "ignore_mismatched_sizes": True}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **head_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tabularisai/robust-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Modify the classifier to suit binary classification.

In [None]:
# Replace the classification head with a fresh linear layer that maps the
# encoder's hidden size to 2 output classes.
model.classifier = nn.Linear(
    in_features=model.config.hidden_size,
    out_features=2,
)

# Load Dataset

In [None]:
# Location of the preprocessed reviews CSV on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/Data-sets/AmazonReviews/preprocessed/vectorized_data_hf.csv"
# Same seed for both splits so the partition is reproducible.
SPLIT_SEED = 42

data = pd.read_csv(DATA_PATH)

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=SPLIT_SEED)

# Stage 2: one third of the hold-out (about 10% overall) becomes the test set;
# the other two thirds (about 20% overall) become the validation set.
val_data, test_data = train_test_split(temp_data, test_size=1/3, random_state=SPLIT_SEED)

# Report how many rows landed in each split
print(f"Training data: {len(train_data)}")
print(f"Validation data: {len(val_data)}")
print(f"Testing data: {len(test_data)}")

Hyperparameters, datasets, data loaders, and the optimizer.

In [None]:
# Hyperparameters
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LEN = 512  # every review is padded/truncated to this many tokens

# Create Dataset objects
train_dataset = SentimentDataset(train_data, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(val_data, tokenizer, MAX_LEN)

# DataLoader objects (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define optimizer
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are pinned to the
# deprecated class's defaults (1e-6 and 0.0) so the training configuration
# stays the same; torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# Training

In [None]:
def train_model(model, train_loader, val_loader, epochs, max_steps_per_epoch=500,
                optimizer=None):
    """Fine-tune ``model`` and print validation metrics after every epoch.

    Each epoch runs up to ``max_steps_per_epoch`` optimizer steps over
    ``train_loader`` and then evaluates on all of ``val_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        val_loader: Iterable of batches with the same keys.
        epochs: Number of epochs to run.
        max_steps_per_epoch: Cap on training steps per epoch. The cap is
            checked after each step, so at least one step runs whenever the
            loader is non-empty. ``None`` disables the cap.
        optimizer: Optimizer that updates ``model``'s parameters. When
            omitted, the module-level variable named ``optimizer`` is used,
            as earlier versions of this function did.

    Returns:
        The model, moved to the selected device (CUDA when available).

    Raises:
        NameError: if no optimizer is passed and no module-level
            ``optimizer`` exists.
    """
    if optimizer is None:
        # Backward compatibility: this function used to read the optimizer
        # from the enclosing module instead of taking it as an argument.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise NameError(
                "name 'optimizer' is not defined; pass optimizer=... to train_model"
            )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")  # Check if CUDA is available
    model = model.to(device)

    for epoch in range(epochs):
        print(f"Starting Epoch {epoch+1}/{epochs}")
        model.train()
        total_train_loss = 0.0
        steps_done = 0  # Track steps within an epoch

        for step, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            total_train_loss += step_loss

            # Print out training progress every 50 steps
            if step % 50 == 0:
                print(f"Epoch: {epoch+1}, Step: {step}, Loss: {step_loss}")

            # Stop the epoch early once the step budget is used up
            steps_done += 1
            if max_steps_per_epoch is not None and steps_done >= max_steps_per_epoch:
                print(f"Reached max steps for Epoch {epoch+1}. Stopping early.")
                break

        # An empty train loader yields NaN instead of a ZeroDivisionError.
        avg_train_loss = total_train_loss / steps_done if steps_done else float('nan')

        # Validation over the full validation loader
        model.eval()
        total_val_loss = 0.0
        val_batches = 0
        correct_predictions = 0
        examples_seen = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += outputs.loss.item()
                val_batches += 1

                predictions = torch.argmax(outputs.logits, dim=1)
                # Keep the counters as Python ints so the metrics below are
                # well defined even when no validation batch was seen.
                correct_predictions += (predictions == labels).sum().item()
                examples_seen += labels.size(0)

        # Divide by what was actually evaluated; an empty loader gives NaN.
        avg_val_loss = total_val_loss / val_batches if val_batches else float('nan')
        val_accuracy = correct_predictions / examples_seen if examples_seen else float('nan')

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training loss: {avg_train_loss:.3f}")
        print(f"Validation loss: {avg_val_loss:.3f}")
        print(f"Validation accuracy: {val_accuracy:.3f}\n")

    return model

# Fine-tune the model

In [None]:
# Run fine-tuning, capping each epoch at 100 training steps instead of the
# function's default of 500.
tuned_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    max_steps_per_epoch=100,
)

Saving the model for future deployment.

In [None]:
# Save the model to a .pkl file
output_dir = '/content/drive/MyDrive/Compiled Models/'
model_filename = "tuned_sentiment_model.pkl"

# Create the destination folder if needed so open() cannot fail on a
# missing directory.
os.makedirs(output_dir, exist_ok=True)

# Pickle the model from the CPU: tensors pickled while on a CUDA device can
# only be restored on a machine with CUDA, which breaks CPU-only deployment.
# The model is moved back to its original device afterwards.
original_device = next(tuned_model.parameters()).device
tuned_model.to('cpu')
try:
    # NOTE(review): pickle stores the whole Python object, so loading needs
    # compatible library versions, and a pickle should only ever be loaded
    # from a trusted source.
    with open(os.path.join(output_dir, model_filename), 'wb') as f:
        pickle.dump(tuned_model, f)
finally:
    tuned_model.to(original_device)

print(f"Model saved as {model_filename}")