### Import Necessary Libraries

In [2]:
import numpy as np
import pandas as pd
import os
import torch
import zipfile
import re
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm


import warnings
warnings.filterwarnings("ignore")

### Unzip the file

In [None]:
zip_path = "/content/archive.zip"
extract_path = "/content/sentiment_data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List extracted files

file_list = os.listdir(extract_path)

### Build a sentiment analysis classifier and evaluate the model

In [6]:
# Set seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Step 1: Load the dataset
df = pd.read_csv('/content/sentiment_data/train.csv',  encoding = 'latin-1')

# Check the first few rows to understand the data structure
print("Dataset preview:")
print(df.head())

# Check for class distribution
print("\nClass distribution:")
print(df['sentiment'].value_counts())

# Step 2: Preprocess the data
# Map sentiment labels to numbers if needed
sentiment_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df['sentiment_label'] = df['sentiment'].map(sentiment_map)

# Split data into train and test sets in a ratio 80:20
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment_label'],
    test_size=0.2,
    random_state=seed,
    stratify=df['sentiment_label']
)

# Step 3: Create a PyTorch Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Step 4: Load pre-trained transformer model and tokenizer
model_name = "distilbert-base-uncased"  # A smaller, faster version of BERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3  # positive, neutral, negative
)

# Step 5: Create data loaders
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
test_dataset = SentimentDataset(X_test, y_test, tokenizer)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Step 6: Set up training parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Number of training epochs
epochs = 3
total_steps = len(train_loader) * epochs

# Create learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Step 7: Training loop
def train():
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients
        model.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Update learning rate
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    return avg_train_loss

# Step 8: Evaluation function
def evaluate():
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Get predictions
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            # Store predictions and true labels
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    # Convert indices back to sentiment names for the classification report
    sentiment_names = {v: k for k, v in sentiment_map.items()}
    y_pred = [sentiment_names[pred] for pred in predictions]
    y_true = [sentiment_names[label] for label in true_labels]

    # Return classification report
    return classification_report(y_true, y_pred)

# Step 9: Train and evaluate the model
print("Starting training...")
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    avg_train_loss = train()
    print(f"Average training loss: {avg_train_loss:.4f}")

# Final evaluation and classification report
print("\nEvaluating the model...")
report = evaluate()
print("\nClassification Report:")
print(report)

# Save the fine-tuned model
model.save_pretrained("./sentiment_transformer_model")
tokenizer.save_pretrained("./sentiment_transformer_model")
print("\nModel saved to ./sentiment_transformer_model")

Dataset preview:
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan          38928346    

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Starting training...

Epoch 1/3


Training: 100%|██████████| 1374/1374 [04:03<00:00,  5.63it/s]


Average training loss: 0.5902

Epoch 2/3


Training: 100%|██████████| 1374/1374 [04:02<00:00,  5.67it/s]


Average training loss: 0.4219

Epoch 3/3


Training: 100%|██████████| 1374/1374 [04:02<00:00,  5.67it/s]


Average training loss: 0.3151

Evaluating the model...


Evaluating: 100%|██████████| 344/344 [00:20<00:00, 16.38it/s]



Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.80      0.80      1556
     neutral       0.77      0.77      0.77      2224
    positive       0.83      0.84      0.83      1717

    accuracy                           0.80      5497
   macro avg       0.80      0.80      0.80      5497
weighted avg       0.80      0.80      0.80      5497


Model saved to ./sentiment_transformer_model
