In [1]:
# -----------------------------
# Modeling Notebook - SentimentSense

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

# VADER for baseline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# Transformers for RoBERTa
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Save models
import joblib

sns.set_style("whitegrid")
%matplotlib inline


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Rajit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Load preprocessed dataset
processed_path = "../data/processed/processed_reviews.csv"
df = pd.read_csv(processed_path)

# Map sentiment to numeric labels
sentiment_map = {"negative":0, "neutral":1, "positive":2}
df['label'] = df['sentiment'].map(sentiment_map)

# Fail fast on sentiment values that are not in sentiment_map: .map() turns them
# into NaN labels without complaint, and the resulting error would only show up
# later, far away from its cause.
unmapped_sentiments = df.loc[df['label'].isna(), 'sentiment'].unique()
if len(unmapped_sentiments) > 0:
    raise ValueError(
        f"Unexpected sentiment values in {processed_path}: {unmapped_sentiments.tolist()}"
    )

# pd.read_csv parses empty fields as NaN by default, so a review whose cleaned
# text ended up empty comes back as a float NaN. Normalise to plain strings so
# every downstream consumer of the splits receives text.
n_missing_text = int(df['cleaned_text'].isna().sum())
if n_missing_text:
    print(f"Note: {n_missing_text} rows have empty cleaned_text; treating them as empty strings")
texts = df['cleaned_text'].fillna("").astype(str)

# Train-validation-test split (70 / 15 / 15), stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(texts, df['label'],
                                                    test_size=0.3, random_state=42, stratify=df['label'])
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp,
                                                test_size=0.5, random_state=42, stratify=y_temp)

print("Train:", X_train.shape, "Validation:", X_val.shape, "Test:", X_test.shape)

# Leakage check: a test text that also appears verbatim in the training split
# lets a model memorise its label, which inflates every held-out score.
n_test_seen_in_train = int(X_test.isin(set(X_train)).sum())
if n_test_seen_in_train:
    print(f"WARNING: {n_test_seen_in_train}/{len(X_test)} test texts also appear verbatim "
          "in the training split; held-out metrics will be optimistic")


Train: (3500,) Validation: (750,) Test: (750,)


In [3]:
# Initialize the VADER analyzer used as the rule-based baseline.
# It relies on the 'vader_lexicon' resource downloaded in the imports cell.
sid = SentimentIntensityAnalyzer()

# Function to map VADER compound score to sentiment
def vader_sentiment(text, pos_threshold=0.05, neg_threshold=-0.05):
    """Map a text's VADER compound score to a numeric sentiment label.

    Parameters
    ----------
    text : str
        Text to score with the module-level analyzer ``sid``.
    pos_threshold : float, default 0.05
        Compound scores greater than or equal to this value are labelled positive.
    neg_threshold : float, default -0.05
        Compound scores less than or equal to this value are labelled negative.

    Returns
    -------
    int
        2 for positive, 0 for negative, 1 for neutral (everything in between).
        These values match the encoding in ``sentiment_map``.
    """
    score = sid.polarity_scores(text)['compound']
    if score >= pos_threshold:
        return 2  # positive
    if score <= neg_threshold:
        return 0  # negative
    return 1  # neutral

# Label every held-out test review with the rule-based baseline
vader_preds = X_test.apply(vader_sentiment)

# Report overall accuracy first, then the per-class breakdown
vader_accuracy = accuracy_score(y_test, vader_preds)
print("VADER Baseline Accuracy:", vader_accuracy)

vader_report = classification_report(
    y_test,
    vader_preds,
    target_names=['negative','neutral','positive'],
)
print("Classification Report:\n", vader_report)


VADER Baseline Accuracy: 0.6533333333333333
Classification Report:
               precision    recall  f1-score   support

    negative       0.67      0.62      0.64        84
     neutral       0.28      0.41      0.33       147
    positive       0.83      0.73      0.77       519

    accuracy                           0.65       750
   macro avg       0.59      0.59      0.58       750
weighted avg       0.70      0.65      0.67       750



In [4]:
# Logistic regression on TF-IDF features (unigrams + bigrams).
# The vectorizer is fitted on the training split only; validation and test are
# merely transformed, so no vocabulary/IDF statistics come from held-out data.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression
lr = LogisticRegression(max_iter=500)
lr.fit(X_train_tfidf, y_train)

# Score the validation split as well, so model comparison and tuning do not have
# to rely on the test set.
lr_val_preds = lr.predict(X_val_tfidf)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, lr_val_preds))

# Predict & evaluate on the held-out test split
lr_preds = lr.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))
print("Classification Report:\n", classification_report(y_test, lr_preds, target_names=['negative','neutral','positive']))

# Save model and vectorizer. joblib.dump does not create missing parent
# directories, so make sure the target folder exists before writing.
os.makedirs("../src/models", exist_ok=True)
joblib.dump(lr, "../src/models/logreg_model.pkl")
joblib.dump(tfidf, "../src/models/tfidf_vectorizer.pkl")


Logistic Regression Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00        84
     neutral       1.00      1.00      1.00       147
    positive       1.00      1.00      1.00       519

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750



['../src/models/tfidf_vectorizer.pkl']

In [5]:
# Random Forest trained on the TF-IDF features built for the logistic regression
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_tfidf, y_train)

# Score the validation split as well, so model comparison and tuning do not have
# to rely on the test set.
rf_val_preds = rf.predict(X_val_tfidf)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, rf_val_preds))

# Predict & evaluate on the held-out test split
rf_preds = rf.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print("Classification Report:\n", classification_report(y_test, rf_preds, target_names=['negative','neutral','positive']))

# Save model. joblib.dump does not create missing parent directories, so make
# sure the target folder exists before writing.
os.makedirs("../src/models", exist_ok=True)
joblib.dump(rf, "../src/models/rf_model.pkl")


Random Forest Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00        84
     neutral       1.00      1.00      1.00       147
    positive       1.00      1.00      1.00       519

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750



['../src/models/rf_model.pkl']

In [7]:
# RoBERTa fine-tuning: checkpoint, tokenizer and classification model
#
# A lightweight DistilRoBERTa checkpoint is used for this demonstration so that
# fine-tuning can run locally. Training full RoBERTa for production needs to be
# done on a cloud GPU.

model_name = "distilroberta-base"  # lighter than roberta-base
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # negative / neutral / positive
)

def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with ``roberta_tokenizer``.

    The tokenizer is called with truncation enabled and ``padding='max_length'``
    at ``max_length=64``; its return value is passed back unchanged.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 64,
    }
    return roberta_tokenizer(batch['text'], **encoding_options)

# Prepare datasets: the train and validation splits go through the same steps,
# so the pipeline lives in one small helper.
def _as_torch_dataset(texts, labels):
    """Wrap a text/label split in a tokenized ``Dataset`` formatted for torch."""
    dataset = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    dataset = dataset.map(tokenize, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset

train_dataset = _as_torch_dataset(X_train, y_train)
val_dataset = _as_torch_dataset(X_val, y_val)

# Optional: cap both splits (first 2000 / 500 rows) for a very fast demo run
train_dataset_small = train_dataset.select(range(min(2000, len(train_dataset))))
val_dataset_small = val_dataset.select(range(min(500, len(val_dataset))))

# Training arguments for a short, single-epoch demo run

training_args = TrainingArguments(
    output_dir="../src/models/roberta_checkpoints",
    num_train_epochs=1,                     # 1 epoch for demo
    per_device_train_batch_size=4,          # smaller batch for local CPU/GPU
    per_device_eval_batch_size=4,
    logging_dir="../src/models/roberta_logs",
    logging_steps=50,
    do_eval=True,
    # NOTE: `eval_strategy` is the newer argument name (renamed in transformers
    # 4.41); older releases only accept `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=50,                          # explicit: evaluate at the same cadence as logging
    save_strategy="no"                      # no intermediate checkpoints during training
)

# Metrics

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class of each example is the arg-max over the last axis of
    the logits. Returns a dict with the single key ``"accuracy"``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_classes)
    return {"accuracy": accuracy}


# Trainer: fine-tune on the reduced demo splits and score accuracy on the
# validation subset at every evaluation step.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_small,
    eval_dataset=val_dataset_small,
    compute_metrics=compute_metrics
)

# Train

trainer.train()

# Save final model; weights and tokenizer files go into the same directory.
roberta_model_dir = "../src/models/roberta_model"
model.save_pretrained(roberta_model_dir)
roberta_tokenizer.save_pretrained(roberta_model_dir)

print("✅ Lightweight DistilRoBERTa model trained and saved!")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Accuracy
50,0.7686,0.333651,0.874
100,0.2745,0.007171,1.0
150,0.0124,0.000701,1.0
200,0.001,0.000436,1.0
250,0.0006,0.00034,1.0
300,0.0006,0.000273,1.0
350,0.0005,0.000238,1.0
400,0.0004,0.00022,1.0
450,0.0004,0.000208,1.0
500,0.0004,0.000204,1.0


✅ Lightweight DistilRoBERTa model trained and saved!
