In [3]:
# Load Data

import pandas as pd
from sklearn.model_selection import train_test_split

# Load raw text files
with open("data/rt-polarity.neg", "r", encoding="utf-8", errors="ignore") as f:
    texts_neg = f.read().splitlines()
with open("data/rt-polarity.pos", "r", encoding="utf-8", errors="ignore") as f:
    texts_pos = f.read().splitlines()

# Combine texts and labels into a DataFrame
data = [(text, 0) for text in texts_neg] + [(text, 1) for text in texts_pos]
df = pd.DataFrame(data, columns=["text", "label"])

# Shuffle and split into train/test sets
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

In [4]:
# Define Independent Evaluation Data and Helper Function

from sklearn.metrics import classification_report, accuracy_score

# Independent sentences and their true labels
independent_eval_data = {
    "sentences": [
        "I can't believe I wasted two hours on this.",
        "Absolutely stunning visuals and sound.",
        "Nothing made sense from start to finish.",
        "I would gladly watch it again.",
        "The characters were flat and uninteresting.",
        "A truly heartwarming story.",
        "Regretted not walking out halfway.",
        "The pacing was perfect throughout.",
        "It felt like a school project.",
        "One of the best experiences I've had recently.",
        "I didn't expect much, and I was right.",
        "Everything came together beautifully.",
        "It lacked soul and depth.",
        "A breath of fresh air!",
        "I've seen better student films.",
        "Brilliantly acted and directed.",
        "It was just okay, nothing more.",
        "Truly inspiring and thoughtful.",
        "Worst decision to spend money on this.",
        "Delivers on every level.",
        "I nearly fell asleep watching it.",
        "Left me with tears of joy.",
        "Painfully dull and repetitive.",
        "I felt connected to every character.",
        "Should've trusted the negative reviews.",
        "Exceeded all expectations!",
        "The dialogue felt forced and fake.",
        "I'll remember this for years to come.",
        "Just another generic story.",
        "Packed with emotion and honesty.",
        "Everything was overacted.",
        "An instant favorite of mine.",
        "Confusing and messy throughout.",
        "The cinematography was breathtaking.",
        "Avoid at all costs.",
        "Finally, something original!",
        "I couldn’t finish it.",
        "Perfect from start to finish.",
        "So bad it’s almost good.",
        "Uplifting and magical.",
        "The twist was completely predictable.",
        "Left me speechless in the best way.",
        "Cliché after cliché.",
        "Creative and full of surprises.",
        "What a waste of potential.",
        "Full of charm and wit.",
        "Annoyingly loud soundtrack.",
        "Simply delightful!",
        "The plot went nowhere.",
        "A total joy to watch.",
        "I’ve never cringed so much.",
        "Soothing and beautiful tone.",
        "Disappointing in every way.",
        "Deserves an award.",
        "Awkward performances throughout.",
        "A rare gem these days.",
        "I expected more from this director.",
        "Left me smiling the whole time.",
        "Boring, dry, and uneventful.",
        "Smart, funny, and touching.",
        "Felt like a chore to sit through.",
        "Absolutely nailed it.",
        "Zero chemistry between the leads.",
        "Full of heart and meaning.",
        "Not even my dog enjoyed it.",
        "I didn’t want it to end.",
        "Cringe-worthy in all the wrong ways.",
        "Memorable and masterfully told.",
        "Felt rushed and incomplete.",
        "A celebration of storytelling.",
        "I'd rather do taxes than watch this again.",
        "Hilarious from beginning to end.",
        "Left me emotionally drained — in a good way.",
        "Might be the worst thing I've ever seen.",
        "This film restored my faith in cinema.",
        "Cheap effects and poor editing.",
        "A deeply moving experience.",
        "Embarrassingly bad.",
        "Couldn't stop smiling the whole time.",
        "They clearly didn’t care about quality.",
        "Highly recommended!",
        "Too many plot holes to count.",
        "Incredibly well written.",
        "Poorly thought-out mess.",
        "An emotional rollercoaster.",
        "Visually noisy and confusing.",
        "Soft, gentle, and powerful.",
        "Why did I even watch this?",
        "Unforgettable in the best sense.",
        "Terribly disappointing.",
        "It gave me chills.",
        "Felt more like a parody.",
        "Exactly what I needed today.",
        "Not as clever as it thinks it is.",
        "It spoke directly to my heart.",
        "Wish I had skipped it.",
        "A masterclass in subtlety.",
        "The acting was unbearable.",
        "Loved every second of it.",
        "Just plain bad.",
        "It’s rare to see something this sincere."
    ],
    "labels": [
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        0, 1, 0, 1, 0,
        1, 0, 1, 0, 1,
        1, 0, 1, 0, 1,
        0
    ]
}

def evaluate_model_on_independent_data(model, model_name, vectorizer=None):
    """
    Evaluate a trained model on the independent evaluation set.
    If a vectorizer is provided, transform the sentences; otherwise, assume the model
    can handle raw text (e.g., BERT pipelines).
    Prints per-sentence predictions and a classification report.
    """
    print(f"\n--- Evaluation for: {model_name} ---")
    texts = independent_eval_data["sentences"]
    true_labels = independent_eval_data["labels"]

    if vectorizer is not None:
        features = vectorizer.transform(texts)
    else:
        features = texts  # model.predict should accept raw texts

    predictions = model.predict(features)

    print("\nPredictions on independent sentences:")
    for sentence, pred, true in zip(texts, predictions, true_labels):
        pred_label = "Positive" if pred == 1 else "Negative"
        true_label = "Positive" if true == 1 else "Negative"
        print(f"Pred: {pred_label:<8} | True: {true_label:<8} | {sentence}")

    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))
    acc = accuracy_score(true_labels, predictions)
    print(f"Accuracy on independent test set: {acc:.2f}")

In [None]:
# TF-IDF + Logistic Regression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Vectorize training and test texts
tfidf = TfidfVectorizer(min_df=3, max_features=10000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Train Logistic Regression on TF-IDF vectors
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train_vec, y_train)

# Evaluate on hold-out test set
y_pred = clf_tfidf.predict(X_test_vec)
print("TF-IDF + Logistic Regression (hold-out test)")
print(classification_report(y_test, y_pred))

# Evaluation by sentences
evaluate_model_on_independent_data(
    model=clf_tfidf,
    model_name="TF-IDF + Logistic Regression",
    vectorizer=tfidf
)

TF-IDF + Logistic Regression (hold-out test)
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1098
           1       0.73      0.76      0.74      1035

    accuracy                           0.75      2133
   macro avg       0.75      0.75      0.75      2133
weighted avg       0.75      0.75      0.75      2133


--- Evaluation for: TF-IDF + Logistic Regression ---

Predictions on independent sentences:
Pred: Negative | True: Negative | I can't believe I wasted two hours on this.
Pred: Positive | True: Positive | Absolutely stunning visuals and sound.
Pred: Negative | True: Negative | Nothing made sense from start to finish.
Pred: Negative | True: Positive | I would gladly watch it again.
Pred: Negative | True: Negative | The characters were flat and uninteresting.
Pred: Positive | True: Positive | A truly heartwarming story.
Pred: Negative | True: Negative | Regretted not walking out halfway.
Pred: Negative | True: Positive | T

In [None]:
# Word2Vec + Logistic Regression

import nltk
nltk.download("punkt")
import gensim
import numpy as np
from nltk.tokenize import word_tokenize

# Tokenize training corpus for Word2Vec training
X_train_tok = [word_tokenize(text.lower()) for text in X_train]

# Train Word2Vec model on the tokenized training texts
w2v_model = gensim.models.Word2Vec(
    sentences=X_train_tok,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

# Define a vectorizer class to compute average Word2Vec embeddings
class Word2VecVectorizer:
    def __init__(self, model, vector_size=100):
        self.model = model
        self.vector_size = vector_size

    def transform(self, texts):
        tokenized = [word_tokenize(text.lower()) for text in texts]
        return np.array([self.document_vector(tokens) for tokens in tokenized])

    def document_vector(self, tokens):
        words = [w for w in tokens if w in self.model.wv]
        if words:
            return np.mean([self.model.wv[w] for w in words], axis=0)
        else:
            return np.zeros(self.vector_size)

# Initialize vectorizer and transform train/test sets
w2v_vectorizer = Word2VecVectorizer(w2v_model)
X_train_w2v = w2v_vectorizer.transform(X_train)
X_test_w2v = w2v_vectorizer.transform(X_test)

# Train Logistic Regression on Word2Vec vectors
clf_w2v = LogisticRegression(max_iter=1000)
clf_w2v.fit(X_train_w2v, y_train)

# Evaluate on hold-out test set
y_pred_w2v = clf_w2v.predict(X_test_w2v)
print("\nWord2Vec + Logistic Regression (hold-out test)")
print(classification_report(y_test, y_pred_w2v))

# Evaluation by sentences
evaluate_model_on_independent_data(
    model=clf_w2v,
    model_name="Word2Vec + Logistic Regression",
    vectorizer=w2v_vectorizer
)

[nltk_data] Downloading package punkt to /Users/stacey_xd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Word2Vec + Logistic Regression (hold-out test)
              precision    recall  f1-score   support

           0       0.60      0.52      0.56      1098
           1       0.55      0.63      0.59      1035

    accuracy                           0.57      2133
   macro avg       0.58      0.58      0.57      2133
weighted avg       0.58      0.57      0.57      2133


--- Evaluation for: Word2Vec + Logistic Regression ---

Predictions on independent sentences:
Pred: Negative | True: Negative | I can't believe I wasted two hours on this.
Pred: Positive | True: Positive | Absolutely stunning visuals and sound.
Pred: Negative | True: Negative | Nothing made sense from start to finish.
Pred: Negative | True: Positive | I would gladly watch it again.
Pred: Positive | True: Negative | The characters were flat and uninteresting.
Pred: Positive | True: Positive | A truly heartwarming story.
Pred: Negative | True: Negative | Regretted not walking out halfway.
Pred: Negative | True: Positiv

In [None]:
# Word2Vec + XGBoost

from xgboost import XGBClassifier

# Transform train/test sets using existing Word2VecVectorizer
X_train_w2v = w2v_vectorizer.transform(X_train)
X_test_w2v = w2v_vectorizer.transform(X_test)

# Train XGBoost classifier
clf_xgb = XGBClassifier(eval_metric="logloss", random_state=42)
clf_xgb.fit(X_train_w2v, y_train)

# Evaluate on hold-out test set
y_pred_xgb = clf_xgb.predict(X_test_w2v)
print("\nWord2Vec + XGBoost (hold-out test)")
print(classification_report(y_test, y_pred_xgb))

# Evaluation by sentences
evaluate_model_on_independent_data(
    model=clf_xgb,
    model_name="Word2Vec + XGBoost",
    vectorizer=w2v_vectorizer
)


Word2Vec + XGBoost (hold-out test)
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      1098
           1       0.54      0.57      0.55      1035

    accuracy                           0.56      2133
   macro avg       0.56      0.56      0.56      2133
weighted avg       0.56      0.56      0.56      2133


--- Evaluation for: Word2Vec + XGBoost ---

Predictions on independent sentences:
Pred: Negative | True: Negative | I can't believe I wasted two hours on this.
Pred: Positive | True: Positive | Absolutely stunning visuals and sound.
Pred: Negative | True: Negative | Nothing made sense from start to finish.
Pred: Negative | True: Positive | I would gladly watch it again.
Pred: Negative | True: Negative | The characters were flat and uninteresting.
Pred: Positive | True: Positive | A truly heartwarming story.
Pred: Positive | True: Negative | Regretted not walking out halfway.
Pred: Negative | True: Positive | The pacing was perfe

In [None]:
# Word2Vec + Random Forest

from sklearn.ensemble import RandomForestClassifier

# Transform train/test sets using existing Word2VecVectorizer
X_train_w2v = w2v_vectorizer.transform(X_train)
X_test_w2v = w2v_vectorizer.transform(X_test)

# Train Random Forest classifier
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(X_train_w2v, y_train)

# Evaluate on hold-out test set
y_pred_rf = clf_rf.predict(X_test_w2v)
print("\nWord2Vec + Random Forest (hold-out test)")
print(classification_report(y_test, y_pred_rf))

# Evaluation by sentences
evaluate_model_on_independent_data(
    model=clf_rf,
    model_name="Word2Vec + Random Forest",
    vectorizer=w2v_vectorizer
)


Word2Vec + Random Forest (hold-out test)
              precision    recall  f1-score   support

           0       0.59      0.56      0.58      1098
           1       0.56      0.59      0.57      1035

    accuracy                           0.58      2133
   macro avg       0.58      0.58      0.58      2133
weighted avg       0.58      0.58      0.58      2133


--- Evaluation for: Word2Vec + Random Forest ---

Predictions on independent sentences:
Pred: Negative | True: Negative | I can't believe I wasted two hours on this.
Pred: Negative | True: Positive | Absolutely stunning visuals and sound.
Pred: Negative | True: Negative | Nothing made sense from start to finish.
Pred: Negative | True: Positive | I would gladly watch it again.
Pred: Positive | True: Negative | The characters were flat and uninteresting.
Pred: Positive | True: Positive | A truly heartwarming story.
Pred: Negative | True: Negative | Regretted not walking out halfway.
Pred: Negative | True: Positive | The paci

In [5]:
# DistilBERT

from transformers import pipeline

# Load pre-trained pipeline
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Predict a subset for performance reasons
X_test_sample = X_test.iloc[:1000]
y_test_sample = y_test.iloc[:1000]

preds = [1 if r['label'] == 'POSITIVE' else 0 for r in classifier(X_test_sample.tolist(), truncation=True)]
print("DistilBERT")
print(classification_report(y_test_sample, preds))

Device set to use mps:0


DistilBERT
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       534
           1       0.87      0.89      0.88       466

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000



In [None]:
# DistilBERT Evaluation on Independent Data

from transformers import pipeline

# Initialize pre-trained DistilBERT sentiment-analysis pipeline
distilbert_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# Wrap the pipeline so it supports .predict(texts)
class DistilBERTWrapper:
    def __init__(self, hf_pipeline):
        self.pipeline = hf_pipeline

    def predict(self, sentences):
        results = self.pipeline(sentences, truncation=True)
        return [1 if r["label"] == "POSITIVE" else 0 for r in results]

clf_distilbert = DistilBERTWrapper(distilbert_pipeline)

# Evaluation by sentences
evaluate_model_on_independent_data(
    model=clf_distilbert,
    model_name="DistilBERT",
    vectorizer=None
)

Device set to use mps:0



--- Evaluation for: DistilBERT ---

Predictions on independent sentences:
Pred: Negative | True: Negative | I can't believe I wasted two hours on this.
Pred: Positive | True: Positive | Absolutely stunning visuals and sound.
Pred: Negative | True: Negative | Nothing made sense from start to finish.
Pred: Positive | True: Positive | I would gladly watch it again.
Pred: Negative | True: Negative | The characters were flat and uninteresting.
Pred: Positive | True: Positive | A truly heartwarming story.
Pred: Negative | True: Negative | Regretted not walking out halfway.
Pred: Positive | True: Positive | The pacing was perfect throughout.
Pred: Negative | True: Negative | It felt like a school project.
Pred: Positive | True: Positive | One of the best experiences I've had recently.
Pred: Positive | True: Negative | I didn't expect much, and I was right.
Pred: Positive | True: Positive | Everything came together beautifully.
Pred: Negative | True: Negative | It lacked soul and depth.
Pred:

In [None]:
# BERT Training and Evaluation

import torch
from transformers import (
    AutoTokenizer,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    get_scheduler
)
from torch.utils.data import DataLoader
from datasets import Dataset
from sklearn.metrics import classification_report
from tqdm import tqdm
import random

# Combine and shuffle texts and labels
texts = texts_pos + texts_neg  # recall texts_pos (label=1) and texts_neg (label=0)
labels = [1] * len(texts_pos) + [0] * len(texts_neg)
combined = list(zip(texts, labels))
random.shuffle(combined)
texts, labels = zip(*combined)

# Create HuggingFace dataset and split into train/test
dataset = Dataset.from_dict({"text": list(texts), "label": list(labels)})
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

# Tokenization function
def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding=True)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

# Remove the original text column
train_dataset = train_dataset.remove_columns("text")
test_dataset = test_dataset.remove_columns("text")

# Prepare DataLoaders
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
train_loader = DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator
)
test_loader = DataLoader(
    test_dataset, batch_size=8, shuffle=False, collate_fn=data_collator
)

# Set device (MPS or CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Running on:", device)
model.to(device)

# Set up optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 4
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}")
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation after each epoch
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            labels_batch = batch["labels"]
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, axis=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels_batch.numpy())

    print(f"\nClassification report after epoch {epoch + 1}:")
    print(classification_report(all_labels, all_preds))
    model.train()

# Save the best model from the final epoch
model.save_pretrained("bert_sentiment")
tokenizer.save_pretrained("bert_sentiment")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8529 [00:00<?, ? examples/s]

Map:   0%|          | 0/2133 [00:00<?, ? examples/s]

Running on: mps

Epoch 1


100%|██████████| 1067/1067 [08:03<00:00,  2.21it/s]



Classification report after epoch 1:
              precision    recall  f1-score   support

           0       0.90      0.80      0.85      1051
           1       0.83      0.91      0.87      1082

    accuracy                           0.86      2133
   macro avg       0.86      0.86      0.86      2133
weighted avg       0.86      0.86      0.86      2133


Epoch 2


100%|██████████| 1067/1067 [07:54<00:00,  2.25it/s]



Classification report after epoch 2:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1051
           1       0.87      0.89      0.88      1082

    accuracy                           0.88      2133
   macro avg       0.88      0.88      0.88      2133
weighted avg       0.88      0.88      0.88      2133


Epoch 3


100%|██████████| 1067/1067 [07:54<00:00,  2.25it/s]



Classification report after epoch 3:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      1051
           1       0.89      0.88      0.88      1082

    accuracy                           0.88      2133
   macro avg       0.88      0.88      0.88      2133
weighted avg       0.88      0.88      0.88      2133


Epoch 4


100%|██████████| 1067/1067 [07:53<00:00,  2.26it/s]



Classification report after epoch 4:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      1051
           1       0.87      0.89      0.88      1082

    accuracy                           0.88      2133
   macro avg       0.88      0.88      0.88      2133
weighted avg       0.88      0.88      0.88      2133



('bert_sentiment/tokenizer_config.json',
 'bert_sentiment/special_tokens_map.json',
 'bert_sentiment/vocab.txt',
 'bert_sentiment/added_tokens.json',
 'bert_sentiment/tokenizer.json')

In [None]:
# Load Saved BERT Model and Evaluate on Independent Sentences

from transformers import AutoTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import classification_report, accuracy_score

# Path to the locally saved BERT model folder
model_path = "./bert_sentiment"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Set device and switch to evaluation mode
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
model.eval()

# Tokenize independent sentences in a batch
texts = independent_eval_data["sentences"]
true_labels = independent_eval_data["labels"]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, axis=1).cpu().numpy()

# Print per-sentence prediction vs true label
print("\n--- Evaluation for: BERT ---")
for sent, pred, true in zip(texts, predictions, true_labels):
    pred_label = "Positive" if pred == 1 else "Negative"
    true_label = "Positive" if true == 1 else "Negative"
    print(f"Pred: {pred_label:<8} | True: {true_label:<8} | {sent}")

# Print classification report and accuracy
print("\nClassification Report:")
print(classification_report(true_labels, predictions))
print(f"Accuracy on independent test set: {accuracy_score(true_labels, predictions):.2f}")


--- Evaluation for: BERT ---
Pred: Negative | True: Negative | I can't believe I wasted two hours on this.
Pred: Positive | True: Positive | Absolutely stunning visuals and sound.
Pred: Negative | True: Negative | Nothing made sense from start to finish.
Pred: Positive | True: Positive | I would gladly watch it again.
Pred: Negative | True: Negative | The characters were flat and uninteresting.
Pred: Positive | True: Positive | A truly heartwarming story.
Pred: Positive | True: Negative | Regretted not walking out halfway.
Pred: Positive | True: Positive | The pacing was perfect throughout.
Pred: Negative | True: Negative | It felt like a school project.
Pred: Positive | True: Positive | One of the best experiences I've had recently.
Pred: Positive | True: Negative | I didn't expect much, and I was right.
Pred: Positive | True: Positive | Everything came together beautifully.
Pred: Negative | True: Negative | It lacked soul and depth.
Pred: Positive | True: Positive | A breath of fres

In [11]:
from tabulate import tabulate

data_holdout = [
    ["TF-IDF + Logistic Regression",       0.75, 0.75, 0.75, 0.75],
    ["Word2Vec + Logistic Regression",     0.60, 0.57, 0.59, 0.57],
    ["Word2Vec + XGBoost",                 0.56, 0.56, 0.56, 0.56],
    ["Word2Vec + Random Forest",           0.58, 0.58, 0.58, 0.58],
    ["DistilBERT",                         0.89, 0.89, 0.89, 0.89],
    ["BERT (epoch 4)",                     0.88, 0.88, 0.88, 0.88],
]

headers = ["Model", "Precision", "Recall", "F1-score", "Accuracy"]
print("Model quality metrics on the test dataset:")
print(tabulate(data_holdout, headers=headers, tablefmt="grid"))

Model quality metrics on the test dataset:
+--------------------------------+-------------+----------+------------+------------+
| Model                          |   Precision |   Recall |   F1-score |   Accuracy |
| TF-IDF + Logistic Regression   |        0.75 |     0.75 |       0.75 |       0.75 |
+--------------------------------+-------------+----------+------------+------------+
| Word2Vec + Logistic Regression |        0.6  |     0.57 |       0.59 |       0.57 |
+--------------------------------+-------------+----------+------------+------------+
| Word2Vec + XGBoost             |        0.56 |     0.56 |       0.56 |       0.56 |
+--------------------------------+-------------+----------+------------+------------+
| Word2Vec + Random Forest       |        0.58 |     0.58 |       0.58 |       0.58 |
+--------------------------------+-------------+----------+------------+------------+
| DistilBERT                     |        0.89 |     0.89 |       0.89 |       0.89 |
+----------

In [14]:
from tabulate import tabulate

data_holdout = [
    ["TF-IDF + Logistic Regression",   0.72, 0.71, 0.71, 0.71],
    ["Word2Vec + Logistic Regression", 0.59, 0.58, 0.58, 0.58],
    ["Word2Vec + XGBoost",             0.59, 0.58, 0.58, 0.58],
    ["Word2Vec + Random Forest",       0.59, 0.56, 0.53, 0.56],
    ["DistilBERT",                     0.85, 0.85, 0.85, 0.85],
    ["BERT",                           0.87, 0.87, 0.87, 0.87],
]

headers = ["Model", "Precision", "Recall", "F1-score", "Accuracy"]
print("Model quality metrics on an independent dataset:")
print(tabulate(data_holdout, headers=headers, tablefmt="grid"))

Model quality metrics on an independent dataset:
+--------------------------------+-------------+----------+------------+------------+
| Model                          |   Precision |   Recall |   F1-score |   Accuracy |
| TF-IDF + Logistic Regression   |        0.72 |     0.71 |       0.71 |       0.71 |
+--------------------------------+-------------+----------+------------+------------+
| Word2Vec + Logistic Regression |        0.59 |     0.58 |       0.58 |       0.58 |
+--------------------------------+-------------+----------+------------+------------+
| Word2Vec + XGBoost             |        0.59 |     0.58 |       0.58 |       0.58 |
+--------------------------------+-------------+----------+------------+------------+
| Word2Vec + Random Forest       |        0.59 |     0.56 |       0.53 |       0.56 |
+--------------------------------+-------------+----------+------------+------------+
| DistilBERT                     |        0.85 |     0.85 |       0.85 |       0.85 |
+----

### Model Performance Analysis

**Top Performers: BERT & DistilBERT (Accuracy: ~0.85-0.89)**  
These Transformer-based models demonstrated superior performance by a large margin. Their key advantage is **contextual understanding**, allowing them to interpret a word's meaning based on the entire sentence. This leads to a more nuanced and accurate classification. Additionally, the BERT model was trained over four epochs, with the final epoch yielding the best results.

**Strong Baseline: TF-IDF + Logistic Regression (Accuracy: ~0.71-0.75)**  
This classical approach performed reasonably well. Its strength lies in identifying **important keywords** that are highly indicative of a specific class. While it ignores context and grammar, it proved to be an effective baseline for this task.

**Underperformers: Word2Vec-based Models (Accuracy: ~0.56-0.59)**  
All models using Word2Vec embeddings showed poor results, barely surpassing a random guess. The primary reason is the **loss of information** when averaging non-contextual word vectors to create a single document representation. This method dilutes the impact of key predictive words, providing a weak signal to the classifier.

**Conclusion on Datasets**  
The slight performance decrease on the **independent dataset** compared to the test set is expected. It provides a more realistic measure of each model's ability to generalize to new, unseen data. The overall ranking and performance gap between model types remained consistent, confirming the superiority of the Transformer architecture for this problem.
