In [1]:
!pip install gensim
!pip install torch torchvision
!pip install scikit-learn
!pip install nltk

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

from gensim.models import Word2Vec
import gensim.downloader as api

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<torch._C.Generator at 0x7bf776fc6490>

## 1) Dataset Generation

- Load the HW1 Amazon CSV (must have columns `reviewText` and `overall`).
- Build a balanced dataset: *PER_RATING_COUNT* samples for each rating 1..5.
- Map ratings to ternary labels: >3 → 1 (pos), <3 → 2 (neg), ==3 → 3 (neutral).
- Save to cache to avoid re-computation.

In [33]:
import pandas as pd

file_path = "amazon_reviews_us_Office_Products_v1_00.tsv"

df = pd.read_csv(
    file_path,
    sep='\t',
    engine='python',       # FIXES your parser error
    on_bad_lines='skip',   # skips malformed rows

)

print("Loaded shape:", df.shape)
print(df.columns)
df = df[['star_rating', 'review_body']]

df['star_rating'] = df['star_rating'].astype(str)

print(df.head())
print(df['star_rating'].value_counts())


Loaded shape: (2637159, 15)
Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')
  star_rating                                        review_body
0           5                                     Great product.
1           5  What's to say about this commodity item except...
2           5    Haven't used yet, but I am sure I will like it.
3           1  Although this was labeled as &#34;new&#34; the...
4           4                    Gorgeous colors and easy to use
star_rating
5    1580941
4     417975
1     306576
3     193452
2     138215
Name: count, dtype: int64


In [34]:
samples_per_rating = 50000
random_state = 42

balanced_dfs = []

for rating in ['1', '2', '3', '4', '5']:
    subset = df[df['star_rating'] == rating]

    sampled_subset = subset.sample(
        n=samples_per_rating,
        random_state=random_state
    )

    balanced_dfs.append(sampled_subset)

balanced_df = pd.concat(balanced_dfs)

# Shuffle final dataset
balanced_df = balanced_df.sample(
    frac=1,
    random_state=random_state
).reset_index(drop=True)

print("Balanced shape:", balanced_df.shape)
print(balanced_df['star_rating'].value_counts())


Balanced shape: (250000, 2)
star_rating
1    50000
2    50000
3    50000
4    50000
5    50000
Name: count, dtype: int64


In [35]:
balanced_df.to_csv("balanced_250k.csv", index=False)


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import os

# CONFIG
BALANCED_CSV = "balanced_250k.csv"
OUTPUT_DIR = "processed_data"
RANDOM_STATE = 42
TEST_SIZE = 0.20

os.makedirs(OUTPUT_DIR, exist_ok=True)

balanced_df = pd.read_csv(BALANCED_CSV)

def rating_to_sentiment(r):
    r = float(r)
    if r > 3:
        return 1  # positive
    elif r < 3:
        return 2  # negative
    else:
        return 3  # neutral

balanced_df['sentiment'] = balanced_df['star_rating'].apply(rating_to_sentiment)

print("Star rating distribution:")
print(balanced_df['star_rating'].value_counts().sort_index())

print("\nSentiment distribution:")
print(balanced_df['sentiment'].value_counts().sort_index())

train_df, test_df = train_test_split(
    balanced_df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=balanced_df['sentiment']
)

print(f"\nTrain size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


train_df.to_csv(os.path.join(OUTPUT_DIR, "train_80pct.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_DIR, "test_20pct.csv"), index=False)


torch.save({
    'train_reviews': train_df['review_body'].astype(str).tolist(),
    'train_sentiments': torch.tensor(train_df['sentiment'].values, dtype=torch.long),
    'test_reviews': test_df['review_body'].astype(str).tolist(),
    'test_sentiments': torch.tensor(test_df['sentiment'].values, dtype=torch.long),
}, os.path.join(OUTPUT_DIR, "balanced_250k_torch.pt"))

print("Saved everything successfully.")


Star rating distribution:
star_rating
1    50000
2    50000
3    50000
4    50000
5    50000
Name: count, dtype: int64

Sentiment distribution:
sentiment
1    100000
2    100000
3     50000
Name: count, dtype: int64

Train size: 200000
Test size: 50000
Saved everything successfully.


## 2. Word Embedding
-Load the pretrained “word2vec-google-news-300”
-Train a Word2Vec model using your own dataset


In [42]:
import gensim.downloader as api

w2v_pretrained = api.load("word2vec-google-news-300")

print("Loaded successfully")

Loaded successfully


In [43]:
w2v_pretrained.most_similar(
    positive=['king','woman'],
    negative=['man']
)


[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]

In [44]:
w2v_pretrained.similarity('excellent','outstanding')

np.float32(0.55674857)

In [45]:
class ReviewSentenceIterator:
    def __init__(self, csv_path, text_col='review_body'):
        self.csv_path = csv_path
        self.text_col = text_col

    def __iter__(self):
        for chunk in pd.read_csv(self.csv_path, usecols=[self.text_col], chunksize=5000, dtype=str):
            for text in chunk[self.text_col].astype(str).values:
                yield simple_preprocess(text, deacc=True)


In [46]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, KeyedVectors

TRAIN_CSV = "processed_data/train_80pct.csv"
MODEL_OUT_DIR = "models"
os.makedirs(MODEL_OUT_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_OUT_DIR, "w2v_amazon_office_products_300_window11_min10_sg1.bin")

VECTOR_SIZE = 300
WINDOW = 11
MIN_COUNT = 10
SEED = 42
WORKERS = 4
SG = 1

sentences = ReviewSentenceIterator(TRAIN_CSV, text_col='review_body')

print("Building vocabulary...")
model = Word2Vec(
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS,
    seed=SEED,
    sg=SG
)

model.build_vocab(sentences)
print(f"Vocab size after build_vocab: {len(model.wv.key_to_index)}")

print("Training model (this can take a while)...")
model.train(
    ReviewSentenceIterator(TRAIN_CSV, text_col='review_body'),
    total_examples=model.corpus_count,
    epochs=5
)

model.save(MODEL_PATH)
model.wv.save_word2vec_format(MODEL_PATH + ".kv.bin", binary=True)
print("Saved model to:", MODEL_PATH)


Building vocabulary...
Vocab size after build_vocab: 15017
Training model (this can take a while)...
Saved model to: models/w2v_amazon_office_products_300_window11_min10_sg1.bin


In [47]:

w2v_own = Word2Vec.load(MODEL_PATH)

# analogy: king - man + woman
try:
    results = w2v_own.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=10)
    print("Analogy (own model) king - man + woman -> top results:")
    for w, score in results[:6]:
        print(f"  {w:15s} {score:.4f}")
except KeyError as e:
    print("Analogy failed (word not in vocab):", e)

# similarity: excellent vs outstanding
try:
    sim_own = w2v_own.wv.similarity('excellent', 'outstanding')
    print(f"\nSimilarity(own model) excellent vs outstanding = {sim_own:.6f}")
except KeyError as e:
    print("Similarity failed (word not in vocab):", e)


Analogy (own model) king - man + woman -> top results:
  magnum          0.5426
  chisel          0.3962
  gamers          0.3940
  urban           0.3924
  bros            0.3888
  rb              0.3885

Similarity(own model) excellent vs outstanding = 0.722054


## 3) Simple models

In [49]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from gensim.utils import simple_preprocess

# Load data
train_df = pd.read_csv("processed_data/train_80pct.csv")
test_df = pd.read_csv("processed_data/test_20pct.csv")

# Keep only class 1 and 2 (binary)
train_df = train_df[train_df['sentiment'].isin([1,2])]
test_df = test_df[test_df['sentiment'].isin([1,2])]

print("Train shape (binary):", train_df.shape)
print("Test shape (binary):", test_df.shape)


Train shape (binary): (160000, 3)
Test shape (binary): (40000, 3)


In [55]:
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

# downloads (run once; safe to call repeatedly)
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# setup
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# text cleaning
def clean_review(text):
    text = str(text).lower()                       # lowercase and ensure string
    text = re.sub(r"<.*?>", " ", text)             # remove HTML tags
    text = re.sub(r"http\S+|www\S+", " ", text)    # remove URLs
    text = re.sub(r"[^a-z\s]", " ", text)          # keep letters and spaces only
    text = re.sub(r"\s+", " ", text).strip()       # collapse & trim whitespace
    return text

# lemmatize plain whitespace-tokenized string
def lemmatize_review(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(tokens)

# remove stopwords from plain whitespace-tokenized string
def remove_stopwords(text):
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

# unified preprocess: clean -> lemmatize -> remove stopwords -> gensim tokenize
def preprocess(text):
    text = clean_review(text)
    text = lemmatize_review(text)
    text = remove_stopwords(text)
    # final tokenization with gensim's simple_preprocess (deacc removes accents/punctuation)
    return simple_preprocess(text, deacc=True)

# safe vector lookup & averaging (works if 'model' is Word2Vec or KeyedVectors)
def review_to_avg_vector(review, model):
    # get kv = model.wv if necessary (works both when model is Word2Vec or KeyedVectors)
    kv = getattr(model, "wv", model)

    tokens = preprocess(review)
    vectors = []

    for word in tokens:
        if word in kv.key_to_index:
            vectors.append(kv[word])

    # return zero vector if none in vocab
    if len(vectors) == 0:
        # try to determine vector_size safely
        vector_size = getattr(kv, "vector_size", None)
        if vector_size is None:
            # fallback for older gensim APIs
            vector_size = getattr(model, "vector_size", 300)
        return np.zeros(vector_size, dtype=float)

    return np.mean(vectors, axis=0)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [56]:
# X_train and X_test using pretrained model
X_train_pre = np.array([
    review_to_avg_vector(text, w2v_pretrained)
    for text in train_df['review_body']
])

X_test_pre = np.array([
    review_to_avg_vector(text, w2v_pretrained)
    for text in test_df['review_body']
])

y_train = train_df['sentiment'].values
y_test = test_df['sentiment'].values


In [57]:
X_train_own = np.array([
    review_to_avg_vector(text, w2v_own.wv)
    for text in train_df['review_body']
])

X_test_own = np.array([
    review_to_avg_vector(text, w2v_own.wv)
    for text in test_df['review_body']
])


In [58]:
# Perceptron
perc_pre = Perceptron(random_state=42)
perc_pre.fit(X_train_pre, y_train)
y_pred = perc_pre.predict(X_test_pre)
acc_perc_pre = accuracy_score(y_test, y_pred)

# SVM
svm_pre = LinearSVC(random_state=42)
svm_pre.fit(X_train_pre, y_train)
y_pred = svm_pre.predict(X_test_pre)
acc_svm_pre = accuracy_score(y_test, y_pred)

print("Pretrained - Perceptron Accuracy:", acc_perc_pre)
print("Pretrained - SVM Accuracy:", acc_svm_pre)


Pretrained - Perceptron Accuracy: 0.7858
Pretrained - SVM Accuracy: 0.817275


In [59]:
# Perceptron
perc_own = Perceptron(random_state=42)
perc_own.fit(X_train_own, y_train)
y_pred = perc_own.predict(X_test_own)
acc_perc_own = accuracy_score(y_test, y_pred)

# SVM
svm_own = LinearSVC(random_state=42)
svm_own.fit(X_train_own, y_train)
y_pred = svm_own.predict(X_test_own)
acc_svm_own = accuracy_score(y_test, y_pred)

print("Own W2V - Perceptron Accuracy:", acc_perc_own)
print("Own W2V - SVM Accuracy:", acc_svm_own)


Own W2V - Perceptron Accuracy: 0.731125
Own W2V - SVM Accuracy: 0.855025


4. Feedforward Neural Networks

In [88]:

TRAIN_CSV = "processed_data/train_80pct.csv"
TEST_CSV  = "processed_data/test_20pct.csv"
W2V_PRETRAINED_PATH = None   # if you have local preloaded key-vector file (optional)
W2V_OWN_PATH = "models/w2v_amazon_office_products_300_window11_min10_sg1.bin"  # as per your earlier code

# Vector size used when training your own model (must match)
VECTOR_SIZE = 300

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)
print("Train shape:", train_df.shape, "Test shape:", test_df.shape)
print("Sentiment unique values:", sorted(train_df['sentiment'].unique()))

Train shape: (200000, 3) Test shape: (50000, 3)
Sentiment unique values: [np.int64(1), np.int64(2), np.int64(3)]


In [89]:

from gensim.models import KeyedVectors, Word2Vec

USE_PRETRAINED = True    # set False to skip Google-News pretrained (big download)
USE_OWN = True           # set True to load your own trained model file

w2v_pretrained = None
w2v_own = None

if USE_PRETRAINED:
    try:
        # if you previously downloaded 'word2vec-google-news-300' via gensim api, you can use it here
        import gensim.downloader as api
        print("Loading pretrained GoogleNews (this is large; skip if not available)...")
        w2v_pretrained = api.load("word2vec-google-news-300")
        print("Loaded GoogleNews model.")
    except Exception as e:
        print("Could not load pretrained via gensim API (skip or load manually).", e)
        w2v_pretrained = None

if USE_OWN:
    try:
        print("Loading your own Word2Vec model from:", W2V_OWN_PATH)
        w2v_own = Word2Vec.load(W2V_OWN_PATH)
        print("Loaded own model; vocab size:", len(w2v_own.wv.key_to_index))
    except Exception as e:
        print("Couldn't load own model:", e)
        w2v_own = None

# Choose the embedding to use later by name: 'pretrained' | 'own'

Loading pretrained GoogleNews (this is large; skip if not available)...
Loaded GoogleNews model.
Loading your own Word2Vec model from: models/w2v_amazon_office_products_300_window11_min10_sg1.bin
Loaded own model; vocab size: 15017


In [90]:

import re
from gensim.utils import simple_preprocess
import nltk
nltk.download("stopwords", quiet=True)

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    text = clean_text(text)
    return simple_preprocess(text, deacc=True)  # list of tokens

In [91]:

def avg_w2v_vector(tokens, kv, vector_size=VECTOR_SIZE):
    """
    Returns average vector (numpy array). If no tokens in vocab -> zero vector.
    kv may be a KeyedVectors or Word2Vec.wv object.
    """
    if kv is None:
        return np.zeros(vector_size, dtype=np.float32)
    vecs = []
    for t in tokens:
        if t in kv.key_to_index:
            vecs.append(kv[t])
    if len(vecs) == 0:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0).astype(np.float32)

def concat_first_k_vectors(tokens, kv, k=10, vector_size=VECTOR_SIZE):
    """
    Concatenate first k token vectors. If token missing -> zero vector.
    If fewer than k tokens -> pad with zero vectors.
    Returns numpy array length k * vector_size.
    """
    vecs = []
    for i in range(k):
        if i < len(tokens):
            t = tokens[i]
            if kv is not None and t in kv.key_to_index:
                vecs.append(kv[t])
            else:
                vecs.append(np.zeros(vector_size, dtype=np.float32))
        else:
            vecs.append(np.zeros(vector_size, dtype=np.float32))
    return np.concatenate(vecs).astype(np.float32)

In [92]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels, features):
        """
        reviews: list of strings (optional, kept for debug)
        labels: numpy array or list of ints
        features: numpy array shape (N, D)
        """
        assert len(labels) == len(features)
        self.reviews = reviews
        self.X = features.astype(np.float32)
        self.y = np.array(labels, dtype=np.int64)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [93]:

class MLP(nn.Module):
    def __init__(self, input_dim, hidden1=50, hidden2=10, n_classes=2, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden2, n_classes)
        )

    def forward(self, x):
        return self.net(x)

In [94]:

def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(dataloader.dataset)

def eval_model(model, dataloader):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            out = model(X_batch)
            pred = torch.argmax(out, dim=1).cpu().numpy()
            preds.extend(pred.tolist())
            trues.extend(y_batch.numpy().tolist())
    return np.array(preds), np.array(trues)

def run_training(X_train, y_train, X_val, y_val, n_classes=2,
                 hidden1=50, hidden2=10, lr=1e-3, weight_decay=1e-5,
                 batch_size=256, epochs=10, patience=None):
    # scale features (fit on train)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    train_ds = ReviewsDataset(None, y_train, X_train_scaled)
    val_ds   = ReviewsDataset(None, y_val, X_val_scaled)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2)

    model = MLP(input_dim=X_train.shape[1], hidden1=hidden1, hidden2=hidden2, n_classes=n_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0.0
    best_state = None

    for epoch in range(1, epochs+1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        preds_val, trues_val = eval_model(model, val_loader)
        val_acc = accuracy_score(trues_val, preds_val)

        print(f"Epoch {epoch}/{epochs} - train_loss: {train_loss:.4f} - val_acc: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict()

    # restore best
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, scaler

In [95]:

def build_features_for_df(df, embed_source='pretrained', mode='avg', k=10, kv_pretrained=None, kv_own=None):
    """
    embed_source: 'pretrained' | 'own'
    mode: 'avg' | 'concat'
    k: number of tokens to concat for 'concat' mode
    """
    if embed_source == 'pretrained':
        kv = kv_pretrained
    elif embed_source == 'own':
        kv = kv_own.wv if hasattr(kv_own, 'wv') else kv_own
    else:
        kv = None

    reviews = df['review_body'].astype(str).tolist()
    tokens_list = [tokenize(r) for r in reviews]

    features = []
    for tokens in tqdm(tokens_list, desc=f"Building features mode={mode} src={embed_source}"):
        if mode == 'avg':
            vec = avg_w2v_vector(tokens, kv, vector_size=VECTOR_SIZE)
        elif mode == 'concat':
            vec = concat_first_k_vectors(tokens, kv, k=k, vector_size=VECTOR_SIZE)
        else:
            raise ValueError("mode must be 'avg' or 'concat'")
        features.append(vec)
    features = np.vstack(features)
    return features

In [96]:

import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
def run_experiments(embed_source='pretrained', mode='avg', k=10,
                    hidden1=50, hidden2=10, epochs=12, batch_size=256):
    """
    embed_source: 'pretrained' or 'own'
    mode: 'avg' or 'concat'
    """
    # Prepare dataframes for binary and ternary
    # Binary: sentiments 1 (positive label=1) and 2 (negative label=0) -- NOTE: original mapping uses 1=positive,2=negative
    # We'll map them to 0/1 for PyTorch labels.
    # Ternary: 1,2,3 -> map to 0,1,2

    # --- Binary dataset (keep only sentiment==1 or 2)
    train_bin = train_df[train_df['sentiment'].isin([1,2])].reset_index(drop=True)
    test_bin  = test_df[test_df['sentiment'].isin([1,2])].reset_index(drop=True)
    print("Binary sizes -> train:", len(train_bin), "test:", len(test_bin))

    # Build features
    feat_train_bin = build_features_for_df(train_bin, embed_source=embed_source, mode=mode, k=k,
                                          kv_pretrained=w2v_pretrained, kv_own=w2v_own)
    feat_test_bin  = build_features_for_df(test_bin,  embed_source=embed_source, mode=mode, k=k,
                                          kv_pretrained=w2v_pretrained, kv_own=w2v_own)

    # labels: map 1->1, 2->0 (or you can map 1->0,2->1 depending on preference)
    y_train_bin = (train_bin['sentiment'].astype(int).values == 1).astype(int)   # 1 -> True -> 1
    y_test_bin  = (test_bin['sentiment'].astype(int).values == 1).astype(int)

    # Train/val split inside train set (small val for checkpointing)
    from sklearn.model_selection import train_test_split
    X_tr_bin, X_val_bin, y_tr_bin, y_val_bin = train_test_split(
        feat_train_bin, y_train_bin, test_size=0.1, random_state=SEED, stratify=y_train_bin
    )

    model_bin, scaler_bin = run_training(
        X_tr_bin, y_tr_bin, X_val_bin, y_val_bin,
        n_classes=2, hidden1=hidden1, hidden2=hidden2,
        lr=1e-3, weight_decay=1e-5, batch_size=batch_size, epochs=epochs
    )

    # Evaluate on test
    X_test_bin_scaled = scaler_bin.transform(feat_test_bin)
    test_ds = ReviewsDataset(None, y_test_bin, X_test_bin_scaled)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)
    preds_bin, trues_bin = eval_model(model_bin, test_loader)
    acc_bin = accuracy_score(trues_bin, preds_bin)
    print("\n=== BINARY RESULTS ===")
    print("Test Accuracy:", acc_bin)
    print("Classification report:\n", classification_report(trues_bin, preds_bin, digits=4))
    print("Confusion matrix:\n", confusion_matrix(trues_bin, preds_bin))

    # --- Ternary dataset
    train_ter = train_df[train_df['sentiment'].isin([1,2,3])].reset_index(drop=True)
    test_ter  = test_df[test_df['sentiment'].isin([1,2,3])].reset_index(drop=True)
    print("\nTernary sizes -> train:", len(train_ter), "test:", len(test_ter))

    feat_train_ter = build_features_for_df(train_ter, embed_source=embed_source, mode=mode, k=k,
                                           kv_pretrained=w2v_pretrained, kv_own=w2v_own)
    feat_test_ter  = build_features_for_df(test_ter,  embed_source=embed_source, mode=mode, k=k,
                                           kv_pretrained=w2v_pretrained, kv_own=w2v_own)
    # map sentiments 1,2,3 -> 0,1,2
    y_train_ter = (train_ter['sentiment'].astype(int).values - 1).astype(int)
    y_test_ter  = (test_ter['sentiment'].astype(int).values - 1).astype(int)

    X_tr_ter, X_val_ter, y_tr_ter, y_val_ter = train_test_split(
        feat_train_ter, y_train_ter, test_size=0.1, random_state=SEED, stratify=y_train_ter
    )

    model_ter, scaler_ter = run_training(
        X_tr_ter, y_tr_ter, X_val_ter, y_val_ter,
        n_classes=3, hidden1=hidden1, hidden2=hidden2,
        lr=1e-3, weight_decay=1e-5, batch_size=batch_size, epochs=epochs
    )

    X_test_ter_scaled = scaler_ter.transform(feat_test_ter)
    test_ds_ter = ReviewsDataset(None, y_test_ter, X_test_ter_scaled)
    test_loader_ter = DataLoader(test_ds_ter, batch_size=batch_size, shuffle=False, num_workers=2)
    preds_ter, trues_ter = eval_model(model_ter, test_loader_ter)
    acc_ter = accuracy_score(trues_ter, preds_ter)
    print("\n=== TERNARY RESULTS ===")
    print("Test Accuracy:", acc_ter)
    print("Classification report:\n", classification_report(trues_ter, preds_ter, digits=4))
    print("Confusion matrix:\n", confusion_matrix(trues_ter, preds_ter))

    return {
        'binary': {'acc': acc_bin, 'preds': preds_bin, 'trues': trues_bin},
        'ternary': {'acc': acc_ter, 'preds': preds_ter, 'trues': trues_ter},
        'models': {'binary': model_bin, 'ternary': model_ter},
        'scalers': {'binary': scaler_bin, 'ternary': scaler_ter}
    }

In [97]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
res_own_avg = run_experiments(embed_source='own', mode='avg', k=10, hidden1=50, hidden2=10, epochs=12, batch_size=256)
res_own_concat = run_experiments(embed_source='own', mode='concat', k=10, hidden1=50, hidden2=10, epochs=12, batch_size=256)

res_pre_avg = run_experiments(embed_source='pretrained', mode='avg', k=10, hidden1=50, hidden2=10, epochs=12, batch_size=256)
res_pre_concat = run_experiments(embed_source='pretrained', mode='concat', k=10, hidden1=50, hidden2=10, epochs=12, batch_size=256)

Binary sizes -> train: 160000 test: 40000


Building features mode=avg src=own: 100%|██████████| 160000/160000 [00:23<00:00, 6698.00it/s]
Building features mode=avg src=own: 100%|██████████| 40000/40000 [00:06<00:00, 6188.63it/s]


Epoch 1/12 - train_loss: 0.3321 - val_acc: 0.8814
Epoch 2/12 - train_loss: 0.2949 - val_acc: 0.8861
Epoch 3/12 - train_loss: 0.2868 - val_acc: 0.8887
Epoch 4/12 - train_loss: 0.2788 - val_acc: 0.8876
Epoch 5/12 - train_loss: 0.2740 - val_acc: 0.8891
Epoch 6/12 - train_loss: 0.2712 - val_acc: 0.8884
Epoch 7/12 - train_loss: 0.2675 - val_acc: 0.8889
Epoch 8/12 - train_loss: 0.2643 - val_acc: 0.8891
Epoch 9/12 - train_loss: 0.2610 - val_acc: 0.8911
Epoch 10/12 - train_loss: 0.2594 - val_acc: 0.8904
Epoch 11/12 - train_loss: 0.2580 - val_acc: 0.8897
Epoch 12/12 - train_loss: 0.2568 - val_acc: 0.8898

=== BINARY RESULTS ===
Test Accuracy: 0.89075
Classification report:
               precision    recall  f1-score   support

           0     0.8886    0.8935    0.8911     20000
           1     0.8930    0.8880    0.8904     20000

    accuracy                         0.8908     40000
   macro avg     0.8908    0.8907    0.8907     40000
weighted avg     0.8908    0.8908    0.8907     40000


Building features mode=avg src=own: 100%|██████████| 200000/200000 [00:29<00:00, 6713.05it/s]
Building features mode=avg src=own: 100%|██████████| 50000/50000 [00:08<00:00, 6056.70it/s]


Epoch 1/12 - train_loss: 0.7278 - val_acc: 0.7174
Epoch 2/12 - train_loss: 0.6846 - val_acc: 0.7206
Epoch 3/12 - train_loss: 0.6754 - val_acc: 0.7246
Epoch 4/12 - train_loss: 0.6681 - val_acc: 0.7218
Epoch 5/12 - train_loss: 0.6625 - val_acc: 0.7240
Epoch 6/12 - train_loss: 0.6597 - val_acc: 0.7239
Epoch 7/12 - train_loss: 0.6558 - val_acc: 0.7244
Epoch 8/12 - train_loss: 0.6521 - val_acc: 0.7265
Epoch 9/12 - train_loss: 0.6496 - val_acc: 0.7274
Epoch 10/12 - train_loss: 0.6484 - val_acc: 0.7287
Epoch 11/12 - train_loss: 0.6469 - val_acc: 0.7274
Epoch 12/12 - train_loss: 0.6452 - val_acc: 0.7273

=== TERNARY RESULTS ===
Test Accuracy: 0.72692
Classification report:
               precision    recall  f1-score   support

           0     0.7579    0.8606    0.8060     20000
           1     0.7459    0.8427    0.7914     20000
           2     0.4857    0.2279    0.3102     10000

    accuracy                         0.7269     50000
   macro avg     0.6632    0.6437    0.6359     50000

Building features mode=concat src=own: 100%|██████████| 160000/160000 [00:06<00:00, 23225.45it/s]
Building features mode=concat src=own: 100%|██████████| 40000/40000 [00:01<00:00, 39589.71it/s]


Epoch 1/12 - train_loss: 0.4562 - val_acc: 0.7953
Epoch 2/12 - train_loss: 0.4115 - val_acc: 0.7975
Epoch 3/12 - train_loss: 0.3904 - val_acc: 0.7999
Epoch 4/12 - train_loss: 0.3723 - val_acc: 0.8023
Epoch 5/12 - train_loss: 0.3549 - val_acc: 0.8011
Epoch 6/12 - train_loss: 0.3400 - val_acc: 0.7980
Epoch 7/12 - train_loss: 0.3262 - val_acc: 0.7995
Epoch 8/12 - train_loss: 0.3117 - val_acc: 0.7959
Epoch 9/12 - train_loss: 0.3019 - val_acc: 0.7972
Epoch 10/12 - train_loss: 0.2920 - val_acc: 0.8015
Epoch 11/12 - train_loss: 0.2812 - val_acc: 0.7996
Epoch 12/12 - train_loss: 0.2735 - val_acc: 0.7996

=== BINARY RESULTS ===
Test Accuracy: 0.79945
Classification report:
               precision    recall  f1-score   support

           0     0.8075    0.7864    0.7968     20000
           1     0.7918    0.8125    0.8020     20000

    accuracy                         0.7994     40000
   macro avg     0.7997    0.7994    0.7994     40000
weighted avg     0.7997    0.7994    0.7994     40000


Building features mode=concat src=own: 100%|██████████| 200000/200000 [00:06<00:00, 32080.09it/s]
Building features mode=concat src=own: 100%|██████████| 50000/50000 [00:01<00:00, 40013.51it/s]


Epoch 1/12 - train_loss: 0.8518 - val_acc: 0.6444
Epoch 2/12 - train_loss: 0.7984 - val_acc: 0.6491
Epoch 3/12 - train_loss: 0.7776 - val_acc: 0.6494
Epoch 4/12 - train_loss: 0.7607 - val_acc: 0.6485
Epoch 5/12 - train_loss: 0.7456 - val_acc: 0.6491
Epoch 6/12 - train_loss: 0.7317 - val_acc: 0.6477
Epoch 7/12 - train_loss: 0.7196 - val_acc: 0.6488
Epoch 8/12 - train_loss: 0.7084 - val_acc: 0.6477
Epoch 9/12 - train_loss: 0.6987 - val_acc: 0.6466
Epoch 10/12 - train_loss: 0.6867 - val_acc: 0.6462
Epoch 11/12 - train_loss: 0.6770 - val_acc: 0.6444
Epoch 12/12 - train_loss: 0.6696 - val_acc: 0.6447

=== TERNARY RESULTS ===
Test Accuracy: 0.64454
Classification report:
               precision    recall  f1-score   support

           0     0.6533    0.7990    0.7188     20000
           1     0.6729    0.7254    0.6982     20000
           2     0.4372    0.1740    0.2489     10000

    accuracy                         0.6445     50000
   macro avg     0.5878    0.5661    0.5553     50000

Building features mode=avg src=pretrained: 100%|██████████| 160000/160000 [00:22<00:00, 7208.71it/s]
Building features mode=avg src=pretrained: 100%|██████████| 40000/40000 [00:05<00:00, 6746.32it/s]


Epoch 1/12 - train_loss: 0.3990 - val_acc: 0.8453
Epoch 2/12 - train_loss: 0.3561 - val_acc: 0.8512
Epoch 3/12 - train_loss: 0.3457 - val_acc: 0.8562
Epoch 4/12 - train_loss: 0.3383 - val_acc: 0.8566
Epoch 5/12 - train_loss: 0.3331 - val_acc: 0.8610
Epoch 6/12 - train_loss: 0.3267 - val_acc: 0.8590
Epoch 7/12 - train_loss: 0.3228 - val_acc: 0.8618
Epoch 8/12 - train_loss: 0.3198 - val_acc: 0.8599
Epoch 9/12 - train_loss: 0.3171 - val_acc: 0.8638
Epoch 10/12 - train_loss: 0.3150 - val_acc: 0.8616
Epoch 11/12 - train_loss: 0.3111 - val_acc: 0.8636
Epoch 12/12 - train_loss: 0.3093 - val_acc: 0.8629

=== BINARY RESULTS ===
Test Accuracy: 0.862675
Classification report:
               precision    recall  f1-score   support

           0     0.8551    0.8733    0.8641     20000
           1     0.8706    0.8520    0.8612     20000

    accuracy                         0.8627     40000
   macro avg     0.8628    0.8627    0.8627     40000
weighted avg     0.8628    0.8627    0.8627     40000

Building features mode=avg src=pretrained: 100%|██████████| 200000/200000 [00:28<00:00, 7047.64it/s]
Building features mode=avg src=pretrained: 100%|██████████| 50000/50000 [00:07<00:00, 6635.72it/s]


Epoch 1/12 - train_loss: 0.7955 - val_acc: 0.6853
Epoch 2/12 - train_loss: 0.7488 - val_acc: 0.6905
Epoch 3/12 - train_loss: 0.7349 - val_acc: 0.6927
Epoch 4/12 - train_loss: 0.7277 - val_acc: 0.6952
Epoch 5/12 - train_loss: 0.7229 - val_acc: 0.6986
Epoch 6/12 - train_loss: 0.7191 - val_acc: 0.6964
Epoch 7/12 - train_loss: 0.7153 - val_acc: 0.6987
Epoch 8/12 - train_loss: 0.7109 - val_acc: 0.6991
Epoch 9/12 - train_loss: 0.7091 - val_acc: 0.7000
Epoch 10/12 - train_loss: 0.7070 - val_acc: 0.6999
Epoch 11/12 - train_loss: 0.7049 - val_acc: 0.7014
Epoch 12/12 - train_loss: 0.7023 - val_acc: 0.6987

=== TERNARY RESULTS ===
Test Accuracy: 0.70138
Classification report:
               precision    recall  f1-score   support

           0     0.7549    0.8081    0.7806     20000
           1     0.6859    0.8673    0.7660     20000
           2     0.4732    0.1561    0.2348     10000

    accuracy                         0.7014     50000
   macro avg     0.6380    0.6105    0.5938     50000

Building features mode=concat src=pretrained: 100%|██████████| 160000/160000 [00:04<00:00, 34788.67it/s]
Building features mode=concat src=pretrained: 100%|██████████| 40000/40000 [00:01<00:00, 38890.95it/s]


Epoch 1/12 - train_loss: 0.4996 - val_acc: 0.7682
Epoch 2/12 - train_loss: 0.4506 - val_acc: 0.7789
Epoch 3/12 - train_loss: 0.4292 - val_acc: 0.7812
Epoch 4/12 - train_loss: 0.4113 - val_acc: 0.7869
Epoch 5/12 - train_loss: 0.3947 - val_acc: 0.7846
Epoch 6/12 - train_loss: 0.3822 - val_acc: 0.7837
Epoch 7/12 - train_loss: 0.3705 - val_acc: 0.7866
Epoch 8/12 - train_loss: 0.3573 - val_acc: 0.7835
Epoch 9/12 - train_loss: 0.3446 - val_acc: 0.7841
Epoch 10/12 - train_loss: 0.3371 - val_acc: 0.7848
Epoch 11/12 - train_loss: 0.3267 - val_acc: 0.7799
Epoch 12/12 - train_loss: 0.3193 - val_acc: 0.7813

=== BINARY RESULTS ===
Test Accuracy: 0.783375
Classification report:
               precision    recall  f1-score   support

           0     0.7983    0.7583    0.7778     20000
           1     0.7699    0.8084    0.7887     20000

    accuracy                         0.7834     40000
   macro avg     0.7841    0.7834    0.7832     40000
weighted avg     0.7841    0.7834    0.7832     40000

Building features mode=concat src=pretrained: 100%|██████████| 200000/200000 [00:05<00:00, 35544.64it/s]
Building features mode=concat src=pretrained: 100%|██████████| 50000/50000 [00:01<00:00, 37204.99it/s]


Epoch 1/12 - train_loss: 0.8844 - val_acc: 0.6214
Epoch 2/12 - train_loss: 0.8340 - val_acc: 0.6272
Epoch 3/12 - train_loss: 0.8123 - val_acc: 0.6307
Epoch 4/12 - train_loss: 0.7963 - val_acc: 0.6303
Epoch 5/12 - train_loss: 0.7831 - val_acc: 0.6338
Epoch 6/12 - train_loss: 0.7706 - val_acc: 0.6358
Epoch 7/12 - train_loss: 0.7599 - val_acc: 0.6364
Epoch 8/12 - train_loss: 0.7498 - val_acc: 0.6360
Epoch 9/12 - train_loss: 0.7400 - val_acc: 0.6355
Epoch 10/12 - train_loss: 0.7308 - val_acc: 0.6338
Epoch 11/12 - train_loss: 0.7242 - val_acc: 0.6342
Epoch 12/12 - train_loss: 0.7149 - val_acc: 0.6311

=== TERNARY RESULTS ===
Test Accuracy: 0.6328
Classification report:
               precision    recall  f1-score   support

           0     0.6621    0.7528    0.7045     20000
           1     0.6318    0.7550    0.6879     20000
           2     0.4416    0.1485    0.2223     10000

    accuracy                         0.6328     50000
   macro avg     0.5785    0.5521    0.5382     50000


5. Convolutional Neural Networks

In [76]:
MAX_LEN = 50              # max review length (tokens)
EMBED_DIM = 300           # must match your W2V vector size
BATCH_SIZE = 256
EPOCHS = 12
LR = 1e-3
DROPOUT = 0.2

In [79]:

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)
print("Train:", train_df.shape, "Test:", test_df.shape)

# Load your Word2Vec model (if not already in session)
from gensim.models import Word2Vec
w2v_own = Word2Vec.load(W2V_OWN_PATH)
print("Loaded own w2v; vocab size:", len(w2v_own.wv.key_to_index))
kv = w2v_own.wv

Train: (200000, 3) Test: (50000, 3)
Loaded own w2v; vocab size: 15017


In [80]:

reserved_tokens = ["<PAD>"]  # index 0

token_to_idx = {}
idx = 1
for tok in kv.key_to_index:   # iterates tokens in kv
    token_to_idx[tok] = idx
    idx += 1

vocab_size = idx  # includes padding index 0
print("Vocab size (with PAD=0):", vocab_size)

# Build embedding matrix: shape (vocab_size, EMBED_DIM)
# index 0 = zeros
embedding_matrix = np.zeros((vocab_size, EMBED_DIM), dtype=np.float32)
for tok, i in token_to_idx.items():
    embedding_matrix[i] = kv[tok]  # copy vector

# Optionally convert to torch tensor later
embedding_matrix = torch.tensor(embedding_matrix)

Vocab size (with PAD=0): 15018


In [98]:

def text_to_idx_sequence(text, token_to_idx, max_len=MAX_LEN):
    tokens = tokenize(text)
    indices = []
    for t in tokens[:max_len]:
        indices.append(token_to_idx.get(t, 0))  # 0 for PAD/OOV
    # pad if shorter
    if len(indices) < max_len:
        indices.extend([0] * (max_len - len(indices)))
    return indices

# Quick test
print(text_to_idx_sequence("This is a short test review. Not bad!", token_to_idx)[:10])

[7, 5, 489, 687, 304, 11, 208, 0, 0, 0]


In [99]:

class SeqReviewsDataset(Dataset):
    def __init__(self, texts, labels, token_to_idx, max_len=MAX_LEN):
        self.texts = texts
        self.labels = np.array(labels, dtype=np.int64)
        self.token_to_idx = token_to_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        t = self.texts[idx]
        seq = text_to_idx_sequence(t, self.token_to_idx, self.max_len)
        return torch.tensor(seq, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

In [100]:

class SimpleCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, emb_matrix=None, conv_channels=(50,10), kernel_size=3, dropout=0.2, n_classes=2, freeze_embeddings=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if emb_matrix is not None:
            self.embedding.weight.data.copy_(emb_matrix)
        if freeze_embeddings:
            self.embedding.weight.requires_grad = False

        c1, c2 = conv_channels
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=c1, kernel_size=kernel_size, padding=kernel_size//2)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=c1, out_channels=c2, kernel_size=kernel_size, padding=kernel_size//2)

        # after conv2 we do global max pooling over sequence length -> produces c2 features
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(c2, n_classes)

    def forward(self, x):
        # x: (batch, seq_len) long
        emb = self.embedding(x)                # (batch, seq_len, embed_dim)
        emb = emb.permute(0, 2, 1)             # (batch, embed_dim, seq_len) for Conv1d
        h = self.conv1(emb)                    # (batch, c1, seq_len)
        h = self.relu(h)
        h = self.conv2(h)                      # (batch, c2, seq_len)
        h = self.relu(h)
        # global max pooling over seq_len
        h, _ = torch.max(h, dim=2)             # (batch, c2)
        h = self.dropout(h)
        logits = self.fc(h)                    # (batch, n_classes)
        return logits

In [101]:

def train_epoch_cnn(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(loader.dataset)

def eval_cnn(model, loader):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            out = model(X_batch)
            pred = torch.argmax(out, dim=1).cpu().numpy()
            preds.extend(pred.tolist())
            trues.extend(y_batch.numpy().tolist())
    return np.array(preds), np.array(trues)

In [102]:

def run_cnn_experiment(train_df, test_df, token_to_idx, emb_matrix, max_len=MAX_LEN, conv_channels=(50,10),
                       kernel_size=3, freeze_embeddings=True, epochs=EPOCHS, batch_size=BATCH_SIZE,
                       lr=LR, dropout=DROPOUT):
    results = {}

    # --- Binary: sentiments 1 & 2
    train_bin = train_df[train_df['sentiment'].isin([1,2])].reset_index(drop=True)
    test_bin  = test_df[test_df['sentiment'].isin([1,2])].reset_index(drop=True)
    print("Binary sizes ->", len(train_bin), len(test_bin))
    # map labels: 1 -> 1, 2 -> 0  (you can also map vice-versa)
    y_train_bin = (train_bin['sentiment'].astype(int).values == 1).astype(int)
    y_test_bin  = (test_bin['sentiment'].astype(int).values == 1).astype(int)

    train_ds_bin = SeqReviewsDataset(train_bin['review_body'].astype(str).tolist(), y_train_bin, token_to_idx, max_len)
    test_ds_bin  = SeqReviewsDataset(test_bin['review_body'].astype(str).tolist(), y_test_bin, token_to_idx, max_len)

    tr_loader = DataLoader(train_ds_bin, batch_size=batch_size, shuffle=True, num_workers=2)
    te_loader = DataLoader(test_ds_bin, batch_size=batch_size, shuffle=False, num_workers=2)

    model_bin = SimpleCNN(vocab_size=emb_matrix.shape[0], embed_dim=emb_matrix.shape[1],
                          emb_matrix=emb_matrix, conv_channels=conv_channels, kernel_size=kernel_size,
                          dropout=dropout, n_classes=2, freeze_embeddings=freeze_embeddings).to(device)

    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model_bin.parameters()), lr=lr)
    criterion = nn.CrossEntropyLoss()

    best_acc = 0.0
    best_state = None
    for epoch in range(1, epochs+1):
        tr_loss = train_epoch_cnn(model_bin, tr_loader, optimizer, criterion)
        preds_val, trues_val = eval_cnn(model_bin, te_loader)
        acc = accuracy_score(trues_val, preds_val)
        print(f"Epoch {epoch}/{epochs} - train_loss: {tr_loss:.4f} - test_acc: {acc:.4f}")
        if acc > best_acc:
            best_acc = acc
            best_state = model_bin.state_dict()

    if best_state is not None:
        model_bin.load_state_dict(best_state)

    preds_bin, trues_bin = eval_cnn(model_bin, te_loader)
    acc_bin = accuracy_score(trues_bin, preds_bin)
    print("\n=== BINARY TEST ACC:", acc_bin)
    print(classification_report(trues_bin, preds_bin, digits=4))
    print("Confusion:\n", confusion_matrix(trues_bin, preds_bin))
    results['binary'] = {'acc': acc_bin, 'preds': preds_bin, 'trues': trues_bin}

    # --- Ternary: sentiments 1,2,3
    train_ter = train_df[train_df['sentiment'].isin([1,2,3])].reset_index(drop=True)
    test_ter  = test_df[test_df['sentiment'].isin([1,2,3])].reset_index(drop=True)
    print("\nTernary sizes ->", len(train_ter), len(test_ter))

    # labels -> 0,1,2
    y_train_ter = (train_ter['sentiment'].astype(int).values - 1).astype(int)
    y_test_ter  = (test_ter['sentiment'].astype(int).values - 1).astype(int)

    train_ds_ter = SeqReviewsDataset(train_ter['review_body'].astype(str).tolist(), y_train_ter, token_to_idx, max_len)
    test_ds_ter  = SeqReviewsDataset(test_ter['review_body'].astype(str).tolist(), y_test_ter, token_to_idx, max_len)

    tr_loader_ter = DataLoader(train_ds_ter, batch_size=batch_size, shuffle=True, num_workers=2)
    te_loader_ter = DataLoader(test_ds_ter, batch_size=batch_size, shuffle=False, num_workers=2)

    model_ter = SimpleCNN(vocab_size=emb_matrix.shape[0], embed_dim=emb_matrix.shape[1],
                          emb_matrix=emb_matrix, conv_channels=conv_channels, kernel_size=kernel_size,
                          dropout=dropout, n_classes=3, freeze_embeddings=freeze_embeddings).to(device)

    optimizer2 = torch.optim.Adam(filter(lambda p: p.requires_grad, model_ter.parameters()), lr=lr)
    criterion2 = nn.CrossEntropyLoss()

    best_acc2 = 0.0
    best_state2 = None
    for epoch in range(1, epochs+1):
        tr_loss = train_epoch_cnn(model_ter, tr_loader_ter, optimizer2, criterion2)
        preds_val, trues_val = eval_cnn(model_ter, te_loader_ter)
        acc = accuracy_score(trues_val, preds_val)
        print(f"Epoch {epoch}/{epochs} - train_loss: {tr_loss:.4f} - test_acc: {acc:.4f}")
        if acc > best_acc2:
            best_acc2 = acc
            best_state2 = model_ter.state_dict()

    if best_state2 is not None:
        model_ter.load_state_dict(best_state2)

    preds_ter, trues_ter = eval_cnn(model_ter, te_loader_ter)
    acc_ter = accuracy_score(trues_ter, preds_ter)
    print("\n=== TERNARY TEST ACC:", acc_ter)
    print(classification_report(trues_ter, preds_ter, digits=4))
    print("Confusion:\n", confusion_matrix(trues_ter, preds_ter))
    results['ternary'] = {'acc': acc_ter, 'preds': preds_ter, 'trues': trues_ter}

    return results

In [103]:

# Convert embedding_matrix to float tensor on CPU (the SimpleCNN copies it into model)
emb_matrix = embedding_matrix.float().cpu()
res = run_cnn_experiment(train_df, test_df, token_to_idx, emb_matrix,
                         max_len=MAX_LEN, conv_channels=(50,10),
                         kernel_size=3, freeze_embeddings=True,
                         epochs=EPOCHS, batch_size=BATCH_SIZE, lr=LR, dropout=DROPOUT)

Binary sizes -> 160000 40000
Epoch 1/12 - train_loss: 0.3760 - test_acc: 0.8659
Epoch 2/12 - train_loss: 0.3175 - test_acc: 0.8770
Epoch 3/12 - train_loss: 0.3017 - test_acc: 0.8834
Epoch 4/12 - train_loss: 0.2894 - test_acc: 0.8840
Epoch 5/12 - train_loss: 0.2767 - test_acc: 0.8864
Epoch 6/12 - train_loss: 0.2685 - test_acc: 0.8878
Epoch 7/12 - train_loss: 0.2601 - test_acc: 0.8893
Epoch 8/12 - train_loss: 0.2514 - test_acc: 0.8890
Epoch 9/12 - train_loss: 0.2438 - test_acc: 0.8872
Epoch 10/12 - train_loss: 0.2367 - test_acc: 0.8887
Epoch 11/12 - train_loss: 0.2290 - test_acc: 0.8908
Epoch 12/12 - train_loss: 0.2230 - test_acc: 0.8882

=== BINARY TEST ACC: 0.888175
              precision    recall  f1-score   support

           0     0.8767    0.9034    0.8899     20000
           1     0.9004    0.8729    0.8864     20000

    accuracy                         0.8882     40000
   macro avg     0.8885    0.8882    0.8881     40000
weighted avg     0.8885    0.8882    0.8881     40000

In [104]:
# ===== PRETRAINED CNN VERSION =====

import gensim.downloader as api
from collections import Counter

print("Loading pretrained GoogleNews vectors...")
w2v_pretrained = api.load("word2vec-google-news-300")
kv_pre = w2v_pretrained
print("Loaded pretrained vectors.")

# Build dataset-specific vocabulary (only words appearing in train+test)
print("Building compact vocabulary from dataset...")
all_texts = pd.concat([
    train_df['review_body'].astype(str),
    test_df['review_body'].astype(str)
]).tolist()

token_set = set()
for text in all_texts:
    token_set.update(tokenize(text))

print("Unique dataset tokens:", len(token_set))

# Create token -> index mapping (0 reserved for PAD)
token_to_idx_pre = {}
idx = 1
for tok in token_set:
    token_to_idx_pre[tok] = idx
    idx += 1

vocab_size_pre = idx
print("Compact vocab size:", vocab_size_pre)

# Build compact embedding matrix
EMBED_DIM = kv_pre.vector_size
embedding_matrix_pre = np.zeros((vocab_size_pre, EMBED_DIM), dtype=np.float32)

missing = 0
for tok, i in token_to_idx_pre.items():
    if tok in kv_pre.key_to_index:
        embedding_matrix_pre[i] = kv_pre[tok]
    else:
        missing += 1

print("Missing tokens from pretrained:", missing)

# Convert to torch tensor
emb_matrix_pre = torch.tensor(embedding_matrix_pre).float().cpu()

# Run CNN experiment (same structure as your own model)
res_pre = run_cnn_experiment(
    train_df,
    test_df,
    token_to_idx_pre,
    emb_matrix_pre,
    max_len=MAX_LEN,
    conv_channels=(50,10),
    kernel_size=3,
    freeze_embeddings=True,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    lr=LR,
    dropout=DROPOUT
)

print("\n===== PRETRAINED CNN RESULTS =====")
print("Binary Accuracy:", res_pre['binary']['acc'])
print("Ternary Accuracy:", res_pre['ternary']['acc'])

Loading pretrained GoogleNews vectors...
Loaded pretrained vectors.
Building compact vocabulary from dataset...
Unique dataset tokens: 70895
Compact vocab size: 70896
Missing tokens from pretrained: 26605
Binary sizes -> 160000 40000
Epoch 1/12 - train_loss: 0.4394 - test_acc: 0.8423
Epoch 2/12 - train_loss: 0.3523 - test_acc: 0.8629
Epoch 3/12 - train_loss: 0.3299 - test_acc: 0.8696
Epoch 4/12 - train_loss: 0.3142 - test_acc: 0.8710
Epoch 5/12 - train_loss: 0.3028 - test_acc: 0.8730
Epoch 6/12 - train_loss: 0.2922 - test_acc: 0.8748
Epoch 7/12 - train_loss: 0.2827 - test_acc: 0.8763
Epoch 8/12 - train_loss: 0.2752 - test_acc: 0.8780
Epoch 9/12 - train_loss: 0.2683 - test_acc: 0.8761
Epoch 10/12 - train_loss: 0.2606 - test_acc: 0.8772
Epoch 11/12 - train_loss: 0.2544 - test_acc: 0.8783
Epoch 12/12 - train_loss: 0.2478 - test_acc: 0.8761

=== BINARY TEST ACC: 0.876125
              precision    recall  f1-score   support

           0     0.8801    0.8710    0.8755     20000
           