In [31]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
import torch
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [None]:
# Load full dataset (no sampling)
df = pd.read_csv('dataset/amazon-fine-food-reviews/Reviews.csv')
df = df.dropna(subset=['Text', 'Score']).reset_index(drop=True)

X = df['Text']
y = df['Score'].astype(int)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # eval mode

def get_embedding(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :]
    return cls_embedding.cpu().numpy()

total = len(X)
increments = [0.2, 0.4, 0.6, 0.8, 1.0]

for frac in increments:
    size = math.ceil(total * frac)
    embeddings = []
    print(f"Generating embeddings for {int(frac*100)}% of data ({size} samples)")
    
    for i in tqdm(range(size)):
        emb = get_embedding(X.iloc[i], model, tokenizer)
        embeddings.append(emb)
    
    embeddings = np.array(embeddings)
    filename = f'amazon_bert_embeddings_{int(frac*100)}.npy'
    np.save(filename, embeddings)
    print(f"Saved embeddings to {filename}")


Generating embeddings for 100% of data (568454 samples)


  0%|          | 66/568454 [00:06<14:29:22, 10.90it/s]


KeyboardInterrupt: 

In [28]:
# Cell 2: Load embeddings, split data, train and evaluate logistic regression

embeddings = np.load('amazon_bert_embeddings.npy')

X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y, test_size=0.2, random_state=42, stratify=y)

clf = LogisticRegression(max_iter=1000, multi_class='ovr')  # multi-class logistic regression
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.50      0.62      0.55        21
           2       0.14      0.08      0.10        13
           3       0.20      0.12      0.15        16
           4       0.20      0.11      0.14        27
           5       0.74      0.85      0.79       123

    accuracy                           0.62       200
   macro avg       0.36      0.36      0.35       200
weighted avg       0.56      0.62      0.58       200

