In [21]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from time import perf_counter

# Resolve the notebook's working directory and put its sibling "src" folder on
# the import path (presumably where the preprocessor / models / vectorizer
# packages imported below live — confirm against the repo layout).
notebook_dir = Path().resolve()
src_dir = str(notebook_dir.parent / "src")
# Guarded so that re-running this cell does not keep appending the same entry.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from preprocessor.preprocessing import IMDBPreprocessor
from models.naive_bayes import NaiveBayes
from models.knn import KNN
from vectorizer.bag_of_words import BoWVectorizer  
from vectorizer.tfidf import TfidfVectorizerScratch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Parent of the notebook's working directory; used below as the base for
# data paths (project_root / "data" / ...).
project_root = notebook_dir.parent

In [22]:
# Read the reviews file, keep only the two columns used in this notebook,
# drop rows with a missing value in either, and renumber the index from 0.
reviews_path = project_root / "data" / "imdb_reviews.parquet"
df = (
    pd.read_parquet(reviews_path)[["review", "sentiment"]]
    .dropna()
    .reset_index(drop=True)
)

In [23]:
# Stratified hold-out split: 20% of the rows go to the test set, class
# proportions of `sentiment` are preserved, and the fixed seed makes the
# partition reproducible across runs.
# train_test_split comes from the notebook's first (imports) cell; the
# redundant mid-notebook re-import has been removed.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["review"].astype(str),
    df["sentiment"].astype(int),
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment"],
)

In [24]:
# Split every review on whitespace and materialise each split as a Python
# list (train first, then test) for the vectorizers used further down.
X_train_tok, X_test_tok = (
    texts.str.split().tolist() for texts in (X_train_text, X_test_text)
)

## BoW / TF-IDF vectorizers × Naive Bayes / KNN

In [25]:
# Zero-argument factories keyed by experiment name: calling an entry builds a
# new vectorizer object with the listed settings (all use min_df=2).
vectorizers = dict(
    bow=lambda: BoWVectorizer(binary=False, min_df=2),
    bow_binary=lambda: BoWVectorizer(binary=True, min_df=2),
    tfidf=lambda: TfidfVectorizerScratch(min_df=2, sublinear_tf=False, l2_norm=False),
    tfidf_norm=lambda: TfidfVectorizerScratch(min_df=2, sublinear_tf=True, l2_norm=True),
)

# Zero-argument factories keyed by model name: calling an entry builds a new
# classifier object with the listed hyper-parameters.
models = dict(
    naive_bayes=lambda: NaiveBayes(alpha=1.0),
    knn=lambda: KNN(k=7, distance_metric="cosine"),
)

In [26]:
# Evaluation grid: each vectorizer is fitted once on the training tokens, and
# the resulting train/test matrices are reused for every model.
#
# NOTE(review): the saved output of this cell is a MemoryError for a
# (19758, 63814) float32 array. `artifacts` stores a reference to X_tr / X_te
# for every vectorizer, so none of those matrices can be released while the
# dict is alive. Presumably the vectorizers return dense arrays — confirm, and
# consider keeping only what later cells actually read.
results = []    # one metrics row per (vectorizer, model) pair
artifacts = {}  # (vec_name, model_name) -> dict(vectorizer, model, y_pred, X_tr, X_te)

for v_name, v_ctor in vectorizers.items():
    vec = v_ctor()

    # vec_time spans fitting on the training split plus transforming both splits.
    t0 = perf_counter()
    X_tr = vec.fit_transform(X_train_tok)
    X_te = vec.transform(X_test_tok)
    vec_time = perf_counter() - t0

    for m_name, m_ctor in models.items():
        model = m_ctor()

        # Training time only.
        t1 = perf_counter()
        model.fit(X_tr, y_train.tolist())
        fit_time = perf_counter() - t1

        # Inference time over the whole test split.
        t2 = perf_counter()
        y_pred = model.predict(X_te)
        pred_time = perf_counter() - t2

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append(
            dict(
                vectorizer=v_name,
                model=m_name,
                accuracy=acc,
                f1=f1,
                vec_time_s=round(vec_time, 3),
                fit_time_s=round(fit_time, 3),
                pred_time_s=round(pred_time, 3),
            )
        )

        # The same vec / X_tr / X_te objects are shared by every model entry
        # of this vectorizer (references, not copies).
        artifacts[v_name, m_name] = dict(
            vectorizer=vec,
            model=model,
            y_pred=y_pred,
            X_tr=X_tr,
            X_te=X_te,
        )

# Leaderboard: best accuracy first, ties broken by F1.
results_df = (
    pd.DataFrame(results)
    .sort_values(["accuracy", "f1"], ascending=False)
    .reset_index(drop=True)
)
results_df

MemoryError: Unable to allocate 4.70 GiB for an array with shape (19758, 63814) and data type float32