## modeling + evaluation

notebook goal: train supervised ML models to predict review sentiment, then evaluate the outcomes

notebook todo:
- [x] interpret data
- [x] handle class imbalance
- [x] understand tradeoffs in training basic models


In [29]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [30]:
# Paths are relative to the notebook's working directory.
PROJECT_ROOT = Path("..")
DATA_PATH = PROJECT_ROOT.joinpath("data", "processed", "reviews_clean.csv")

# Read the processed reviews file and show (rows, columns) as a quick sanity check.
reviews_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
reviews_df.shape


(199985, 5)

In [31]:
# Clean the frame in a single chained pass:
#   1. drop rows where text or label is missing,
#   2. coerce text to str and trim surrounding whitespace,
#   3. discard rows whose text is empty after trimming,
#   4. cast labels to int.
reviews_df = (
    reviews_df
    .dropna(subset=["text", "label"])
    .assign(text=lambda frame: frame["text"].astype(str).str.strip())
    .loc[lambda frame: frame["text"].str.len() > 0]
    .assign(label=lambda frame: frame["label"].astype(int))
)

reviews_df.shape


(199611, 5)

In [32]:
# Targets are the labels; features are the raw review text (vectorized in a later step).
y = reviews_df["label"]
X_text = reviews_df["text"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, stratify=y, random_state=42
)

(X_train_text.shape, X_test_text.shape)


((159688,), (39923,))

In [33]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Negation words are removed from the stop list so they are NOT filtered out
# and can survive as unigram / bigram features.
NEGATION_WORDS = {"not", "no", "never"}

# sorted() instead of list(): iterating a set of strings has no stable order
# across interpreter runs (string hashes are randomized per process), so a plain
# list() would store a differently-ordered stop list on the vectorizer -- and in
# the pickled artifact -- every time the notebook is re-run.
custom_stopwords = sorted(ENGLISH_STOP_WORDS - NEGATION_WORDS)

# Lower-cased unigrams + bigrams, vocabulary capped at 20,000 features.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words=custom_stopwords,
    max_features=20_000,
    ngram_range=(1, 2)
)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer so no test-set statistics leak into the features.
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

X_train.shape, X_test.shape


((159688, 20000), (39923, 20000))

### baseline all positive regression

In [34]:
# Logistic regression with manual class weights: errors on class 0 cost 1.6x
# as much as errors on class 1. verbose=1 asks the estimator for progress output.
model = LogisticRegression(
    class_weight={0: 1.6, 1: 1},
    verbose=1,
    max_iter=1000,
)

# Last expression of the cell: fit() hands back the estimator, which is displayed.
model.fit(X=X_train, y=y_train)

In [35]:
# Score the fitted model on the held-out test matrix.
y_pred = model.predict(X_test)

# confusion_matrix follows scikit-learn's layout: rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

for caption, value in (("accuracy:", acc), ("confusion matrix:\n", cm)):
    print(caption, value)

accuracy: 0.9416627006988453
confusion matrix:
 [[ 2193  1509]
 [  820 35401]]


In [36]:
# Per-class precision / recall / F1 on the test split, printed to four decimals.
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

              precision    recall  f1-score   support

           0     0.7278    0.5924    0.6532      3702
           1     0.9591    0.9774    0.9682     36221

    accuracy                         0.9417     39923
   macro avg     0.8435    0.7849    0.8107     39923
weighted avg     0.9377    0.9417    0.9389     39923



In [37]:
import joblib

# Output folder for fitted objects; created (with any missing parents) if absent.
ARTIFACTS_DIR = PROJECT_ROOT.joinpath("results", "artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)

# Save the fitted vectorizer and the classifier as separate files in that folder.
for artifact, filename in ((tfidf, "tfidf.joblib"), (model, "logreg_model.joblib")):
    joblib.dump(artifact, ARTIFACTS_DIR / filename)

ARTIFACTS_DIR

PosixPath('../results/artifacts')

In [38]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    f1_score
)

### naive bayes

In [39]:
# Trivial reference point: predict class 1 for every test row.
y_pred_baseline = np.ones_like(y_test)

baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_macro_f1 = f1_score(y_test, y_pred_baseline, average="macro")
baseline_cm = confusion_matrix(y_test, y_pred_baseline)

print("accuracy:", baseline_accuracy)
print("macro f1:", baseline_macro_f1)
print("confusion matrix:\n", baseline_cm)

accuracy: 0.9072714976329435
macro f1: 0.4756907963857953
confusion matrix:
 [[    0  3702]
 [    0 36221]]


In [40]:
# Multinomial naive Bayes with default settings, trained on the TF-IDF features.
nb_model = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_macro_f1 = f1_score(y_test, y_pred_nb, average="macro")
nb_cm = confusion_matrix(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb, digits=4)

print("accuracy:", nb_accuracy)
print("macro f1:", nb_macro_f1)
print("confusion matrix:\n", nb_cm)
print(nb_report)

accuracy: 0.9323698118878841
macro f1: 0.7387604016121768
confusion matrix:
 [[ 1427  2275]
 [  425 35796]]
              precision    recall  f1-score   support

           0     0.7705    0.3855    0.5139      3702
           1     0.9402    0.9883    0.9637     36221

    accuracy                         0.9324     39923
   macro avg     0.8554    0.6869    0.7388     39923
weighted avg     0.9245    0.9324    0.9219     39923



### weighted regression

In [41]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with manual class weights (class 0 weighted 1.6x relative to class 1).
logreg_model = LogisticRegression(class_weight={0: 1.6, 1: 1}, max_iter=1000)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_lr = logreg_model.fit(X_train, y_train).predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_macro_f1 = f1_score(y_test, y_pred_lr, average="macro")
lr_cm = confusion_matrix(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr, digits=4)

print("accuracy:", lr_accuracy)
print("macro f1:", lr_macro_f1)
print("confusion matrix:\n", lr_cm)
print(lr_report)

accuracy: 0.9416627006988453
macro f1: 0.8106587986981573
confusion matrix:
 [[ 2193  1509]
 [  820 35401]]
              precision    recall  f1-score   support

           0     0.7278    0.5924    0.6532      3702
           1     0.9591    0.9774    0.9682     36221

    accuracy                         0.9417     39923
   macro avg     0.8435    0.7849    0.8107     39923
weighted avg     0.9377    0.9417    0.9389     39923



In [42]:
# Display name -> test-set predictions; insertion order is the row order of the table.
predictions_by_model = {
    "baseline (always positive)": y_pred_baseline,
    "naive bayes": y_pred_nb,
    "logistic regression (weighted)": y_pred_lr,
}

# One row per model, scored against the same y_test with the same two metrics.
results = pd.DataFrame(
    [
        {
            "model": label,
            "accuracy": accuracy_score(y_test, preds),
            "macro_f1": f1_score(y_test, preds, average="macro"),
        }
        for label, preds in predictions_by_model.items()
    ]
)

results

Unnamed: 0,model,accuracy,macro_f1
0,baseline (always positive),0.907271,0.475691
1,naive bayes,0.93237,0.73876
2,logistic regression (weighted),0.941663,0.810659


### evaluation <= notebooks 1-3

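- accuracy is a poor performance metric here because the classes are heavily imbalanced (always predicting positive already scores ~0.907 accuracy)
  - macro F1 was a stronger metric for performance, since it weights both classes equally
- the best result was logistic regression with manual class weighting
- the data remains difficult for short reviews and sarcastic text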
