In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Locate the cleaned reviews CSV. PROJECT_ROOT is the parent of the current
# working directory, so the notebook must be run from a subfolder of the project.
PROJECT_ROOT = Path("..")
DATA_PATH = PROJECT_ROOT.joinpath("data", "processed", "reviews_clean.csv")

# Read the whole file into memory with pandas' default parsing options.
reviews_df = pd.read_csv(DATA_PATH)

# Cell output: (n_rows, n_columns) as a quick sanity check on the load.
reviews_df.shape


In [None]:
# Clean the frame in a single chain:
#   1. drop rows where either the review text or the label is missing,
#   2. coerce the text to str and trim surrounding whitespace,
#   3. discard rows whose text is empty after trimming,
#   4. cast the label column to int.
# Each step returns a new frame, so re-running this cell gives the same result.
reviews_df = (
    reviews_df
    .dropna(subset=["text", "label"])
    .assign(text=lambda frame: frame["text"].astype(str).str.strip())
    .loc[lambda frame: frame["text"].str.len() > 0]
    .assign(label=lambda frame: frame["label"].astype(int))
)

# Cell output: shape after cleaning, to compare against the shape at load time.
reviews_df.shape


In [None]:
# Raw review strings are the model input; the integer labels are the target.
X_text, y = reviews_df["text"], reviews_df["label"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both partitions, and the fixed seed makes the split
# reproducible across runs.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, stratify=y, test_size=0.2, random_state=42
)

# Cell output: sizes of the train and test partitions.
(X_train_text.shape, X_test_text.shape)


In [None]:
# Start from scikit-learn's English stop-word list but keep the negation words
# "not", "no" and "never" as tokens (presumably so negated phrases such as
# "not good" survive as bigrams -- confirm this is the intent).
# sorted() gives the list a deterministic order: a plain list(set) is ordered by
# string hashing, which differs between interpreter runs, so the vectorizer's
# stored `stop_words` parameter would differ between otherwise identical runs.
# The vectorizer treats the list as a set, so the extracted features are unchanged.
custom_stopwords = sorted(ENGLISH_STOP_WORDS - {"not", "no", "never"})

# TF-IDF over lower-cased unigrams and bigrams, with the vocabulary capped at
# 20,000 terms.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words=custom_stopwords,
    max_features=20_000,
    ngram_range=(1, 2)
)

# Learn the vocabulary and IDF weights from the training texts only, then apply
# the fitted transform to the test texts so no test information leaks into fitting.
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Cell output: (n_samples, n_features) for each sparse matrix.
X_train.shape, X_test.shape


In [None]:
# Logistic regression on the TF-IDF features.
#   - class_weight: errors on label 0 are weighted 1.6x relative to label 1
#     (NOTE(review): presumably compensating for class imbalance -- confirm).
#   - max_iter: iteration cap for the solver, raised to 1000.
#   - verbose=1: ask the solver to print progress while fitting.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(
    class_weight={0: 1.6, 1: 1},
    max_iter=1000,
    verbose=1,
).fit(X_train, y_train)

# Cell output: the fitted estimator.
model

In [None]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test)

# Confusion matrix of true labels against predicted labels, and overall accuracy.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("accuracy:", acc)
print("confusion matrix:\n", cm)

In [None]:
# Per-class precision, recall and F1 on the test set, shown to 4 decimal places.
print(classification_report(y_true=y_test, y_pred=y_pred, digits=4))

In [None]:
# Persist the fitted vectorizer and classifier under results/artifacts.
# (joblib is imported in the notebook's import cell at the top.)
ARTIFACTS_DIR = PROJECT_ROOT / "results" / "artifacts"

# Create the directory, including missing parents; do nothing if it already exists.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Save both objects: new text must go through the same fitted TF-IDF transform
# before it is passed to the model.
joblib.dump(tfidf, ARTIFACTS_DIR / "tfidf.joblib")
joblib.dump(model, ARTIFACTS_DIR / "logreg_model.joblib")

# Cell output: where the artifacts were written.
ARTIFACTS_DIR