In [3]:
import sys
from pathlib import Path

# Make the directory above the notebook's working directory importable so that
# the `src.*` imports below resolve. The path is resolved to an absolute
# location once, so a later change of working directory cannot silently alter
# what ".." points at, and the membership check keeps re-running this cell from
# stacking duplicate entries on sys.path.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [4]:
import pandas as pd
import joblib

from src.train_model import (
    load_cleaned_data,
    split_data
)

from src.model_training import (
    train_logistic_regression,
    train_naive_bayes,
    evaluate_model,
    save_model
)

In [5]:
# Read the already-cleaned dataset from disk and report its shape as a sanity check.
CLEAN_DATA_PATH = "../data/processed/reddit_clean.csv"
df = load_cleaned_data(CLEAN_DATA_PATH)
print("Dataset loaded:", df.shape)

Dataset loaded: (2327087, 10)


In [6]:
# Partition the loaded data into train/test inputs and labels; the split
# logic itself lives in src.train_model.split_data.
X_train, X_test, y_train, y_test = split_data(df)

# Report how many samples landed in each partition.
for label, part in (("Train size:", X_train), ("Test size:", X_test)):
    print(label, len(part))

Train size: 1861669
Test size: 465418


In [8]:
# Load a previously saved TF-IDF vectorizer rather than fitting one in this notebook.
# joblib.load unpickles the file, so only point it at artifacts you trust.
# NOTE(review): confirm this vectorizer was fitted on the training split only,
# otherwise the test metrics below may be optimistic.
VECTORIZER_PATH = "../models/tfidf_vectorizer.pkl"
vectorizer = joblib.load(VECTORIZER_PATH)
print("TF-IDF vectorizer loaded")

TF-IDF vectorizer loaded


In [9]:
# Apply the loaded vectorizer to both partitions (transform only, no fitting
# happens here), train first and then test.
X_train_tfidf, X_test_tfidf = [
    vectorizer.transform(texts) for texts in (X_train, X_test)
]

# Show the resulting matrix shapes for both partitions.
print("TF-IDF shapes:")
print(X_train_tfidf.shape, X_test_tfidf.shape)

TF-IDF shapes:
(1861669, 10000) (465418, 10000)


In [10]:
# Announce the step first: fitting on the full training matrix can take a while
# and the helper's own output (if any) is not visible in this notebook.
print("Training Logistic Regression...")
# Fit on the TF-IDF training features and labels; hyperparameters are chosen
# inside src.model_training.train_logistic_regression, not here.
log_reg = train_logistic_regression(X_train_tfidf, y_train)

Training Logistic Regression...


In [11]:
# Score the fitted logistic-regression model on the held-out TF-IDF test matrix.
print("Evaluating Logistic Regression...")
# evaluate_model prints a per-class precision/recall/F1 report and a confusion
# matrix (see the cell output). Its return value is kept in lr_results; the
# structure of that value is not shown here -- TODO confirm before relying on it.
lr_results = evaluate_model(log_reg, X_test_tfidf, y_test)

Evaluating Logistic Regression...
              precision    recall  f1-score   support

           0       0.97      0.92      0.95    370184
           1       0.75      0.90      0.82     95234

    accuracy                           0.92    465418
   macro avg       0.86      0.91      0.88    465418
weighted avg       0.93      0.92      0.92    465418

Confusion Matrix:
[[341902  28282]
 [  9636  85598]]


In [13]:
# Persist the fitted logistic-regression model alongside the other model artifacts.
LOG_REG_MODEL_PATH = "../models/logistic_regression.pkl"
save_model(log_reg, LOG_REG_MODEL_PATH)
print("Logistic Regression saved")

Logistic Regression saved


In [14]:
# Announce the step before the (potentially slow) fit starts.
print("Training Naive Bayes...")
# Fit on the same TF-IDF training features and labels used for logistic
# regression, so the two models are compared on identical inputs.
nb_model = train_naive_bayes(X_train_tfidf, y_train)

Training Naive Bayes...


In [15]:
# Score the fitted Naive Bayes model on the same held-out TF-IDF test matrix
# that was used for logistic regression.
print("Evaluating Naive Bayes...")
# evaluate_model prints a per-class precision/recall/F1 report and a confusion
# matrix (see the cell output). Its return value is kept in nb_results; the
# structure of that value is not shown here -- TODO confirm before relying on it.
nb_results = evaluate_model(nb_model, X_test_tfidf, y_test)

Evaluating Naive Bayes...
              precision    recall  f1-score   support

           0       0.93      0.97      0.95    370184
           1       0.86      0.70      0.77     95234

    accuracy                           0.92    465418
   macro avg       0.90      0.84      0.86    465418
weighted avg       0.91      0.92      0.91    465418

Confusion Matrix:
[[359684  10500]
 [ 28550  66684]]


In [16]:
# Persist the fitted Naive Bayes model alongside the other model artifacts.
NAIVE_BAYES_MODEL_PATH = "../models/naive_bayes.pkl"
save_model(nb_model, NAIVE_BAYES_MODEL_PATH)
print("Naive Bayes saved")

Naive Bayes saved
