In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the processed email dataset from CSV.
df = pd.read_csv("processed_emails.csv")

# read_csv parses empty cells as NaN, and a bare `astype(str)` would turn those
# into the literal string "nan" -- a spurious token the TF-IDF vectorizer would
# then learn from. Replace missing text with an empty string before casting.
X = df["text_clean"].fillna("").astype(str)
# Labels are cast to int; the cast raises if a label is missing or non-numeric,
# which surfaces bad rows here instead of inside the model.
y = df["label"].astype(int)

In [3]:
# Hold out 20% of the rows for evaluation. Stratifying on the labels keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

In [4]:
# Text features: TF-IDF over unigrams and bigrams, with English stop words
# removed and terms that occur in more than 95% of documents ignored.
tfidf_step = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_df=0.95,
)

# Classifier: logistic regression with class weights inversely proportional to
# class frequency, and a raised iteration cap for the solver.
clf_step = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
)

# Chain both steps so raw text goes in and predictions come out.
model = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

In [5]:
# Fit the vectorizer and the classifier on the training partition only, so no
# vocabulary or IDF statistics come from the held-out rows.
model.fit(X=X_train, y=y_train)

In [6]:
# Predict labels for the held-out rows using the fitted pipeline.
y_pred = model.predict(X=X_test)

In [7]:
# Score the held-out predictions: per-class precision/recall/F1 (4 decimals),
# then the confusion matrix (rows = true label, columns = predicted label).
report_text = classification_report(y_test, y_pred, digits=4)
confusion_counts = confusion_matrix(y_test, y_pred)

print("\n✅ Classification Report:")
print(report_text)

print("✅ Confusion Matrix:")
print(confusion_counts)


✅ Classification Report:
              precision    recall  f1-score   support

           0     0.9988    0.9874    0.9931       872
           1     0.9613    0.9964    0.9785       274

    accuracy                         0.9895      1146
   macro avg     0.9801    0.9919    0.9858      1146
weighted avg     0.9899    0.9895    0.9896      1146

✅ Confusion Matrix:
[[861  11]
 [  1 273]]
