In [9]:
import pickle
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split



In [10]:
# Read the preprocessed dataset back from its pickle file and report its dimensions.
# NOTE(review): unpickling can execute code embedded in the file, so this path should
# only ever point at a pickle produced by a trusted step of this project.
preprocessed_path = '../data/preprocessed_scam_data.pkl'
df = pd.read_pickle(preprocessed_path)
print('Loaded data shape:', df.shape)

Loaded data shape: (545, 20)


In [11]:
# Load TF-IDF matrix
tfidf_matrix = load_npz('../data/tfidf_matrix.npz')
print('TF-IDF matrix shape:', tfidf_matrix.shape)

# The features come from this matrix while the labels come from `df`; the two are
# stored as separate artifacts, so make sure they describe the same number of rows
# before anything downstream pairs them up.
if tfidf_matrix.shape[0] != len(df):
    raise ValueError(
        f'TF-IDF matrix has {tfidf_matrix.shape[0]} rows but df has {len(df)} rows; '
        'the two artifacts are out of sync.'
    )

TF-IDF matrix shape: (545, 3447)


In [12]:
# Convert text labels to numeric (scam: 1, legit: 0)
label_map = {'scam': 1, 'legit': 0}
labels = df['label'].map(label_map)

# Series.map turns every value that is missing from the mapping into NaN. Stop here
# and show the offending values rather than letting NaN targets reach the split and
# training cells.
unmapped = labels.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected values in df['label']: {df.loc[unmapped, 'label'].unique().tolist()}"
    )

print('Label distribution:', labels.value_counts())

Label distribution: label
1    348
0    197
Name: count, dtype: int64


In [13]:
# Hold out 20% of the rows for evaluation. The split is stratified on the labels and
# uses a fixed seed so that re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [14]:
# Shared evaluation helper: score one set of predictions, print and return the results
def evaluate_model(y_true, y_pred, y_pred_proba=None):
    """Compute classification metrics for a set of predictions, print them, and return them.

    Parameters:
        y_true: ground-truth labels.
        y_pred: predicted labels, compared against ``y_true``.
        y_pred_proba: optional scores handed to ``roc_auc_score``; when it is
            ``None`` the ROC-AUC entry is left out of the result.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1-score', plus
        'ROC-AUC' when ``y_pred_proba`` was supplied. The confusion matrix is only
        printed, not returned.
    """
    # Label/scorer pairs, in the order the entries appear in the returned dict.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-score', f1_score),
    )
    metrics = {label: scorer(y_true, y_pred) for label, scorer in scorers}

    if y_pred_proba is not None:
        metrics['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba)

    cm = confusion_matrix(y_true, y_pred)
    print('Metrics:', metrics)
    print('Confusion Matrix:\n', cm)
    return metrics

In [15]:
print('Training Logistic Regression...')
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows, plus the probability column at
# index 1 (the scam class under the 0/1 encoding), which feeds the ROC-AUC.
lr_pred = lr_model.predict(X_test)
lr_pred_proba = lr_model.predict_proba(X_test)[:, 1]

lr_metrics = evaluate_model(y_test, lr_pred, y_pred_proba=lr_pred_proba)

Training Logistic Regression...
Metrics: {'Accuracy': 0.8256880733944955, 'Precision': 0.7865168539325843, 'Recall': 1.0, 'F1-score': 0.8805031446540881, 'ROC-AUC': 0.9805860805860805}
Confusion Matrix:
 [[20 19]
 [ 0 70]]


In [None]:
# Save model and metrics
model_path = Path('../models/logistic_regression_v1.joblib')
metrics_path = Path('../metrics/logistic_regression_v1.pkl')

# Create the output folders on demand: on a fresh checkout ../models and ../metrics
# may not exist yet, and both joblib.dump and open(..., 'wb') would otherwise fail
# with FileNotFoundError after the model has already been trained.
for out_path in (model_path, metrics_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(lr_model, model_path)
with open(metrics_path, 'wb') as f:
    pickle.dump(lr_metrics, f)

print('Logistic Regression complete. Model and metrics saved.')

Logistic Regression complete. Model and metrics saved.
