In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import xgboost as xgb
import joblib
import pickle
import os

In [2]:
# Load the preprocessed dataset from its pickle file and report its shape.
# NOTE(review): unpickling can execute arbitrary code if the file is not
# trusted; this presumably is an artifact written by this project's own
# preprocessing step — confirm before pointing it at any other file.
df = pd.read_pickle('../data/preprocessed_scam_data.pkl')
print('Loaded data shape:', df.shape)

Loaded data shape: (545, 20)


In [3]:
# Load the TF-IDF feature matrix stored in SciPy sparse .npz format.
# NOTE(review): rows are presumably in the same order as the rows of `df`
# (the label vector is later paired with this matrix row-for-row) — nothing
# here verifies that alignment; confirm against the preprocessing step.
tfidf_matrix = load_npz('../data/tfidf_matrix.npz')
print('TF-IDF matrix shape:', tfidf_matrix.shape)

TF-IDF matrix shape: (545, 3447)


In [4]:
# Convert text labels to numeric (scam: 1, legit: 0)
label_to_id = {'scam': 1, 'legit': 0}
labels = df['label'].map(label_to_id)

# Series.map yields NaN for any value missing from the mapping (including
# missing labels). Fail here with a clear message instead of letting NaN
# targets reach the stratified split or the classifier.
unmapped_mask = labels.isna()
if unmapped_mask.any():
    bad_values = df.loc[unmapped_mask, 'label'].unique().tolist()
    raise ValueError(
        f'Unexpected label values (expected one of {sorted(label_to_id)}): {bad_values}'
    )

print('Label distribution:', labels.value_counts())

Label distribution: label
1    348
0    197
Name: count, dtype: int64


In [5]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# splits keep the same class proportions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [6]:
# Function to evaluate model
def evaluate_model(y_true, y_pred, y_pred_proba=None):
    """Score predictions, print a short report, and return the metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_pred_proba: Optional scores passed to ``roc_auc_score``; when it is
            ``None`` the ROC-AUC entry is left out of the result.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1-score',
        plus 'ROC-AUC' when ``y_pred_proba`` is given.
    """
    # (report name, scorer) pairs; dict insertion order follows this order.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-score', f1_score),
    )
    metrics = {name: scorer(y_true, y_pred) for name, scorer in label_scorers}

    if y_pred_proba is not None:
        metrics['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba)

    # The confusion matrix is only printed; it is not part of the return value.
    cm = confusion_matrix(y_true, y_pred)
    print('Metrics:', metrics)
    print('Confusion Matrix:\n', cm)
    return metrics

In [7]:
# Fit an XGBoost classifier on the training split, then score the test split
print('Training XGBoost...')
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Second column of predict_proba is taken as the score handed to ROC-AUC;
# the hard label predictions feed the remaining metrics.
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = xgb_model.predict(X_test)
xgb_metrics = evaluate_model(y_test, xgb_pred, xgb_pred_proba)

Training XGBoost...
Metrics: {'Accuracy': 0.9357798165137615, 'Precision': 0.9436619718309859, 'Recall': 0.9571428571428572, 'F1-score': 0.950354609929078, 'ROC-AUC': 0.947985347985348}
Confusion Matrix:
 [[35  4]
 [ 3 67]]


In [8]:
# Save model and metrics
model_path = '../models/xgboost_v1.joblib'
metrics_path = '../metrics/xgboost_v1.pkl'

# joblib.dump and open(..., 'wb') do not create parent directories, so make
# sure they exist first; otherwise a fresh checkout fails with
# FileNotFoundError after training has already completed.
for artifact_path in (model_path, metrics_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(xgb_model, model_path)
with open(metrics_path, 'wb') as f:
    pickle.dump(xgb_metrics, f)

print('XGBoost complete. Model and metrics saved.')

XGBoost complete. Model and metrics saved.
