In [3]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow

# Append parent directory to system path
sys.path.append('..')
from src.traditional_models import (
    train_and_evaluate_model, 
    get_traditional_models,
    plot_model_comparison,
    save_best_model
)

In [4]:
# Seed NumPy's global (legacy) random generator so stochastic steps that
# draw from it give the same numbers on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

In [5]:
# Read the pre-processed train/test splits from disk and preview the first
# rows of each so the schema is visible in the notebook.
train_csv_path = '../data/processed/train_data.csv'
test_csv_path = '../data/processed/test_data.csv'

print("Loading prepared datasets...")

train_data = pd.read_csv(train_csv_path)
display(train_data.head())

test_data = pd.read_csv(test_csv_path)
display(test_data.head())

Loading prepared datasets...


Unnamed: 0,user_id,purchase_value,age,ip_address,time_diff,ip_int,hour_of_day,day_of_week,is_weekend,month,...,user_std_purchase,user_max_purchase,user_min_purchase,user_time_range_days,tx_velocity,source_encoded,browser_encoded,sex_encoded,country_encoded,class
0,306367,0.167258,0.331793,719563900.0,132.950833,719563902,5,3,0,7,...,,0.167258,0.167258,0.0,0.0,0,0,0,36,0
1,349918,-0.651398,-0.248408,3796264000.0,2474.670278,3796264016,16,1,0,6,...,,-0.651398,-0.651398,0.0,0.0,1,2,1,181,0
2,252150,0.549298,0.911994,4106204000.0,2214.139444,4106203903,9,1,0,4,...,,0.549298,0.549298,0.0,0.0,0,2,1,181,0
3,127539,2.677805,1.028034,613943000.0,852.813611,613943026,20,3,0,2,...,,2.677805,2.677805,0.0,0.0,1,1,1,36,0
4,257348,0.167258,1.028034,176356500.0,210.768889,176356458,12,0,0,7,...,,0.167258,0.167258,0.0,0.0,2,4,1,181,0


Unnamed: 0,user_id,purchase_value,age,ip_address,time_diff,ip_int,hour_of_day,day_of_week,is_weekend,month,...,user_std_purchase,user_max_purchase,user_min_purchase,user_time_range_days,tx_velocity,source_encoded,browser_encoded,sex_encoded,country_encoded,class
0,171751,0.385567,0.563874,3961405000.0,2682.805278,3961404649,10,2,0,9,...,,0.385567,0.385567,0.0,0.0,0,3,1,181,0
1,43967,0.494721,0.447833,3150281000.0,2839.298056,3150281030,5,4,0,9,...,,0.494721,0.494721,0.0,0.0,2,3,1,107,0
2,218957,-0.269358,0.331793,20351150.0,1428.988333,20351151,18,1,0,7,...,,-0.269358,-0.269358,0.0,0.0,0,0,0,176,1
3,377211,-0.651398,-0.944649,3568979000.0,1747.151389,3568979293,11,0,0,5,...,,-0.651398,-0.651398,0.0,0.0,2,1,1,8,0
4,225557,0.658452,1.028034,497185100.0,2717.191111,497185119,11,1,0,9,...,,0.658452,0.658452,0.0,0.0,1,0,1,171,0


In [6]:
# Split each dataset into a feature matrix (X_*) and a label vector (y_*).
# The label is stored in the 'class' column; everything else is a feature.
# NOTE(review): identifier-like columns (user_id, ip_address, ip_int) stay in
# the feature matrix - confirm that is intended for modelling.
y_train = train_data['class']
X_train = train_data.drop(columns='class')

y_test = test_data['class']
X_test = test_data.drop(columns='class')

In [7]:
# Handle missing values, step 1: restrict both splits to numeric features.
# The column list is derived from the training split only and then applied
# unchanged to the test split so both frames share the same schema.
print("\nHandling missing values...")
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
X_train = X_train.loc[:, numeric_columns]
X_test = X_test.loc[:, numeric_columns]


Handling missing values...


In [8]:
# Report which feature columns are about to go through imputation
columns_before = X_train.columns.tolist()
print("Columns before imputation:", columns_before)
print("Number of columns before imputation:", len(columns_before))


Columns before imputation: ['user_id', 'purchase_value', 'age', 'ip_address', 'time_diff', 'ip_int', 'hour_of_day', 'day_of_week', 'is_weekend', 'month', 'time_since_signup', 'user_tx_count', 'user_avg_purchase', 'user_std_purchase', 'user_max_purchase', 'user_min_purchase', 'user_time_range_days', 'tx_velocity', 'source_encoded', 'browser_encoded', 'sex_encoded', 'country_encoded']
Number of columns before imputation: 22


In [9]:
# Drop feature columns that carry no information before imputation.
#
# `user_std_purchase` is all NaN in this dataset, so it is always removed.
# Any other column that is entirely NaN in the training split is removed as
# well: SimpleImputer(strategy='mean') discards such columns from its output,
# which would leave the imputed array narrower than `numeric_columns` and
# break the DataFrame reconstruction that follows.
#
# `errors='ignore'` makes the cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
KNOWN_EMPTY_COLUMNS = ['user_std_purchase']
all_nan_columns = X_train.columns[X_train.isna().all()].tolist()
# dict.fromkeys de-duplicates while keeping a stable order
columns_to_drop = list(dict.fromkeys(KNOWN_EMPTY_COLUMNS + all_nan_columns))

X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')
numeric_columns = X_train.columns  # Update numeric_columns after dropping


In [10]:
# NOTE(review): notebook convention is to keep imports in the top cell;
# this one is left in place so the cell stays self-contained.
from sklearn.impute import SimpleImputer

# Fill remaining gaps with the per-column mean. The means are learned from
# the training split only and then reused for the test split, so no test-set
# statistics leak into the fitted imputer.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [11]:
# Wrap the imputer outputs back into DataFrames, restoring the feature names
# (numeric_columns) and each split's original row index.
X_train = pd.DataFrame(data=X_train_imputed, index=X_train.index, columns=numeric_columns)
X_test = pd.DataFrame(data=X_test_imputed, index=X_test.index, columns=numeric_columns)

In [12]:
# Show the final feature set that is handed to the models
feature_names = list(X_train.columns)
print(f"\nFeatures being used for modeling: {feature_names}")
print(f"Number of features: {len(feature_names)}")



Features being used for modeling: ['user_id', 'purchase_value', 'age', 'ip_address', 'time_diff', 'ip_int', 'hour_of_day', 'day_of_week', 'is_weekend', 'month', 'time_since_signup', 'user_tx_count', 'user_avg_purchase', 'user_max_purchase', 'user_min_purchase', 'user_time_range_days', 'tx_velocity', 'source_encoded', 'browser_encoded', 'sex_encoded', 'country_encoded']
Number of features: 21


In [13]:
# Select the MLflow experiment used in this notebook
experiment_name = "fraud_detection_models"
mlflow.set_experiment(experiment_name)

# Fetch the candidate estimators from src.traditional_models; the result is
# iterated below as a name -> model mapping.
models = get_traditional_models()

In [14]:
# Fit and score every candidate model, collecting the fitted estimator and
# its AUC under the model's display name.
results = {}
for model_name, estimator in models.items():
    fitted_model, roc_auc = train_and_evaluate_model(
        estimator, model_name, X_train, X_test, y_train, y_test
    )
    results[model_name] = dict(model=fitted_model, auc_score=roc_auc)


Training Logistic Regression...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression Results:

Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223


Confusion Matrix:
[[27393     0]
 [ 2830     0]]

ROC AUC Score: 0.7385





Training Decision Tree...

Decision Tree Results:

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27393
           1       0.50      0.56      0.53      2830

    accuracy                           0.91     30223
   macro avg       0.73      0.75      0.74     30223
weighted avg       0.91      0.91      0.91     30223


Confusion Matrix:
[[25791  1602]
 [ 1248  1582]]

ROC AUC Score: 0.7503





Training Random Forest...

Random Forest Results:

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       1.00      0.53      0.69      2830

    accuracy                           0.96     30223
   macro avg       0.98      0.76      0.83     30223
weighted avg       0.96      0.96      0.95     30223


Confusion Matrix:
[[27393     0]
 [ 1339  1491]]

ROC AUC Score: 0.7598





Training Gradient Boosting...

Gradient Boosting Results:

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       1.00      0.53      0.69      2830

    accuracy                           0.96     30223
   macro avg       0.98      0.76      0.83     30223
weighted avg       0.96      0.96      0.95     30223


Confusion Matrix:
[[27392     1]
 [ 1339  1491]]

ROC AUC Score: 0.7710




In [18]:
# Side-by-side AUC summary, one line per trained model
print("\nModel Performance Comparison:")
for model_name, entry in results.items():
    auc = entry['auc_score']
    print(f"{model_name}: AUC = {auc:.4f}")



Model Performance Comparison:
Logistic Regression: AUC = 0.7385
Decision Tree: AUC = 0.7503
Random Forest: AUC = 0.7598
Gradient Boosting: AUC = 0.7710


In [16]:
# Visual comparison of the collected results
plot_model_comparison(results)

# Persist the winner; the helper hands back the winning model's name and AUC
best_model_name, best_auc = save_best_model(results)

# NOTE(review): the filename in the last message is assembled here rather
# than returned by save_best_model - confirm it matches the path the helper
# actually writes.
summary_lines = (
    f"\nBest performing model: {best_model_name}",
    f"Best AUC score: {best_auc:.4f}",
    f"\nBest model saved as: best_model_{best_model_name}.joblib",
)
for line in summary_lines:
    print(line)


Best performing model: Gradient Boosting
Best AUC score: 0.7710

Best model saved as: best_model_Gradient Boosting.joblib
