In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dataprep_module import load_and_prepare_data
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Load data from preprocessing module
print("Loading data from dataprep_module...")
x, y, preprocess = load_and_prepare_data()
print(f"Data loaded: {x.shape[0]} samples, {x.shape[1]} features")
print(f"Fraud rate: {y.mean()*100:.2f}%")

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(f"Using {cv.n_splits}-fold stratified cross-validation")

# ADD THIS PART - Preprocess data once and store in memory
print("Preprocessing data once and storing in memory...")
x_processed = preprocess.fit_transform(x)
print(f"Processed data shape: {x_processed.shape}")
print(f"Memory usage: ~{x_processed.nbytes / 1024**3:.2f} GB")
print(f"Data types: {type(x_processed)}")
print(f"Sample values: {x_processed[0][:5]}")  # Show first 5 values

Loading data from dataprep_module...
Data loaded: 20499 samples, 14 features
Fraud rate: 1.91%
Using 5-fold stratified cross-validation
Preprocessing data once and storing in memory...
Processed data shape: (20499, 57933)
Memory usage: ~8.85 GB
Data types: <class 'numpy.ndarray'>
Sample values: [0.51637396 0.31874451 0.         0.         0.        ]


In [22]:
# Model 1: Random Forest

#### TO BE SELECTED
print("="*50)
print("MODEL 1: RANDOM FOREST")
print("="*50)

rf_model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=100,      # Reduced from 500
    max_depth=10,          # Reduced from 15
    min_samples_split=5,  # Increased to reduce complexity
    min_samples_leaf=2,    # Increased to reduce complexity
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,             # Use all CPU cores
    # max_samples=0.5,       # Use 80% of samples per tree (reduces memory)
    # warm_start=False       # Don't reuse previous fit
)

scores_rf = {
    "roc_auc": "roc_auc",
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1"
}

# Cross-validation with preprocessed data (FIXED)
rf_auc_scores = cross_validate(
    rf_model, 
    x_processed , y,           # ← FIXED: Use x_processed, not x
    cv=cv, 
    scoring= scores_rf,
    n_jobs=4
)

for m in scores_rf:
    s = rf_auc_scores[f"test_{m}"]
    print(f"{m.upper():<9}: {s.mean():.3f} (+/- {s.std()*2:.3f})")

#print(f"AUC-ROC: {rf_auc_scores.mean():.3f} (+/- {rf_auc_scores.std()*2:.3f})")
#print(f"Precision: {rf_precision_scores.mean():.3f} (+/- {rf_precision_scores.std()*2:.3f})")
#print(f"Recall: {rf_recall_scores.mean():.3f} (+/- {rf_recall_scores.std()*2:.3f})")
#print(f"F1-Score: {rf_f1_scores.mean():.3f} (+/- {rf_f1_scores.std()*2:.3f})")
# print(f"Individual AUC scores: {rf_auc_scores}")

MODEL 1: RANDOM FOREST
ROC_AUC  : 0.683 (+/- 0.047)
ACCURACY : 0.931 (+/- 0.061)
PRECISION: 0.042 (+/- 0.028)
RECALL   : 0.110 (+/- 0.092)
F1       : 0.058 (+/- 0.036)


In [23]:
# Model 2: Decision Tree
# TO TAKE

print("="*50)
print("MODEL 2: DECISION TREE")
print("="*50)

dt_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", DecisionTreeClassifier(
        random_state=42,
        class_weight='balanced',
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2
    ))
])

scorers_dt = {
    "roc_auc": "roc_auc",
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1"
}

# Cross-validation scores
dt_auc_scores = cross_validate(
    dt_model, 
    x, y, 
    cv=cv, 
    scoring=scorers_dt)


for m in scorers_dt:
    s = dt_auc_scores[f"test_{m}"]
    print(f"{m.upper():<9}: {s.mean():.3f} (+/- {s.std()*2:.3f})")


MODEL 2: DECISION TREE


ROC_AUC  : 0.640 (+/- 0.031)
ACCURACY : 0.513 (+/- 0.128)
PRECISION: 0.028 (+/- 0.002)
RECALL   : 0.721 (+/- 0.182)
F1       : 0.054 (+/- 0.003)


In [25]:
print("="*50)
print("MODEL 3: HIST-GRADIENT BOOSTING (EARLY STOP)")
print("="*50)

hgb_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", HistGradientBoostingClassifier(
        random_state=42,
        max_iter=300,            # like n_estimators
        learning_rate=0.08,
        max_leaf_nodes=31,
        early_stopping=True,     # <- built-in early stopping
        validation_fraction=0.1,
        n_iter_no_change=20
    ))
])

scorers_hgb = {
    "roc_auc": "roc_auc",
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1"
}

hgb_cv = cross_validate(
    hgb_model, 
    x, y, 
    cv=cv, 
    scoring=scorers_hgb, 
    n_jobs=-1)

for m in scorers_hgb:
    s = hgb_cv[f"test_{m}"]
    print(f"{m.upper():<9}: {s.mean():.3f} (+/- {s.std()*2:.3f})")

MODEL 3: HIST-GRADIENT BOOSTING (EARLY STOP)


  _warn_prf(average, modifier, msg_start, len(result))


ROC_AUC  : 0.673 (+/- 0.047)
ACCURACY : 0.981 (+/- 0.000)
PRECISION: 0.750 (+/- 0.775)
RECALL   : 0.018 (+/- 0.026)
F1       : 0.035 (+/- 0.050)


In [26]:
print("="*50)
print("MODEL 4: EXTRA TREES (PARALLEL & FAST)")
print("="*50)

et_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", ExtraTreesClassifier(
        random_state=42,
        n_estimators=400,
        max_depth=None,          # let trees grow; try 20 if overfitting
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        n_jobs=-1,               # parallel
        bootstrap=False,         # typical for ExtraTrees
        max_features="sqrt"
    ))
])

scores_et = {
    "roc_auc": "roc_auc",
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1"
}

et_cv = cross_validate(
    et_model, 
    x, y, 
    cv=cv, 
    scoring=scores_et,
    n_jobs=-1)
    
for m in scores_et:
    s = et_cv[f"test_{m}"]
    print(f"{m.upper():<9}: {s.mean():.3f} (+/- {s.std()*2:.3f})")


MODEL 4: EXTRA TREES (PARALLEL & FAST)
ROC_AUC  : 0.699 (+/- 0.074)
ACCURACY : 0.940 (+/- 0.015)
PRECISION: 0.066 (+/- 0.039)
RECALL   : 0.155 (+/- 0.053)
F1       : 0.092 (+/- 0.046)


In [None]:
print("="*50)
print("MODEL 5: XGBOOST (FAST)")
print("="*50)

xgb_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", XGBClassifier(
        random_state=42,
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",   # fast CPU histogram algorithm
        n_jobs=-1,
        reg_lambda=1.0,
        eval_metric="logloss"
    ))
])

scores_xgb = {
    "roc_auc":"roc_auc",
    "accuracy":"accuracy",
    "precision":"precision",
    "recall":"recall",
    "f1":"f1"
    }

xgb_cv = cross_validate(
    xgb_model, x, y, 
    cv=cv, 
    scoring=scores_xgb, 
    n_jobs=-1)

In [4]:
for m in scores_xgb:
    s = xgb_cv[f"test_{m}"] 
    print(f"{m.upper():<9}: {s.mean():.3f} (+/- {s.std()*2:.3f})")


ROC_AUC  : 0.635 (+/- 0.032)
ACCURACY : 0.981 (+/- 0.000)
PRECISION: 0.783 (+/- 0.389)
RECALL   : 0.023 (+/- 0.019)
F1       : 0.044 (+/- 0.036)


In [None]:
# Extra Trees Model - Train/Test Split (80/20) using existing model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

print("="*50)
print("EXTRA TREES - TRAIN/TEST SPLIT (80/20)")
print("="*50)

# Split data into train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training fraud rate: {y_train.mean()*100:.2f}%")
print(f"Test fraud rate: {y_test.mean()*100:.2f}%")

# Train the Extra Trees model on training data
print("\nTraining Extra Trees model on training data...")
et_model.fit(X_train, y_train)
print("Model training completed!")

# Make predictions on test data
y_pred = et_model.predict(X_test)
y_pred_proba = et_model.predict_proba(X_test)[:, 1]

# Create final_scores_best_model dictionary with all metrics
final_scores_best_model = {
    "roc_auc": roc_auc_score(y_test, y_pred_proba),
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred)
}

print(f"\nROC-AUC Score: {final_scores_best_model['roc_auc']:.3f}")

print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
