In [1]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost




In [2]:
#Importing all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier # Corrected: Added explicit import for XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)

import warnings
warnings.filterwarnings('ignore')


In [3]:
# Build a synthetic, imbalanced binary-classification dataset
# (class weights 0.9 / 0.1) to stand in for real fraud data.
from sklearn.datasets import make_classification

raw_features, raw_labels = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=10,
    n_redundant=3,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=42,
)

# Wrap the raw arrays in labelled pandas objects so the data resembles a
# real-world table: columns Feature_0 .. Feature_19 and an "isFraud" target.
feature_names = [f"Feature_{idx}" for idx in range(raw_features.shape[1])]
X = pd.DataFrame(raw_features, columns=feature_names)
y = pd.Series(raw_labels, name="isFraud")


In [4]:
# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# class proportions the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [5]:
# Candidate classifiers to compare, keyed by the display name that ends up in
# the "Model" column of the results table. Estimators that accept a
# `random_state` are seeded with 42 so repeated runs give the same fits.
#
# `use_label_encoder` is deliberately not passed to XGBClassifier: the option
# is deprecated in XGBoost, and current releases ignore it with a
# "parameters not used" warning. The labels here are already 0/1.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42),
    # probability=True makes SVC expose predict_proba, which the ROC AUC step needs
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier()
}


In [6]:
# Fit every candidate model once and score it on the held-out test split.
trained_models = {}      # display name -> fitted estimator
evaluation_results = []  # one metrics record (dict) per model

for name in models:
    estimator = models[name]

    print(f"Training {name}...")
    estimator.fit(X_train, y_train)
    trained_models[name] = estimator

    # Hard class labels feed the threshold-based metrics; the positive-class
    # probability (column 1 of predict_proba) feeds ROC AUC.
    predicted_labels = estimator.predict(X_test)
    positive_scores = estimator.predict_proba(X_test)[:, 1]

    # Cross-validation is intentionally left out of this pass; a
    # cross_val_score(..., scoring='roc_auc') entry could be added to the record.
    evaluation_results.append(
        {
            "Model": name,
            "Accuracy": accuracy_score(y_test, predicted_labels),
            "Precision": precision_score(y_test, predicted_labels),
            "Recall": recall_score(y_test, predicted_labels),
            "F1 Score": f1_score(y_test, predicted_labels),
            "ROC AUC": roc_auc_score(y_test, positive_scores),
        }
    )
    print(f"Finished {name}.\n")

Training Random Forest...
Finished Random Forest.

Training XGBoost...
Finished XGBoost.

Training SVM...
Finished SVM.

Training Logistic Regression...
Finished Logistic Regression.

Training KNN...
Finished KNN.



In [7]:
# Tabulate the per-model metrics collected by the training loop, rounded to
# four decimals. `evaluation_results` is reused as-is; nothing is recomputed.
metrics_table = pd.DataFrame(evaluation_results)
results_df = metrics_table.round(4)

# Rank by test-set ROC AUC (best model first) and renumber the rows from 0.
ranked_by_auc = results_df.sort_values(by="ROC AUC", ascending=False)
results_df_fast = ranked_by_auc.reset_index(drop=True)
results_df_fast

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,SVM,0.971,1.0,0.7184,0.8362,0.9775
1,XGBoost,0.97,1.0,0.7087,0.8295,0.9712
2,Random Forest,0.962,1.0,0.6311,0.7738,0.9627
3,KNN,0.97,1.0,0.7087,0.8295,0.9211
4,Logistic Regression,0.946,0.8551,0.5728,0.686,0.8878


In [8]:
# Initialize models (from Mehak, Harshita, Prateeksha)
# Fresh, unfitted estimators keyed by display name; seeded with 42 where the
# estimator accepts a `random_state`.
# NOTE(review): the cell that follows defines `models` again with identical
# contents, so one of the two cells is redundant.
#
# `use_label_encoder` is deliberately not passed to XGBClassifier: the option
# is deprecated in XGBoost, and current releases ignore it with a
# "parameters not used" warning.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42),
    # probability=True makes SVC expose predict_proba, which the ROC AUC step needs
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier()
}


In [9]:
# Initialize models (from Mehak, Harshita, Prateeksha)
# Fresh, unfitted estimators keyed by display name; seeded with 42 where the
# estimator accepts a `random_state`.
# NOTE(review): this repeats the preceding model-initialization cell verbatim;
# only one of the two is needed before the training cell below.
#
# `use_label_encoder` is deliberately not passed to XGBClassifier: the option
# is deprecated in XGBoost, and current releases ignore it with a
# "parameters not used" warning.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42),
    # probability=True makes SVC expose predict_proba, which the ROC AUC step needs
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier()
}


In [10]:
# Stage 1: fit every candidate on the training split.
trained_models = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    trained_models[label] = estimator

# Stage 2: score each fitted model on the held-out test split and add a
# 3-fold cross-validated ROC AUC computed on the training split only.
evaluation_results = []

for label, estimator in trained_models.items():
    test_labels = estimator.predict(X_test)
    test_scores = estimator.predict_proba(X_test)[:, 1]  # positive-class probability

    # One ROC AUC value per fold; the mean is what gets reported.
    cv_auc_per_fold = cross_val_score(estimator, X_train, y_train, cv=3, scoring='roc_auc')

    evaluation_results.append(
        {
            "Model": label,
            "Accuracy": accuracy_score(y_test, test_labels),
            "Precision": precision_score(y_test, test_labels),
            "Recall": recall_score(y_test, test_labels),
            "F1 Score": f1_score(y_test, test_labels),
            "ROC AUC": roc_auc_score(y_test, test_scores),
            "CV ROC AUC": cv_auc_per_fold.mean(),
        }
    )


In [11]:
# Build the dashboard table in one pass: round the metrics to four decimals,
# rank by test-set ROC AUC (best first) and renumber the rows from 0.
results_df = (
    pd.DataFrame(evaluation_results)
    .round(4)
    .sort_values(by="ROC AUC", ascending=False)
    .reset_index(drop=True)
)

results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,CV ROC AUC
0,SVM,0.971,1.0,0.7184,0.8362,0.9775,0.9671
1,XGBoost,0.97,1.0,0.7087,0.8295,0.9712,0.9638
2,Random Forest,0.962,1.0,0.6311,0.7738,0.9627,0.962
3,KNN,0.97,1.0,0.7087,0.8295,0.9211,0.9322
4,Logistic Regression,0.946,0.8551,0.5728,0.686,0.8878,0.895


In [12]:
# Quick re-evaluation of the already-fitted models on the test split only;
# the slower cross-validation column is skipped here.

# Metrics computed from hard class predictions, in the column order of the table.
label_based_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

evaluation_results_fast = []

for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # positive-class probability for ROC AUC

    metrics = {"Model": name}
    metrics.update(
        {column: scorer(y_test, y_pred) for column, scorer in label_based_metrics.items()}
    )
    metrics["ROC AUC"] = roc_auc_score(y_test, y_prob)
    evaluation_results_fast.append(metrics)

# Tabulate, round to four decimals, rank by ROC AUC (best first), renumber rows.
results_df_fast = (
    pd.DataFrame(evaluation_results_fast)
    .round(4)
    .sort_values(by="ROC AUC", ascending=False)
    .reset_index(drop=True)
)

results_df_fast

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,SVM,0.971,1.0,0.7184,0.8362,0.9775
1,XGBoost,0.97,1.0,0.7087,0.8295,0.9712
2,Random Forest,0.962,1.0,0.6311,0.7738,0.9627
3,KNN,0.97,1.0,0.7087,0.8295,0.9211
4,Logistic Regression,0.946,0.8551,0.5728,0.686,0.8878


In [13]:
# Persist every fitted model and the held-out test split as pickle files in
# the current working directory.

import joblib
import pandas as pd  # harmless re-import; pandas is already loaded by the imports cell

print("--- Saving Trained Models and Test Data ---")

# Guard against running this cell before the training cells have defined
# trained_models, X_test and y_test.
if 'trained_models' in locals() and 'X_test' in locals() and 'y_test' in locals():
    # One pickle per fitted model
    for name, model in trained_models.items():
        # Spaces and slashes in the display name are replaced with underscores
        # so the name is safe to use as a filename.
        filename = f'trained_model_{name.replace(" ", "_").replace("/", "_")}.pkl'
        joblib.dump(model, filename)
        print(f"Saved {name} model to {filename}")

    # Save the test features and labels
    X_test.to_pickle('X_test.pkl')
    y_test.to_pickle('y_test.pkl')
    print("Saved X_test to X_test.pkl")
    print("Saved y_test to y_test.pkl")

    # Report completion only when the files were actually written; previously
    # this banner was also printed after the error message below.
    print("--- Saving Complete ---")
else:
    print("Error: 'trained_models', 'X_test', or 'y_test' not found. Please run previous cells.")

--- Saving Trained Models and Test Data ---
Saved Random Forest model to trained_model_Random_Forest.pkl
Saved XGBoost model to trained_model_XGBoost.pkl
Saved SVM model to trained_model_SVM.pkl
Saved Logistic Regression model to trained_model_Logistic_Regression.pkl
Saved KNN model to trained_model_KNN.pkl
Saved X_test to X_test.pkl
Saved y_test to y_test.pkl
--- Saving Complete ---
