## 1. Import Libraries

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib

## 2. Load Cleaned Data

In [2]:
DATA_PATH = "F:\\Projects\\InfosysVirtualInternship-BFSI\\backend\\data\\processed\\transactions_clean.csv"
df = pd.read_csv(DATA_PATH)
print("Dataset Loaded")
print(f"Rows: {len(df)}, Columns: {df.shape[1]}")
df.head()

Dataset Loaded
Rows: 5000, Columns: 17


Unnamed: 0,transaction_id,customer_id,account_age_days,transaction_amount,timestamp,is_fraud,hour,weekday,month,is_high_value,transaction_amount_log,channel_Atm,channel_Mobile,channel_Pos,channel_Web,kyc_verified_No,kyc_verified_Yes
0,TXN_200000,CUST_799,1050,256369.0,2025-08-12 02:10:24,0,2,1,8,1,12.454377,0.0,1.0,0.0,0.0,0.0,1.0
1,TXN_200001,CUST_484,295,6581.0,2025-08-25 01:14:31,0,1,0,8,0,8.792094,0.0,1.0,0.0,0.0,0.0,1.0
2,TXN_200002,CUST_791,2083,4492.0,2025-08-17 12:12:40,0,12,6,8,0,8.410276,0.0,1.0,0.0,0.0,0.0,1.0
3,TXN_200003,CUST_664,2789,275413.0,2025-08-07 06:23:54,0,6,3,8,1,12.526031,0.0,0.0,1.0,0.0,0.0,1.0
4,TXN_200004,CUST_157,694,98098.0,2025-08-20 21:55:54,0,21,2,8,1,11.493732,0.0,0.0,1.0,0.0,0.0,1.0


## 3. Prepare Features & Target

In [3]:
drop_cols = ["transaction_id", "customer_id", "timestamp"]
X = df.drop(columns=[col for col in drop_cols if col in df.columns] + ["is_fraud"], errors="ignore")
y = df["is_fraud"]

print(f"Feature matrix shape: {X.shape}")
print(f"Target imbalance: {y.value_counts(normalize=True).to_dict()}")

Feature matrix shape: (5000, 13)
Target imbalance: {0: 0.9136, 1: 0.0864}


## 4. Split into Train / Validation / Test

In [4]:
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1765, random_state=42, stratify=y_train_full
)

print(f"Split sizes -> Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Split sizes -> Train: 3499, Validation: 751, Test: 750


## 5. Save Split Data

In [5]:
os.makedirs("data/processed", exist_ok=True)
for name, data_x, data_y, fname in [
    ("Train", X_train, y_train, "train.csv"),
    ("Validation", X_val, y_val, "validation.csv"),
    ("Test", X_test, y_test, "test.csv"),
]:
    df_split = data_x.copy()
    df_split["is_fraud"] = data_y.values
    df_split.to_csv(f"data/processed/{fname}", index=False)
    print(f"Saved {name} -> data/processed/{fname}")

Saved Train -> data/processed/train.csv
Saved Validation -> data/processed/validation.csv
Saved Test -> data/processed/test.csv


## 6. Initialize Models

In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=200, solver="liblinear"),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        random_state=42,
        eval_metric="logloss",
        use_label_encoder=False
    )
}

## 7. Train, Validate, and Test Each Model

In [7]:
results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Validation performance
    y_val_pred = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]
    val_report = classification_report(y_val, y_val_pred, output_dict=True)
    val_auc = roc_auc_score(y_val, y_val_proba)

    # Test performance
    y_test_pred = model.predict(X_test)
    y_test_proba = model.predict_proba(X_test)[:, 1]
    test_report = classification_report(y_test, y_test_pred, output_dict=True)
    test_auc = roc_auc_score(y_test, y_test_proba)

    # Log results
    results.append({
        "Model": name,
        "Val_Accuracy": val_report["accuracy"],
        "Val_Precision": val_report["1"]["precision"],
        "Val_Recall": val_report["1"]["recall"],
        "Val_F1": val_report["1"]["f1-score"],
        "Val_ROC-AUC": val_auc,
        "Test_Accuracy": test_report["accuracy"],
        "Test_Precision": test_report["1"]["precision"],
        "Test_Recall": test_report["1"]["recall"],
        "Test_F1": test_report["1"]["f1-score"],
        "Test_ROC-AUC": test_auc
    })

    print(f"\n{name} Validation Results:")
    print(classification_report(y_val, y_val_pred))
    print(f"ROC-AUC (Validation): {val_auc:.4f}")

    print(f"\n{name} Test Results:")
    print(classification_report(y_test, y_test_pred))
    print(f"ROC-AUC (Test): {test_auc:.4f}")
    print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_test_pred))


Training Logistic Regression...

Logistic Regression Validation Results:
              precision    recall  f1-score   support

           0       0.91      1.00      0.96       686
           1       1.00      0.02      0.03        65

    accuracy                           0.91       751
   macro avg       0.96      0.51      0.49       751
weighted avg       0.92      0.91      0.88       751

ROC-AUC (Validation): 0.7101

Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       685
           1       0.00      0.00      0.00        65

    accuracy                           0.91       750
   macro avg       0.46      0.50      0.48       750
weighted avg       0.83      0.91      0.87       750

ROC-AUC (Test): 0.7832
Confusion Matrix (Test):
 [[683   2]
 [ 65   0]]

Training Random Forest...

Random Forest Validation Results:
              precision    recall  f1-score   support

           0       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Validation Results:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       686
           1       0.00      0.00      0.00        65

    accuracy                           0.90       751
   macro avg       0.46      0.49      0.47       751
weighted avg       0.83      0.90      0.87       751

ROC-AUC (Validation): 0.7088

XGBoost Test Results:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       685
           1       0.80      0.06      0.11        65

    accuracy                           0.92       750
   macro avg       0.86      0.53      0.54       750
weighted avg       0.91      0.92      0.88       750

ROC-AUC (Test): 0.7327
Confusion Matrix (Test):
 [[684   1]
 [ 61   4]]


## 8. Compare Models

In [8]:
results_df = pd.DataFrame(results).sort_values(by="Val_ROC-AUC", ascending=False)
print("\nModel Comparison (Validation + Test):")
results_df


Model Comparison (Validation + Test):


Unnamed: 0,Model,Val_Accuracy,Val_Precision,Val_Recall,Val_F1,Val_ROC-AUC,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_ROC-AUC
1,Random Forest,0.904128,0.0,0.0,0.0,0.713512,0.910667,0.375,0.046154,0.082192,0.732117
0,Logistic Regression,0.91478,1.0,0.015385,0.030303,0.710114,0.910667,0.0,0.0,0.0,0.783223
2,XGBoost,0.904128,0.0,0.0,0.0,0.708769,0.917333,0.8,0.061538,0.114286,0.732734


## 9. Save Comparison & Best Model

In [10]:
os.makedirs("models", exist_ok=True)

# Use safe, non-escaped paths and ensure directories exist
base_dir = r"F:\Projects\InfosysVirtualInternship-BFSI\backend"
comp_dir = os.path.join(base_dir, "Model_Comparison")
model_dir = os.path.join(base_dir, "final_fraud_model")
os.makedirs(comp_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

comp_path = os.path.join(comp_dir, "model_comparison.csv")
results_df.to_csv(comp_path, index=False)

best_model_name = results_df.iloc[0]["Model"]
best_model = models[best_model_name]

model_filename = f"{best_model_name.replace(' ', '_').lower()}_model.pkl"
model_path = os.path.join(model_dir, model_filename)
joblib.dump(best_model, model_path)

print(f"\nAll models evaluated. Best model: {best_model_name}")
print(f"Saved model to: {model_path}")
print(f"Model comparison saved to: {comp_path}")


All models evaluated. Best model: Random Forest
Saved model to: F:\Projects\InfosysVirtualInternship-BFSI\backend\final_fraud_model\random_forest_model.pkl
Model comparison saved to: F:\Projects\InfosysVirtualInternship-BFSI\backend\Model_Comparison\model_comparison.csv
