In [4]:
import pandas as pd
df = pd.read_csv('Data/final.csv')
df.head()

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,avg_txn_amount,errorBalanceDest,balanceDiffDest,isFlaggedFraud,log_amount,hour,zero_balance_flag,isFraud
0,0.0,0.0,0.0,1.0,0.0,9839.64,9839.64,9839.64,-9839.64,0,9.194276,1,0,0
1,0.0,0.0,0.0,1.0,0.0,1864.28,1864.28,1864.28,-1864.28,0,7.531166,1,0,0
2,0.0,0.0,0.0,0.0,1.0,181.0,181.0,181.0,-181.0,0,5.204007,1,0,1
3,0.0,1.0,0.0,0.0,0.0,181.0,181.0,21363.0,-21363.0,0,5.204007,1,0,1
4,0.0,0.0,0.0,1.0,0.0,11668.14,11668.14,11668.14,-11668.14,0,9.364703,1,0,0


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()

X = df.drop(['isFraud'],axis=1)
y = df['isFraud']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [6]:
# === Multi-Model GridSearchCV with StratifiedKFold ===
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, average_precision_score

# 1️⃣ Define Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2️⃣ Define models and their hyperparameter grids
models = {
    "LogisticRegression": (
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
        {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['lbfgs', 'liblinear']
        }
    ),
    "RandomForest": (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        {
            'n_estimators': [100, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5]
        }
    ),
    "XGBoost": (
        XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
            scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()  # handle imbalance
        ),
        {
            'n_estimators': [100, 300],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        }
    )
}

# 3️⃣ Train, tune, and evaluate each model
best_models = {}
for name, (model, params) in models.items():
    print(f"\n🔹 Training and tuning {name}...")
    grid = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='average_precision',  # PR-AUC for imbalance
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    
    grid.fit(X_train, y_train)
    
    print(f"✅ Best params for {name}: {grid.best_params_}")
    print(f"✅ Best CV avg precision: {grid.best_score_:.4f}")
    
    # Save the best model
    best_models[name] = grid.best_estimator_

# 4️⃣ Evaluate all best models on test data
print("\n=== Test Set Evaluation ===")
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    print(f"\n🧾 {name} Results:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Average Precision (AUC-PR):", average_precision_score(y_test, y_proba))



🔹 Training and tuning LogisticRegression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits


MemoryError: Unable to allocate 311. MiB for an array with shape (10, 4072077) and data type float64