In [None]:
!pip install --only-binary=:all: numpy==1.24.3
!pip install --only-binary=:all: pandas scipy scikit-learn
!pip install --only-binary=:all: lightgbm xgboost catboost
!pip install --only-binary=:all: imbalanced-learn optuna

In [1]:
import numpy as np
import pandas as pd
import time
import optuna
import gc
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.metrics import (classification_report, roc_auc_score, roc_curve,
                              precision_recall_curve, average_precision_score,
                              precision_score, recall_score, f1_score, confusion_matrix)
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [None]:

start_time = time.time()


In [None]:


file_path = "/Users/pratyushmalviya/Desktop/fraud final model 26:4:25/creditcard.csv"
df = pd.read_csv(file_path)



Exploratory data analysis-
performing  a basic exploratory analysis on the dataframe , particularly focusing on class distribution.

In [None]:
print("=== EXPLORATORY DATA ANALYSIS ===")
print(f"Dataset shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")
class_counts = df['Class'].value_counts()
total = len(df)
print("\nClass Distribution:")
for cls, count in class_counts.items():
    print(f"Class {cls}: {count} samples ({count/total*100:.3f}%)")


=== EXPLORATORY DATA ANALYSIS ===
Dataset shape: (284807, 31)
Missing values: 0

Class Distribution:
Class 0: 284315 samples (99.827%)
Class 1: 492 samples (0.173%)


Time feature analysis - 
To identify hourly patterns in fraudulent behavior, which can inform feature engineering

In [None]:

if 'Time' in df.columns:

    df['Hour'] = df['Time'] // 3600
    fraud_by_hour = df[df['Class'] == 1].groupby('Hour').size()
    hourly_fraud_rate = df.groupby('Hour')['Class'].mean() * 100

    print("\n=== Time Features Analysis ===")
    print("\nFraud Transactions by Hour:")
    print(fraud_by_hour)

    print("\nFraud Rate by Hour (%):")
    print(hourly_fraud_rate)



=== Time Features Analysis ===

Fraud Transactions by Hour:
Hour
0.0      2
1.0      2
2.0     21
3.0     13
4.0      6
5.0     11
6.0      3
7.0     23
8.0      5
9.0     15
10.0     2
11.0    43
12.0     9
13.0     9
14.0    13
15.0    14
16.0    14
17.0    12
18.0    15
19.0     7
20.0     8
21.0    14
22.0     3
23.0    17
24.0     4
25.0     8
26.0    36
27.0     4
28.0    17
30.0     6
32.0     4
33.0     1
34.0     6
35.0    10
36.0     8
37.0     8
38.0    10
39.0    12
40.0     8
41.0    17
42.0    18
43.0    12
44.0    10
45.0     2
46.0     6
47.0     4
dtype: int64

Fraud Rate by Hour (%):
Hour
0.0     0.050467
1.0     0.090212
2.0     1.332487
3.0     0.713893
4.0     0.554529
5.0     0.654372
6.0     0.163845
7.0     0.682898
8.0     0.096544
9.0     0.190404
10.0    0.024131
11.0    0.504873
12.0    0.116399
13.0    0.118655
14.0    0.161913
15.0    0.178663
16.0    0.179810
17.0    0.152246
18.0    0.174277
19.0    0.087566
20.0    0.089087
21.0    0.141486
22.0    0.0

AMOUNT DISTRIBUTION ANALYSIS

In [None]:

print("\n=== Amount Distribution Analysis ===")
amount_stats = df.groupby('Class')['Amount'].describe()
print("\nTransaction Amount Statistics by Class:")
print(amount_stats)



=== Amount Distribution Analysis ===

Transaction Amount Statistics by Class:
          count        mean         std  min   25%    50%     75%       max
Class                                                                      
0      284315.0   88.291022  250.105092  0.0  5.65  22.00   77.05  25691.16
1         492.0  122.211321  256.683288  0.0  1.00   9.25  105.89   2125.87


 Correlation Matrix of Anonymous Features-
 To understand how the anonymized features are related to each other, which helps

1) Detect multicollinearity.
2) Guide dimensionality reduction or feature selection.

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Custom transformer to create new features"""
    
    def __init__(self):
        self.mean_amount = None
        self.std_amount = None
    
    def fit(self, X, y=None):
        if 'Amount' in X.columns:
            self.mean_amount = X['Amount'].mean()
            self.std_amount = X['Amount'].std()
        return self
    
    def transform(self, X):
        X_new = X.copy()
        
        if 'Time' in X_new.columns:
            X_new['Hour'] = X_new['Time'] // 3600 % 24
            X_new['Hour_sin'] = np.sin(2 * np.pi * X_new['Hour'] / 24)
            X_new['Hour_cos'] = np.cos(2 * np.pi * X_new['Hour'] / 24)

            X_new['IsMorning'] = ((X_new['Hour'] >= 6) & (X_new['Hour'] < 12)).astype(int)
            X_new['IsAfternoon'] = ((X_new['Hour'] >= 12) & (X_new['Hour'] < 18)).astype(int)
            X_new['IsEvening'] = ((X_new['Hour'] >= 18) & (X_new['Hour'] < 22)).astype(int)
            X_new['IsNight'] = ((X_new['Hour'] >= 22) | (X_new['Hour'] < 6)).astype(int)
            X_new = X_new.drop(['Time', 'Hour'], axis=1)
        
        if 'Amount' in X_new.columns:
            X_new['Amount_Log'] = np.log1p(X_new['Amount'])
            X_new['Amount_Zscore'] = (X_new['Amount'] - self.mean_amount) / self.std_amount

            X_new['IsSmallTxn'] = (X_new['Amount'] <= 5).astype(int)
            X_new['IsMediumTxn'] = ((X_new['Amount'] > 5) & (X_new['Amount'] <= 100)).astype(int)
            X_new['IsLargeTxn'] = (X_new['Amount'] > 100).astype(int)

            X_new = X_new.drop(['Amount'], axis=1)
        
        #  features engineering
        v_columns = [col for col in X_new.columns if col.startswith('V')]
        if v_columns:

            X_new['V_Sum'] = X_new[v_columns].sum(axis=1)
            X_new['V_Mean'] = X_new[v_columns].mean(axis=1)
            X_new['V_Std'] = X_new[v_columns].std(axis=1)
            X_new['V_Kurtosis'] = X_new[v_columns].kurtosis(axis=1)
            X_new['V_Skew'] = X_new[v_columns].skew(axis=1)
            
            # Feature ratios (using most important V features based on common findings)
            if all(col in X_new.columns for col in ['V1', 'V3', 'V4', 'V10', 'V11']):
                X_new['V1_to_V3'] = X_new['V1'] / (X_new['V3'] + 1e-8)
                X_new['V4_to_V10'] = X_new['V4'] / (X_new['V10'] + 1e-8)
                X_new['V11_to_V4'] = X_new['V11'] / (X_new['V4'] + 1e-8)
        
        return X_new


enhanced dataframe-

Applies the FeatureEngineer transformer to create new features, enhancing the dataset (df_enhanced).

Computes and visualizes the top 20 features with the strongest absolute correlations to the target variable 'Class' using a bar chart.

In [None]:

print("\n=== APPLYING FEATURE ENGINEERING ===")
feature_engineer = FeatureEngineer()
df_enhanced = feature_engineer.fit_transform(df)
if 'Class' in df_enhanced.columns:
    correlations = df_enhanced.corrwith(df_enhanced['Class']).sort_values(ascending=False)
    
    print("\nTop 20 Features Most Correlated with Fraud (absolute values):")
    top_features = correlations.drop('Class').abs().sort_values(ascending=False).head(20)
    print(top_features)



=== APPLYING FEATURE ENGINEERING ===

Top 20 Features Most Correlated with Fraud (absolute values):
V17       0.326481
V_Mean    0.316330
V_Sum     0.316330
V14       0.302544
V12       0.260593
V_Std     0.250839
V10       0.216883
V16       0.196539
V3        0.192961
V7        0.187257
V11       0.154876
V4        0.133447
V18       0.111485
V1        0.101347
V9        0.097733
V5        0.094974
V2        0.091289
V6        0.043643
V21       0.040413
V_Skew    0.035339
dtype: float64


DATA PREPARATION AND PREPROCESSING

In [29]:
# Split into features and target
X = df_enhanced.drop(['Class'], axis=1) if 'Class' in df_enhanced.columns else df_enhanced
y = df['Class'] if 'Class' in df else None

print(f"\nFeatures shape after engineering: {X.shape}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Features shape after engineering: (284807, 47)
Training set: (199364, 47), Test set: (85443, 47)


Define preprocessing pipeline with caching

In [None]:

from tempfile import mkdtemp
from joblib import Memory
cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=0)


Preprocessing Pipeline: Applies PowerTransformer (Yeo-Johnson method) to scale the training and test data (X_train, X_test).

PCA Transformation: Reduces dimensionality using PCA to retain 95% variance, creating new PCA features.

Combined Features: Combines the original preprocessed features with the PCA components, resulting in a new feature set.

In [46]:
# Preprocessing pipeline
preprocess_pipeline = Pipeline([
    ('scaler', PowerTransformer(method='yeo-johnson'))
])

# Apply preprocessing
print("\n=== APPLYING PREPROCESSING ===")
X_train_preprocessed = preprocess_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocess_pipeline.transform(X_test)

# Add PCA features 
pca = PCA(n_components=0.95, random_state=42)  # Keep 95% of variance
X_train_pca = pca.fit_transform(X_train_preprocessed)
X_test_pca = pca.transform(X_test_preprocessed)

print(f"Number of PCA components: {pca.n_components_}")

# Combine original preprocessed features with PCA features
X_train_with_pca = np.hstack((X_train_preprocessed, X_train_pca))
X_test_with_pca = np.hstack((X_test_preprocessed, X_test_pca))

# Create feature names for PCA components
original_feature_names = list(X.columns)
pca_feature_names = [f'PCA_{i}' for i in range(pca.n_components_)]
all_feature_names = original_feature_names + pca_feature_names

print(f"Final feature count: {len(all_feature_names)}")


=== APPLYING PREPROCESSING ===
Number of PCA components: 35
Final feature count: 82


Handling class imbalance -

Techniques: None, SMOTE, ADASYN, SMOTETomek, Random Undersampling.

Performance evaluated using an XGBClassifier and metrics: Precision, Recall, F1 Score, PR-AUC, ROC-AUC.

Best Technique Selection:

The best resampling technique is selected based on the highest PR-AUC score.




In [None]:
# === Define resampling techniques ===
print("\n=== EVALUATING RESAMPLING TECHNIQUES ===")

resampling_techniques = {
    'None': None,
    'SMOTE': SMOTE(sampling_strategy=0.1, random_state=42),
    'ADASYN': ADASYN(sampling_strategy=0.1, random_state=42),
    'SMOTETomek': SMOTETomek(sampling_strategy=0.1, random_state=42),
    'Random Undersampling': RandomUnderSampler(sampling_strategy=0.5, random_state=42)
}

def evaluate_resampler(name, resampler, X_train, y_train, X_test, y_test):
    """Evaluate different resampling techniques"""
    print(f"\nEvaluating: {name}")
    
    if resampler is not None:
        X_res, y_res = resampler.fit_resample(X_train, y_train)
        unique, counts = np.unique(y_res, return_counts=True)
        print(f"Class distribution after resampling: {dict(zip(unique, counts))}")
        print(f"Ratio (Fraud/Normal): {counts[1]/counts[0]:.4f}")
    else:
        X_res, y_res = X_train, y_train
        unique, counts = np.unique(y_res, return_counts=True)
        print(f"Original class distribution: {dict(zip(unique, counts))}")
        print(f"Ratio (Fraud/Normal): {counts[1]/counts[0]:.4f}")

    # Train a quick model to evaluate
    clf = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        random_state=42,
        n_jobs=4,
        early_stopping_rounds=10
    )

    # Split into train/validation
    X_res_train, X_res_val, y_res_train, y_res_val = train_test_split(
        X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
    )

    # Train model
    clf.fit(
        X_res_train, y_res_train,
        eval_set=[(X_res_val, y_res_val)],
        verbose=0
    )

    # Evaluate on test set
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"Test metrics - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"PR-AUC: {pr_auc:.4f}, ROC-AUC: {roc_auc:.4f}")

    # Garbage collection
    del clf, X_res_train, X_res_val, y_res_train, y_res_val
    gc.collect()

    return name, (f1, pr_auc, precision, recall, roc_auc), (X_res, y_res)


resampling_results = {}
resampled_data = {}

for name, resampler in resampling_techniques.items():
    name, metrics, data = evaluate_resampler(
        name, resampler, X_train_with_pca, y_train, X_test_with_pca, y_test
    )
    resampling_results[name] = metrics
    resampled_data[name] = data

# === Create results DataFrame ===
resampling_df = pd.DataFrame(columns=['Technique', 'F1 Score', 'PR-AUC', 'Precision', 'Recall', 'ROC-AUC'])

for name, metrics in resampling_results.items():
    resampling_df = pd.concat([resampling_df, pd.DataFrame({
        'Technique': [name],
        'F1 Score': [metrics[0]],
        'PR-AUC': [metrics[1]],
        'Precision': [metrics[2]],
        'Recall': [metrics[3]],
        'ROC-AUC': [metrics[4]]
    })], ignore_index=True)

print("\n=== RESAMPLING TECHNIQUES COMPARISON ===")
print(resampling_df.sort_values('PR-AUC', ascending=False))

# === Select best method by PR-AUC ===
best_resampling = resampling_df.loc[resampling_df['PR-AUC'].idxmax(), 'Technique']
print(f"\nBest resampling technique: {best_resampling} with PR-AUC of {resampling_df['PR-AUC'].max():.4f}")

# Get resampled data for best technique
X_train_resampled, y_train_resampled = resampled_data[best_resampling]


=== EVALUATING RESAMPLING TECHNIQUES ===

Evaluating: None
Original class distribution: {0: 199020, 1: 344}
Ratio (Fraud/Normal): 0.0017
Test metrics - Precision: 0.9576, Recall: 0.7635, F1: 0.8496
PR-AUC: 0.8304, ROC-AUC: 0.9712

Evaluating: SMOTE
Class distribution after resampling: {0: 199020, 1: 19902}
Ratio (Fraud/Normal): 0.1000
Test metrics - Precision: 0.7756, Recall: 0.8176, F1: 0.7961
PR-AUC: 0.8253, ROC-AUC: 0.9705

Evaluating: ADASYN
Class distribution after resampling: {0: 199020, 1: 19894}
Ratio (Fraud/Normal): 0.1000
Test metrics - Precision: 0.6780, Recall: 0.8108, F1: 0.7385
PR-AUC: 0.7983, ROC-AUC: 0.9715

Evaluating: SMOTETomek
Class distribution after resampling: {0: 199020, 1: 19902}
Ratio (Fraud/Normal): 0.1000
Test metrics - Precision: 0.7756, Recall: 0.8176, F1: 0.7961
PR-AUC: 0.8253, ROC-AUC: 0.9705

Evaluating: Random Undersampling
Class distribution after resampling: {0: 688, 1: 344}
Ratio (Fraud/Normal): 0.5000
Test metrics - Precision: 0.1148, Recall: 0.85

 MODEL TUNING & OPTIMIZATION-

 Models Optimized: XGBoost, LightGBM, and CatBoost.

Objective: Maximizing PR-AUC (Precision-Recall AUC) for each model.

Trials: 35 trials per model to optimize hyperparameters efficiently.

XGBoost: Optimizes parameters like max_depth, learning_rate, min_child_weight, subsample, etc., using cross-validation and early stopping.

LightGBM: Focuses on parameters such as num_leaves, learning_rate, max_depth, subsample, etc.

CatBoost: Tunes parameters like iterations, depth, learning_rate, l2_leaf_reg, etc.



In [None]:

import lightgbm as lgb
import sys

print("\n=== STARTING MODEL OPTIMIZATION ===")

# Force conversion to numpy arrays before optimization
def prepare_data(X, y):
    """Convert data to numpy arrays to avoid indexing issues"""
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        X = X.values
    if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
        y = y.values
    return X, y


n_trials = 35  

def objective_xgb(trial):
    """Objective function for XGBoost optimization using lower-level XGBoost API"""
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 20.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'objective': 'binary:logistic',
        'eval_metric': 'aucpr',
        'seed': 42
    }
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    
    try:
        # Get data as numpy arrays
        X_data, y_data = prepare_data(X_train_resampled, y_train_resampled)
        
        # Cross-validation
        scores = []
        
        # 3 FOLDS CROSS VALIDATIONs
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        for train_index, val_index in kf.split(X_data, y_data):
            X_t, X_v = X_data[train_index], X_data[val_index]
            y_t, y_v = y_data[train_index], y_data[val_index]
            
            try:
                dtrain = xgb.DMatrix(X_t, label=y_t)
                dval = xgb.DMatrix(X_v, label=y_v)
                
                # Train model with early stopping
                model = xgb.train(
                    param,
                    dtrain,
                    num_boost_round=n_estimators,
                    evals=[(dval, 'validation')],
                    early_stopping_rounds=10,
                    verbose_eval=False
                )
                
                # Make predictions
                y_pred = model.predict(dval)
                score = average_precision_score(y_v, y_pred)
                scores.append(score)
            except Exception as e:
                print(f"Error during XGBoost training fold: {e}")
                continue
        
        if not scores:
            return 0.0
        
        return np.mean(scores)
    except Exception as e:
        print(f"Error in XGBoost objective function: {e}")
        return 0.0

def objective_lgbm(trial):
    """Objective function for LightGBM optimization"""
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 60),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 0.1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': 4,
        'verbose': -1
    }
    
    try:
        # Get data as numpy arrays to avoid indexing issues
        X_data, y_data = prepare_data(X_train_resampled, y_train_resampled)
        
        # 3 FOLD Cross-validation
        scores = []
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        for train_index, val_index in kf.split(X_data, y_data):
            X_t, X_v = X_data[train_index], X_data[val_index]
            y_t, y_v = y_data[train_index], y_data[val_index]
            
            try:
                model = LGBMClassifier(**param)
                model.fit(
                    X_t, y_t,
                    eval_set=[(X_v, y_v)],
                    eval_metric='auc', 
                    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
                )
                
                y_pred = model.predict_proba(X_v)[:, 1]
                score = average_precision_score(y_v, y_pred)
                scores.append(score)
            except Exception as e:
                print(f"Error during LightGBM training fold: {e}")
                continue
        
        if not scores:
            return 0.0  
        return np.mean(scores)
    except Exception as e:
        print(f"Error in LightGBM objective function: {e}")
        return 0.0


def objective_catboost(trial):
    """Objective function for CatBoost optimization"""
    param = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'depth': trial.suggest_int('depth', 4, 9),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 128),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 20.0),
        'random_seed': 42,
        'thread_count': 4,
        'verbose': 0
    }
    
    try:
        # Get data as numpy arrays to avoid indexing issues
        X_data, y_data = prepare_data(X_train_resampled, y_train_resampled)
        
        # 3 FOLD Cross-validation
        scores = []
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        
        for train_index, val_index in kf.split(X_data, y_data):
            X_t, X_v = X_data[train_index], X_data[val_index]
            y_t, y_v = y_data[train_index], y_data[val_index]
            
            try:
                model = CatBoostClassifier(**param)
                model.fit(
                    X_t, y_t,
                    eval_set=[(X_v, y_v)],
                    early_stopping_rounds=10,
                    verbose=0
                )
                
                y_pred = model.predict_proba(X_v)[:, 1]
                score = average_precision_score(y_v, y_pred)
                scores.append(score)
            except Exception as e:
                print(f"Error during CatBoost training fold: {e}")
                continue
        
        if not scores:
            return 0.0  
        return np.mean(scores)
    except Exception as e:
        print(f"Error in CatBoost objective function: {e}")
        return 0.0

try:
    X_train_resampled_shape = X_train_resampled.shape
    y_train_resampled_shape = y_train_resampled.shape
    print(f"Data loaded successfully. X shape: {X_train_resampled_shape}, y shape: {y_train_resampled_shape}")
except NameError:
    print("\nERROR: X_train_resampled and/or y_train_resampled not found.")
    print("Please define these variables before running the optimization.")
    print("Exiting...")
    sys.exit(1)

# Using Optuna for hyperparameter optimization
print("\nOptimizing XGBoost hyperparameters...")
study_xgb = optuna.create_study(direction='maximize', 
                              pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study_xgb.optimize(objective_xgb, n_trials=n_trials)

print("\nOptimizing LightGBM hyperparameters...")
study_lgbm = optuna.create_study(direction='maximize',
                               pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study_lgbm.optimize(objective_lgbm, n_trials=n_trials)

print("\nOptimizing CatBoost hyperparameters...")
study_catboost = optuna.create_study(direction='maximize',
                                   pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study_catboost.optimize(objective_catboost, n_trials=n_trials)

# Display best parameters
print("\n=== BEST HYPERPARAMETERS ===")
print("Best XGBoost parameters:", study_xgb.best_params)
print("Best XGBoost PR-AUC:", study_xgb.best_value)

print("\nBest LightGBM parameters:", study_lgbm.best_params)
print("Best LightGBM PR-AUC:", study_lgbm.best_value)

print("\nBest CatBoost parameters:", study_catboost.best_params)
print("Best CatBoost PR-AUC:", study_catboost.best_value)

[I 2025-04-26 11:56:33,733] A new study created in memory with name: no-name-a0b347be-f4af-4eeb-99c2-3a5eba5f3f45



=== STARTING MODEL OPTIMIZATION ===
Data loaded successfully. X shape: (199364, 82), y shape: (199364,)

Optimizing XGBoost hyperparameters...


[I 2025-04-26 11:56:35,328] Trial 0 finished with value: 0.7809170080680178 and parameters: {'max_depth': 8, 'learning_rate': 0.03266840172964194, 'min_child_weight': 4, 'subsample': 0.9502102487642593, 'colsample_bytree': 0.7668468829054276, 'scale_pos_weight': 19.15354004030622, 'gamma': 3.476308254713722, 'reg_alpha': 3.65834066846962, 'reg_lambda': 4.319090585478796, 'n_estimators': 115}. Best is trial 0 with value: 0.7809170080680178.
[I 2025-04-26 11:56:37,766] Trial 1 finished with value: 0.8269750499896817 and parameters: {'max_depth': 7, 'learning_rate': 0.10976892362347737, 'min_child_weight': 1, 'subsample': 0.8915516033395161, 'colsample_bytree': 0.7213636873023949, 'scale_pos_weight': 7.417629894045873, 'gamma': 1.2780733840700176, 'reg_alpha': 4.756668486853968, 'reg_lambda': 3.0749487582805473, 'n_estimators': 111}. Best is trial 1 with value: 0.8269750499896817.
[I 2025-04-26 11:56:38,917] Trial 2 finished with value: 0.7632392469198009 and parameters: {'max_depth': 3, 


Optimizing LightGBM hyperparameters...


[I 2025-04-26 11:57:24,722] Trial 0 finished with value: 0.46764122551741294 and parameters: {'n_estimators': 224, 'learning_rate': 0.0654785675746326, 'num_leaves': 57, 'max_depth': 8, 'min_child_samples': 27, 'subsample': 0.889338595340035, 'colsample_bytree': 0.6802681825524103, 'min_split_gain': 0.029981367940134687, 'min_child_weight': 0.030229440867681603, 'reg_alpha': 0.46979031341775224, 'reg_lambda': 0.43657966131130777}. Best is trial 0 with value: 0.46764122551741294.
[I 2025-04-26 11:57:26,538] Trial 1 finished with value: 0.6325190966508335 and parameters: {'n_estimators': 105, 'learning_rate': 0.06633013094703986, 'num_leaves': 53, 'max_depth': 7, 'min_child_samples': 17, 'subsample': 0.7055047975567528, 'colsample_bytree': 0.9625396216724755, 'min_split_gain': 0.05566584525658741, 'min_child_weight': 0.004462513036113901, 'reg_alpha': 0.1474693855590381, 'reg_lambda': 0.6255933052437673}. Best is trial 1 with value: 0.6325190966508335.
[I 2025-04-26 11:57:28,479] Trial 2


Optimizing CatBoost hyperparameters...


[I 2025-04-26 11:58:19,862] Trial 0 finished with value: 0.8493955228090125 and parameters: {'iterations': 101, 'learning_rate': 0.03328174939123163, 'depth': 9, 'l2_leaf_reg': 6.142078734395756, 'border_count': 85, 'scale_pos_weight': 12.594178404485636}. Best is trial 0 with value: 0.8493955228090125.
[I 2025-04-26 11:58:36,059] Trial 1 finished with value: 0.814155457682486 and parameters: {'iterations': 149, 'learning_rate': 0.011702042903440494, 'depth': 5, 'l2_leaf_reg': 8.975379465087688, 'border_count': 80, 'scale_pos_weight': 16.15654390665393}. Best is trial 0 with value: 0.8493955228090125.
[I 2025-04-26 11:58:53,542] Trial 2 finished with value: 0.8255009017160789 and parameters: {'iterations': 210, 'learning_rate': 0.0317196240293269, 'depth': 4, 'l2_leaf_reg': 9.467942986580788, 'border_count': 126, 'scale_pos_weight': 7.779203687716522}. Best is trial 0 with value: 0.8493955228090125.
[I 2025-04-26 11:59:11,366] Trial 3 finished with value: 0.8551644760029743 and paramet


=== BEST HYPERPARAMETERS ===
Best XGBoost parameters: {'max_depth': 9, 'learning_rate': 0.19204590754392462, 'min_child_weight': 8, 'subsample': 0.7395244220349267, 'colsample_bytree': 0.6106090410074912, 'scale_pos_weight': 1.5603241849262437, 'gamma': 0.5971932711844856, 'reg_alpha': 1.1856237564368213, 'reg_lambda': 4.949550158662442, 'n_estimators': 218}
Best XGBoost PR-AUC: 0.8615849128293896

Best LightGBM parameters: {'n_estimators': 168, 'learning_rate': 0.08289893549823593, 'num_leaves': 37, 'max_depth': 3, 'min_child_samples': 5, 'subsample': 0.9557813071628072, 'colsample_bytree': 0.6091969753173453, 'min_split_gain': 0.02228718089156831, 'min_child_weight': 0.018380831967552268, 'reg_alpha': 0.18939587437176686, 'reg_lambda': 0.9696893144321871}
Best LightGBM PR-AUC: 0.7598892009455182

Best CatBoost parameters: {'iterations': 279, 'learning_rate': 0.06466488234944177, 'depth': 7, 'l2_leaf_reg': 7.824705022736504, 'border_count': 47, 'scale_pos_weight': 1.0858456387998139}

 TRAINING FINAL MODELS

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

print("\n=== TRAINING FINAL MODELS ===")

# Create validation set
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.2, random_state=42, stratify=y_train_resampled
)

# XGBoost
xgb_params = study_xgb.best_params.copy()
n_estimators_xgb = xgb_params.pop('n_estimators', 300)

xgb_model = XGBClassifier(n_estimators=n_estimators_xgb, **xgb_params, n_jobs=4)
xgb_model.fit(X_train_final, y_train_final)

# LightGBM
lgbm_params = study_lgbm.best_params.copy()
n_estimators_lgbm = lgbm_params.pop('n_estimators', 300)
lgbm_model = LGBMClassifier(n_estimators=n_estimators_lgbm, **lgbm_params, n_jobs=4)
lgbm_model.fit(
    X_train_final, y_train_final,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
)

#  CatBoost
catboost_params = study_catboost.best_params.copy()
iterations = catboost_params.pop('iterations', 300)
catboost_model = CatBoostClassifier(iterations=iterations, **catboost_params, thread_count=4)
catboost_model.fit(
    X_train_final, y_train_final,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=0
)



=== TRAINING FINAL MODELS ===


<catboost.core.CatBoostClassifier at 0x17f6f44d0>

MODEL EVALUATION

find_optimal_threshold: Finds the threshold that maximizes the F1 score using the precision-recall curve.

evaluate_model: Evaluates the model using several metrics including ROC AUC, PR AUC, Precision, Recall, F1 score, and confusion matrix.

XGBoost: Evaluated using the test data (X_test_with_pca, y_test), output includes metrics like ROC AUC, PR AUC, Precision, Recall, F1 score, and confusion matrix.

LightGBM: Same evaluation process as XGBoost.

CatBoost: Same evaluation process as XGBoost.

Ensemble Creation: A weighted average of model predictions (XGBoost, LightGBM, CatBoost) based on their validation PR AUC scores.

Ensemble Weights: Calculated based on validation PR AUC

Ensemble Metrics: The weighted ensemble is evaluated using the same metrics as individual models, including ROC AUC, PR AUC, Precision, Recall, F1 score, and confusion matrix.

Optimal Threshold for Ensemble: Calculated using the F1 score.

Ensemble Results: Printed for evaluation

In [None]:
print("\n=== EVALUATING MODELS ===")

# Function to find optimal threshold using F1 score
def find_optimal_threshold(y_true, y_score):
    """Find the optimal threshold that maximizes F1 score"""
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)

    f1_scores = 2 * precision * recall / (precision + recall + 1e-7)
    if len(thresholds) < len(f1_scores):
        thresholds = np.append(thresholds, 1.0)
    return thresholds[np.argmax(f1_scores)], np.max(f1_scores)

# Function to evaluate and visualize model performance
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate model and return predictions and metrics"""
    y_prob = model.predict_proba(X_test)[:, 1]
    
    roc_auc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)
    
    # Finding optimal threshold
    optimal_threshold, best_f1 = find_optimal_threshold(y_test, y_prob)
    y_pred = (y_prob >= optimal_threshold).astype(int)
    
    # Calculation of  metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\n{model_name} Model Evaluation:")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print(f"Optimal threshold: {optimal_threshold:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    

    cm = confusion_matrix(y_test, y_pred)
    
    # Return results
    return {
        'model_name': model_name,
        'y_prob': y_prob,
        'y_pred': y_pred,
        'optimal_threshold': optimal_threshold,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }

# Evaluating  each model
xgb_results = evaluate_model(xgb_model, X_test_with_pca, y_test, "XGBoost")
lgbm_results = evaluate_model(lgbm_model, X_test_with_pca, y_test, "LightGBM")
catboost_results = evaluate_model(catboost_model, X_test_with_pca, y_test, "CatBoost")


# Calculating weights based on validation PR-AUC
xgb_val_probs = xgb_model.predict_proba(X_val)[:, 1]
lgbm_val_probs = lgbm_model.predict_proba(X_val)[:, 1]
catboost_val_probs = catboost_model.predict_proba(X_val)[:, 1]

xgb_val_pr_auc = average_precision_score(y_val, xgb_val_probs)
lgbm_val_pr_auc = average_precision_score(y_val, lgbm_val_probs)
catboost_val_pr_auc = average_precision_score(y_val, catboost_val_probs)

# Calculating weighted ensemble weights based on PR-AUC
total_pr_auc = xgb_val_pr_auc + lgbm_val_pr_auc + catboost_val_pr_auc
weights = [
    xgb_val_pr_auc / total_pr_auc,
    lgbm_val_pr_auc / total_pr_auc,
    catboost_val_pr_auc / total_pr_auc
]

print(f"\nEnsemble weights:")
print(f"XGBoost: {weights[0]:.4f}")
print(f"LightGBM: {weights[1]:.4f}")
print(f"CatBoost: {weights[2]:.4f}")

# Creating ensemble predictions
ensemble_probs = (
    weights[0] * xgb_results['y_prob'] + 
    weights[1] * lgbm_results['y_prob'] + 
    weights[2] * catboost_results['y_prob']
)

# Evaluating ensemble
ensemble_results = {
    'model_name': 'Weighted Ensemble',
    'y_prob': ensemble_probs
}

# Finding optimal threshold for ensemble
optimal_threshold, _ = find_optimal_threshold(y_test, ensemble_probs)
ensemble_results['optimal_threshold'] = optimal_threshold
ensemble_results['y_pred'] = (ensemble_probs >= optimal_threshold).astype(int)

# Calculating metrics for ensemble
ensemble_results['roc_auc'] = roc_auc_score(y_test, ensemble_probs)
ensemble_results['pr_auc'] = average_precision_score(y_test, ensemble_probs)
ensemble_results['precision'] = precision_score(y_test, ensemble_results['y_pred'])
ensemble_results['recall'] = recall_score(y_test, ensemble_results['y_pred'])
ensemble_results['f1'] = f1_score(y_test, ensemble_results['y_pred'])
ensemble_results['confusion_matrix'] = confusion_matrix(y_test, ensemble_results['y_pred'])

print(f"\nWeighted Ensemble Model Evaluation:")
print(f"ROC AUC: {ensemble_results['roc_auc']:.4f}")
print(f"PR AUC: {ensemble_results['pr_auc']:.4f}")
print(f"Optimal threshold: {ensemble_results['optimal_threshold']:.4f}")
print(f"Precision: {ensemble_results['precision']:.4f}")
print(f"Recall: {ensemble_results['recall']:.4f}")
print(f"F1 Score: {ensemble_results['f1']:.4f}")


=== EVALUATING MODELS ===

XGBoost Model Evaluation:
ROC AUC: 0.9779
PR AUC: 0.8282
Optimal threshold: 0.4526
Precision: 0.9328
Recall: 0.7500
F1 Score: 0.8315

LightGBM Model Evaluation:
ROC AUC: 0.9620
PR AUC: 0.7923
Optimal threshold: 0.5320
Precision: 0.9000
Recall: 0.7297
F1 Score: 0.8060

CatBoost Model Evaluation:
ROC AUC: 0.9774
PR AUC: 0.8208
Optimal threshold: 0.2302
Precision: 0.9084
Recall: 0.8041
F1 Score: 0.8530

Ensemble weights:
XGBoost: 0.3407
LightGBM: 0.3164
CatBoost: 0.3429

Weighted Ensemble Model Evaluation:
ROC AUC: 0.9799
PR AUC: 0.8296
Optimal threshold: 0.3521
Precision: 0.9062
Recall: 0.7838
F1 Score: 0.8406


In [None]:
import pandas as pd


results = [
    {'Model': 'XGBoost', 'PR AUC': 0.8282, 'Threshold': 0.4526},
    {'Model': 'LightGBM', 'PR AUC': 0.7923, 'Threshold': 0.5320},
    {'Model': 'CatBoost', 'PR AUC': 0.8208, 'Threshold': 0.2302},
    {'Model': 'Weighted Ensemble', 'PR AUC': 0.8296, 'Threshold': 0.3521}
]

# Create the DataFrame and sort by PR AUC descending
results_df = pd.DataFrame(results).sort_values(by='PR AUC', ascending=False).reset_index(drop=True)


In [68]:
print("\n=== SAVING THE BEST MODEL ===")

import pickle

# Identify the best model based on PR-AUC
best_model_name = results_df.iloc[0]['Model']
print(f"Best model based on PR-AUC: {best_model_name}")

# Save the corresponding model
best_model = None
if best_model_name == 'XGBoost':
    best_model = xgb_model
elif best_model_name == 'LightGBM':
    best_model = lgbm_model
elif best_model_name == 'CatBoost':
    best_model = catboost_model
elif best_model_name == 'Weighted Ensemble':
    # For ensemble, save all component models and weights
    with open('ensemble_models.pkl', 'wb') as f:
        pickle.dump({
            'xgb_model': xgb_model,
            'lgbm_model': lgbm_model,
            'catboost_model': catboost_model,
            'weights': weights,
            'threshold': ensemble_results['optimal_threshold'],
            'feature_engineer': feature_engineer,
            'preprocess_pipeline': preprocess_pipeline,
            'pca': pca
        }, f)
    print("Ensemble model saved as 'ensemble_models.pkl'")
    best_model = 'ensemble'

# If a single model was best
if best_model != 'ensemble' and best_model is not None:
    with open(f'{best_model_name.lower()}_model.pkl', 'wb') as f:
        pickle.dump({
            'model': best_model,
            'threshold': results_df.iloc[0]['Threshold'],
            'feature_engineer': feature_engineer,
            'preprocess_pipeline': preprocess_pipeline,
            'pca': pca
        }, f)
    print(f"{best_model_name} model saved as '{best_model_name.lower()}_model.pkl'")



=== SAVING THE BEST MODEL ===
Best model based on PR-AUC: Weighted Ensemble
Ensemble model saved as 'ensemble_models.pkl'
