In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from evaluation import evaluate_model
from preprocessing import *
from sklearn import preprocessing
import xgboost as xgb
import gc

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

sns.set_theme(
    style="whitegrid",       # Background style ("whitegrid", "darkgrid", etc.)
    palette="deep",          # Default color palette ("deep", "muted", "bright", etc.)
    font="sans-serif",       # Font family
    font_scale=1.1,          # Scale font size slightly
    rc={"figure.figsize": (8, 5)}  # Default figure size
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_path = Path("../datasets")

train_identity = pd.read_csv(dataset_path / "train_identity.csv")
train_tx = pd.read_csv(dataset_path / "train_transaction.csv")

# test_identity = pd.read_csv(dataset_path / "test_identity.csv")
# test_tx = pd.read_csv(dataset_path / "test_transaction.csv")

In [3]:
train_all_cols = pd.merge(train_tx, train_identity, on='TransactionID', how='left')
# test_all_cols = pd.merge(train_tx, train_identity, on='TransactionID', how='left')

X =  train_all_cols.drop(columns=['isFraud'])
y = train_all_cols['isFraud']

In [4]:
print(f"X_shape: {X.shape}")

X_shape: (590540, 433)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [6]:
# X_train, X_val, X_test = run_feature_engineering(X_train, X_val, X_test)

In [7]:
A

Dropping columns: ['V118', 'V123', 'id_22', 'V114', 'V300', 'V119', 'id_25', 'id_07', 'V109', 'V117', 'TransactionDT', 'V120', 'V305', 'id_21', 'V122', 'id_27', 'id_26', 'id_08', 'V311', 'V108', 'C3', 'V110', 'V116', 'V112', 'V107', 'id_24', 'id_23', 'V111', 'V121', 'V115', 'V286', 'V113', 'V301']
Train shape: (472432, 400)
Val shape: (118108, 400)


In [8]:
# X_train, X_val, X_test = run_feature_engineering(X_train, X_val, X_test)

## XGBoost without Cross-Validation

In [9]:
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)

model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=17,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    missing=-999,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
from sklearn.metrics import f1_score, recall_score, precision_score,accuracy_score
# y_pred = model.predict(X_test)
y_pred = model.predict(X_test)
print("F1 Score: ", f1_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

F1 Score:  0.810010764262648
Recall:  0.7282845390757319
Precision:  0.9123976962715975
Accuracy:  0.9880448403156433


## XGBoost with K Cross Validation

In [11]:
from xgboost import XGBClassifier
from cross_validation import cross_validate_model

model = XGBClassifier(
    n_estimators=500,
    max_depth=17,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42,
    scale_pos_weight=sum(y_train == 0) / sum(y_train == 1)
)

cross_validate_model(model, X_train, y_train, X_test, y_test, epochs=4)


🚀 Fold 1/4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 - Recall: 0.6909486931268151
Fold 1 - Precision: 0.8989294710327456
Fold 1 - F1 Score: 0.7813355227148331

🚀 Fold 2/4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 - Recall: 0.6839303000968054
Fold 2 - Precision: 0.9127906976744186
Fold 2 - F1 Score: 0.7819590481460985

🚀 Fold 3/4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 - Recall: 0.6963464795548028
Fold 3 - Precision: 0.9002189552705662
Fold 3 - F1 Score: 0.7852660300136426

🚀 Fold 4/4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 - Recall: 0.6861843697072345
Fold 4 - Precision: 0.9011757229107086
Fold 4 - F1 Score: 0.7791208791208791

📊 Final OOF Evaluation:
OOF Recall: 0.6893526920750151
OOF Precision: 0.9032181357006975
OOF F1 Score: 0.7819254786248542

🧪 Final Test Evaluation:
Test Recall: 0.7067505443987419
Test Precision: 0.9197103274559194
Test F1 Score: 0.7992885483650294
Test ROC AUC: 0.9721155894220609


## Searching best combination of Feature Engineering Pipeline 

In [12]:
config = {
    'drop_low_information_columns': True,
    'drop_transaction_dt': True,
    'encode_categorical_columns': True,
    'fill_missing_values': True,
    'create_transaction_amount_ratios': False,
    'group_rare_categories': False,
    'create_time_features': False,
    'drop_unused_columns': True,
    'log_transform_transaction_amt': False
}


X_train_processed = run_feature_engineering_single_df(X_train.copy(), config)
X_test_processed = run_feature_engineering_single_df(X_test.copy(), config)

f1 = evaluate_model(X_train_processed, X_test_processed, model, y_train, y_test)
print(f"✅ F1-score: {f1}")

🚧 Starting feature engineering pipeline...

✅ Low-information columns dropped
✅ Data cleaned
✅ Categorical columns encoded
✅ Missing values filled
✅ Unused columns dropped
✅ Numeric features standardized
🎯 Final shape: (472432, 397)
🚧 Starting feature engineering pipeline...

✅ Low-information columns dropped
✅ Data cleaned
✅ Categorical columns encoded
✅ Missing values filled
✅ Unused columns dropped
✅ Numeric features standardized
🎯 Final shape: (118108, 397)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


KeyboardInterrupt: 