In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from evaluation import evaluate_model
from preprocessing import *
from sklearn import preprocessing
import xgboost as xgb
import gc

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

sns.set_theme(
    style="whitegrid",       # Background style ("whitegrid", "darkgrid", etc.)
    palette="deep",          # Default color palette ("deep", "muted", "bright", etc.)
    font="sans-serif",       # Font family
    font_scale=1.1,          # Scale font size slightly
    rc={"figure.figsize": (8, 5)}  # Default figure size
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_path = Path("../datasets")

train_identity = pd.read_csv(dataset_path / "train_identity.csv")
train_tx = pd.read_csv(dataset_path / "train_transaction.csv")

# test_identity = pd.read_csv(dataset_path / "test_identity.csv")
# test_tx = pd.read_csv(dataset_path / "test_transaction.csv")

In [3]:
train_all_cols = pd.merge(train_tx, train_identity, on='TransactionID', how='left')
# test_all_cols = pd.merge(train_tx, train_identity, on='TransactionID', how='left')

X =  train_all_cols.drop(columns=['isFraud'])
y = train_all_cols['isFraud']

In [4]:
print(f"X_shape: {X.shape}")

X_shape: (590540, 433)


In [5]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42
)

In [6]:
# X_train, X_val, X_test = run_feature_engineering(X_train, X_val, X_test)

In [7]:
high_missing_cols = [col for col in X_train.columns if X_train[col].isnull().sum() / X_train.shape[0] > 0.96]
high_missing_cols_X_test = [col for col in X_test.columns if X_test[col].isnull().sum() / X_test.shape[0] > 0.96]

big_top_value_cols = [col for col in X_train.columns if X_train[col].value_counts(dropna=False, normalize=True).values[0] > 0.96]
big_top_value_cols_X_test = [col for col in X_test.columns if X_test[col].value_counts(dropna=False, normalize=True).values[0] > 0.96]

cols_to_drop = list(set(high_missing_cols + high_missing_cols_X_test + big_top_value_cols + big_top_value_cols_X_test ))
len(cols_to_drop)
print(cols_to_drop)


X_train = X_train.drop(cols_to_drop, axis=1)
X_test = X_test.drop(cols_to_drop, axis=1)


X_train.drop('TransactionDT', axis=1, inplace=True)
X_test.drop('TransactionDT', axis=1, inplace=True)

print(X_train.shape)
print(X_test.shape)

# Label Encoding
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values)) 
        
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

['id_26', 'V305', 'V108', 'V113', 'V111', 'V118', 'V122', 'id_23', 'V123', 'V300', 'id_08', 'V115', 'V110', 'id_22', 'V107', 'V116', 'V311', 'V286', 'id_21', 'C3', 'V109', 'V117', 'id_27', 'V112', 'V120', 'id_07', 'V119', 'V121', 'id_25', 'id_24', 'V301', 'V114']
(377945, 400)
(118108, 400)


In [8]:
# from imblearn.over_sampling import KMeansSMOTE

# print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
# print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

# sm = KMeansSMOTE(random_state=99, sampling_strategy = 0.15,  k_neighbors = 10,cluster_balance_threshold = 0.02, n_jobs=4)
# X_train_new, y_train_new = sm.fit_resample(X_train, y_train.ravel())

# X_train_new = pd.DataFrame(X_train_new)
# X_train_new.columns = X_train.columns
# y_train_new = pd.DataFrame(y_train_new)

# print('After OverSampling, the shape of X_train_new: {}'.format(X_train_new.shape))
# print('After OverSampling, the shape of y_train_new: {} \n'.format(y_train_new.shape))

# print("After OverSampling, counts of label '1': {}".format(sum(y_train==1)))
# print("After OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

In [9]:
# from sklearn.model_selection import StratifiedKFold
# import xgboost as xgb
# import numpy as np
# import gc

# EPOCHS = 4
# kf = StratifiedKFold(n_splits=EPOCHS, random_state=99, shuffle=True)

# y_preds = np.zeros(X_test.shape[0])
# y_oof = np.zeros(X_train_new.shape[0])

# for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_new, y_train_new)):
#     print(f"\n🚀 Fold {fold + 1}/{EPOCHS}")
    
#     X_tr, X_val = X_train_new.iloc[tr_idx], X_train_new.iloc[val_idx]
#     y_tr, y_val = y_train_new.iloc[tr_idx], y_train_new.iloc[val_idx]

#     model = xgb.XGBClassifier(
#         n_estimators=500,
#         max_depth=17,
#         learning_rate=0.03,
#         subsample=0.9,
#         colsample_bytree=0.9,
#         tree_method='hist',
#         use_label_encoder=False,
#         eval_metric='auc',
#         missing=-999,
#         random_state=42
#     )

#     model.fit(X_tr, y_tr)
#     val_preds = model.predict_proba(X_val)[:, 1]
#     test_preds = model.predict_proba(X_test)[:, 1]

#     y_oof[val_idx] = val_preds
#     y_preds += test_preds / EPOCHS

#     evaluate_model(
#         model,
#         X_val,
#         y_val,
#         X_test,
#         y_test,
#         threshold=0.5,
#         model_name="XGBoost",
#         is_lightgbm=False
#     )

# print("\n📊 Final OOF Evaluation:")
# evaluate_model(
#     model,
#     X_val,
#     y_val,
#     X_test,
#     y_test,
#     threshold=0.5,
#     model_name="XGBoost",
#     is_lightgbm=False
# )
# del X_train_new
# gc.collect()


In [10]:
# X_train, X_val, X_test = run_feature_engineering(X_train, X_val, X_test)

In [None]:
model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=17,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method='hist',
    use_label_encoder=False,
    eval_metric='auc',
    missing=-999,
    random_state=42
)

model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:ProductCD: object, card4: object, card6: object, P_emaildomain: object, R_emaildomain: object, M1: object, M2: object, M3: object, M4: object, M5: object, M6: object, M7: object, M8: object, M9: object, id_12: object, id_15: object, id_16: object, id_23: object, id_27: object, id_28: object, id_29: object, id_30: object, id_31: object, id_33: object, id_34: object, id_35: object, id_36: object, id_37: object, id_38: object, DeviceType: object, DeviceInfo: object

In [13]:
# Step 1: Identify high missing columns and big top value columns in X_val
high_missing_cols_X_val = [col for col in X_val.columns if X_val[col].isnull().sum() / X_val.shape[0] > 0.96]
big_top_value_cols_X_val = [col for col in X_val.columns if X_val[col].value_counts(dropna=False, normalize=True).values[0] > 0.96]

# Step 2: Combine columns to drop for X_val (same logic as X_train and X_test)
cols_to_drop_X_val = list(set(high_missing_cols_X_val + big_top_value_cols_X_val))
print(cols_to_drop_X_val)

# Step 3: Drop identified columns from X_val
X_val = X_val.drop(cols_to_drop_X_val, axis=1)

# Step 4: Drop 'TransactionDT' column from X_val
X_val.drop('TransactionDT', axis=1, inplace=True)

# Step 5: Print the shape of X_val after changes
print(X_val.shape)

# Step 6: Label Encoding for categorical columns in X_val (same as X_train and X_test)
from sklearn import preprocessing

for f in X_val.columns:
    if X_val[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_val[f].values))  # Fit only on X_val
        X_val[f] = lbl.transform(list(X_val[f].values))

# Step 7: Fill missing values in X_val (same as X_train and X_test)
X_val = X_val.fillna(-999)

# Print the shape of X_val after all transformations
print(X_val.shape)


['id_26', 'V305', 'V108', 'V113', 'V111', 'V118', 'V122', 'id_23', 'V123', 'V300', 'id_08', 'V115', 'V110', 'id_22', 'V107', 'V116', 'V311', 'V286', 'id_21', 'C3', 'V109', 'V117', 'id_27', 'V112', 'V120', 'id_07', 'V119', 'V121', 'id_25', 'id_24', 'V301', 'V114']
(94487, 400)
(94487, 400)


In [16]:
y_pred = model.predict(X_test)
print("F1 Score: ", f1_score(y_test, y_pred))

F1 Score:  0.7372034024772423


In [None]:
# config = {
#     'drop_low_information_columns': True,
#     'drop_transaction_dt': True,
#     'encode_categorical_columns': True,
#     'fill_missing_values': True,
#     'create_transaction_amount_ratios': False,
#     'group_rare_categories': False,
#     'create_time_features': False,
#     'drop_unused_columns': True,
#     'log_transform_transaction_amt': False
# }


# X_train_processed = run_feature_engineering_single_df(X_train.copy(), config)
# X_val_processed = run_feature_engineering_single_df(X_val.copy(), config)

# f1 = evaluate_model(X_train_processed, X_val_processed, model, y_train, y_val)
# print(f"✅ F1-score: {f1}")

🚧 Starting feature engineering pipeline...

✅ Low-information columns dropped
✅ TransactionDT dropped
✅ Data cleaned
✅ Categorical columns encoded
✅ Missing values filled
✅ Unused columns dropped
🎯 Final shape: (377945, 397)
🚧 Starting feature engineering pipeline...

✅ Low-information columns dropped
✅ TransactionDT dropped
✅ Data cleaned
✅ Categorical columns encoded
✅ Missing values filled
✅ Unused columns dropped
🎯 Final shape: (94487, 397)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ F1-score: 0.6727236816116927


In [None]:
best_config = search_best_config(X_train, X_val, model, y_train, y_val)

🔍 Evaluating config: {'create_transaction_amount_ratios': False, 'clean_data': True, 'group_rare_categories': False, 'encode_categorical_columns': True, 'fill_missing_values': True, 'create_time_features': False, 'drop_unused_columns': False, 'log_transform_transaction_amt': True}
🚧 Starting feature engineering pipeline...

✅ Low-information columns dropped
✅ TransactionDT dropped
✅ Data cleaned
✅ Categorical columns encoded
✅ Missing values filled
✅ Log transformation applied to TransactionAmt
🎯 Final shape: (377945, 401)


  X['TransactionAmt_log'] = np.log1p(X['TransactionAmt'])


🚧 Starting feature engineering pipeline...

✅ Low-information columns dropped
✅ TransactionDT dropped
