In [1]:
import numpy as np 
import pandas as pd 
import sys
import os
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
import matplotlib.pylab as plt
import warnings
from scipy.stats import skew, kurtosis
from datetime import datetime
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=10,20

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE
from Utils import training_models as TM
from tqdm import tqdm 
import json

  from .autonotebook import tqdm as notebook_tqdm


# Importing Data

In [2]:
# 2. Load the data
train_df = pd.read_csv('../Original_Data/train_2025.csv') 
test_df = pd.read_csv('../Original_Data/test_2025.csv')

train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)


# Preprocessing Data. Training and Testing Data Needs To Be Fully Numerical Before Proceeding.

In [3]:
updated_train_df = train_df
updated_test_df = test_df

high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month']
updated_train_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True, errors='ignore')
updated_test_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True, errors='ignore')

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = FE.fit_regular_transformer(updated_train_df, '_count')

# Step 2: Transform training set itself
X_train_regular = FE.transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform test set (call the same function on test_df)
X_test_regular = FE.transform_regular_set(updated_test_df, onehot, scaler, cat_cols, num_cols)


# Importing Model Parameters

In [43]:
models_list = ['lgb', 'xgb', 'cat', 'hgb']
output_dir = f"../Records/{'_'.join(models_list)}_temp"


lgb_params, lgb_threshold = TM.read_settings('../Records/lgb_36096/param_lgb_temp.json')
xgb_params, xgb_threshold = TM.read_settings('../Records/xgb_36846/param_xgb_temp.json')
cat_params, cat_threshold = TM.read_settings('../Records/cat_37242/param_cat_temp.json')
hgb_params, hgb_threshold = TM.read_settings('../Records/hgb_temp/param_hgb_temp.json')

param_dict = {
    'lgb': {
        'params': lgb_params,
        'threshold': lgb_threshold
    },
    'xgb': {
        'params': xgb_params,
        'threshold': xgb_threshold
    },
    'cat': {
        'params': cat_params,
        'threshold': cat_threshold
    },
    'hgb': {
        'params': hgb_params,
        'threshold': hgb_threshold
    }
}

In [16]:
def run_stacking_oof_predictions(X, y, params, train_model_fn, kfoldcv=5, drop=[], predict_fn=None, seed=None, verbose = False):
    skf = StratifiedKFold(n_splits=kfoldcv, shuffle=True, random_state=seed)

    oof_preds = np.zeros(len(X))
    oof_thresholds = np.zeros(kfoldcv)
    model_list = []
    f1_scores = []

    i = 1
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train = X.iloc[train_idx].drop(columns=drop, errors='ignore')
        X_val = X.iloc[val_idx].drop(columns=drop, errors='ignore')
        y_train = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        _, model, stats = train_model_fn(X_train, y_train, X_val, y_val, params)
        model_list.append(model)
        f1_scores.append(stats['f1'])
        oof_thresholds[fold_idx] = stats['threshold']

        # Predict on validation fold
        probs_val = predict_fn(model, X_val)
        oof_preds[val_idx] = probs_val  # assign in correct positions
        
        if verbose:
            pct_progress = (i / kfoldcv)*100
            sys.stdout.write(f'\rCurrently at {pct_progress: .2f}%      ')
            sys.stdout.flush()
        i += 1

    return oof_preds, model_list, f1_scores, oof_thresholds


def collect_oof_preds(models_list, param_dict, kfoldcv = 10, seed = None, verbose = False):
    
    oof_preds_list = []
    
    for model_name in models_list:
        if verbose:
            print(f'\nWorking on Model {model_name}')
            
        train_model_fn = getattr(TM, f"train_{model_name}", None)
        predict_fn = getattr(TM, f"predict_{model_name}", None)
        params = param_dict[model_name]['params']
        model_oof_preds , _, _, _ = run_stacking_oof_predictions(X=X_train_regular,
                                                                 y=target, 
                                                                 params=params,
                                                                 train_model_fn=train_model_fn, 
                                                                 kfoldcv=kfoldcv,
                                                                 predict_fn=predict_fn,
                                                                 seed = seed,
                                                                 verbose=verbose)
        oof_preds_list.append(model_oof_preds)
        
    oof_df = pd.DataFrame(oof_preds_list).T
    oof_df.columns = models_list
    return oof_df



In [13]:
oof_predictions = collect_oof_preds(models_list=models_list, 
                                    param_dict=param_dict, 
                                    kfoldcv=10,
                                    verbose=True)


Working on Model lgb
Currently at  100.00%      
Working on Model xgb
Currently at  100.00%      
Working on Model cat
Currently at  100.00%      
Working on Model hgb
Currently at  100.00%      

# Get Base Model Predictions (Probability)

In [None]:
models_list = ['lgb', 'xgb', 'cat', 'hgb']
avg_probs_list = []
for model_name in models_list:
    print(f'Currently working on model {model_name}')
    train_model_fn = getattr(TM, f"train_{model_name}", None)
    params_trial_fn= getattr(TM, f"sample_{model_name}_hyperparams", None)
    predict_fn = getattr(TM, f"predict_{model_name}", None)
    
    _, avg_probs, _ = TM.run_cv_evaluation_single_model(X=X_train_regular, 
                                  y=target, 
                                  params=param_dict[model_name]['params'], 
                                  train_model_fn=train_model_fn, 
                                  kfoldcv=20,
                                  test_df=X_test_regular,
                                  predict_fn=predict_fn,
                                  seed=42)
    avg_probs_list.append(avg_probs)
test_preds = pd.DataFrame(avg_probs_list).T
test_preds.columns = models_list

Currently working on model lgb
Currently working on model xgb
Currently working on model cat
Currently working on model hgb


# Meta-Model Selection

In [None]:
models_list = ['lgb', 'xgb', 'cat', 'hgb']
##########################################################################
##########################################################################
#################### CHANGE THIS NUMBER TO SWAP MODEL ####################
##########################################################################
##########################################################################
model_name= models_list[2] 

train_model_fn = getattr(TM, f"train_{model_name}", None)
params_trial_fn= getattr(TM, f"sample_{model_name}_hyperparams", None)
predict_fn = getattr(TM, f"predict_{model_name}", None)

# Hyper Parameter Tuning For the Chosen Meta-Model

In [21]:
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(lambda trial: TM.objective_single_model(trial=trial, 
                                       full_train_df=oof_predictions, 
                                       target=target, 
                                       train_model_fn= train_model_fn, 
                                       params_trial_fn = params_trial_fn, 
                                       kfoldcv= 5),
                n_trials=10)

best_threshold = study.best_trial.user_attrs['cv_results'].mean(axis = 0)['threshold']
best_params = study.best_params

[I 2025-05-11 00:20:39,148] A new study created in memory with name: no-name-92b31dae-f96a-4be1-a2a6-2370e05c1adf
[I 2025-05-11 00:20:46,549] Trial 0 finished with value: 0.3820381833450387 and parameters: {'learning_rate': 0.04253553883355031, 'depth': 8, 'l2_leaf_reg': 0.0013706806267854523, 'colsample_bylevel': 0.7927886563536175}. Best is trial 0 with value: 0.3820381833450387.
[I 2025-05-11 00:20:54,716] Trial 1 finished with value: 0.37946472285366595 and parameters: {'learning_rate': 0.0717159689556057, 'depth': 9, 'l2_leaf_reg': 0.7825246626456087, 'colsample_bylevel': 0.9498430932402357}. Best is trial 0 with value: 0.3820381833450387.
[I 2025-05-11 00:21:01,261] Trial 2 finished with value: 0.37862301102904683 and parameters: {'learning_rate': 0.04581341668439562, 'depth': 6, 'l2_leaf_reg': 0.0026597558800065934, 'colsample_bylevel': 0.9867212953835425}. Best is trial 0 with value: 0.3820381833450387.
[I 2025-05-11 00:21:19,511] Trial 3 finished with value: 0.3764921417887418

# Test Set Prediction Using K-Fold CV

In [None]:
cv_result, avg_probs, models_list = TM.run_cv_evaluation_single_model(X=oof_predictions, 
                                  y=target, 
                                  params=best_params, 
                                  train_model_fn=train_model_fn, 
                                  kfoldcv=20,
                                  test_df=test_preds,
                                  predict_fn=predict_fn,
                                  seed=42)

# Save Datasets, Settings, Test Predictions to output_dir

In [44]:
TM.save_settings(train_df=oof_predictions, 
              test_df=test_preds, 
              test_id=test_id,
              test_pred=avg_probs, 
              best_params=best_params, 
              threshold_for_f1=best_threshold, 
              output_dir=output_dir, 
              model_name=model_name)
print(f'Output Directory is at {output_dir}')

Output Directory is at ../Records/lgb_xgb_cat_hgb_temp
