In [1]:
import numpy as np 
import pandas as pd 
import sys
import os
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
import matplotlib.pylab as plt
import warnings
from scipy.stats import skew, kurtosis
from datetime import datetime
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=10,20

# Add the parent directory ('../' relative to the working directory) to sys.path so the local Utils package can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE
from Utils import training_models as TM
from tqdm import tqdm 
import json

  from .autonotebook import tqdm as notebook_tqdm


# Importing Data

In [2]:
# Read the raw train/test CSVs and pass each straight through the shared
# feature-engineering helper.
train_df = FE.add_features(pd.read_csv('../Original_Data/train_2025.csv'))
test_df = FE.add_features(pd.read_csv('../Original_Data/test_2025.csv'))

# Set aside the claim identifiers and the label before any columns are pruned.
train_id, test_id = train_df['claim_number'], test_df['claim_number']
target = train_df['fraud']

# Columns handed to FE.drop_ignored_columns for both frames (the label
# 'fraud' is included so it does not stay among the features).
ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df, test_df = [
    FE.drop_ignored_columns(frame, ignore_var) for frame in (train_df, test_df)
]


# Preprocessing Data. Training and Testing Data Needs To Be Fully Numerical Before Proceeding.

In [3]:
# Calendar-derived columns removed before encoding (the original author
# labels them high-dimensional categoricals).
high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month']

# Build NEW frames instead of aliasing train_df/test_df and dropping in place:
# the previous cell's frames stay untouched, so this cell can be re-run or
# edited without silently changing its own inputs.
# errors='ignore' keeps the drop tolerant of columns that are already absent.
updated_train_df = train_df.drop(columns=high_dim_cat_cols_to_drop, errors='ignore')
updated_test_df = test_df.drop(columns=high_dim_cat_cols_to_drop, errors='ignore')

# Step 1: Fit the encoder/scaler on the training data only.
# NOTE(review): the meaning of the '_count' argument is defined inside
# FE.fit_regular_transformer — confirm there.
onehot, scaler, cat_cols, num_cols = FE.fit_regular_transformer(updated_train_df, '_count')

# Step 2: Transform the training set with the fitted objects.
X_train_regular = FE.transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform the test set with the SAME fitted objects (no refitting on test).
X_test_regular = FE.transform_regular_set(updated_test_df, onehot, scaler, cat_cols, num_cols)


# Model Selection

In [4]:
# Short names of the candidate models. Stored under its own name because the
# cross-validation cell further down rebinds `models_list` to a value returned
# by TM.run_cv_evaluation_single_model; if this cell indexed `models_list`,
# re-running it after that cell would pick up the wrong object.
supported_models = ['lgb', 'xgb', 'cat', 'hgb']

##########################################################################
##########################################################################
#################### CHANGE THIS NUMBER TO SWAP MODEL ####################
##########################################################################
##########################################################################
model_name = supported_models[2]
output_dir = f'../Records/{model_name}_temp'

# Look up the per-model helpers in Utils.training_models by naming convention.
train_model_fn = getattr(TM, f"train_{model_name}", None)
params_trial_fn = getattr(TM, f"sample_{model_name}_hyperparams", None)
# predict_fn is left as-is when missing (None) — NOTE(review): confirm whether
# TM.run_cv_evaluation_single_model treats it as optional.
predict_fn = getattr(TM, f"predict_{model_name}", None)

# Fail here, with a readable message, rather than deep inside the Optuna run
# when a None is handed over in place of a training/sampling helper.
missing_helpers = [
    helper_name
    for helper_name, helper in (
        (f"train_{model_name}", train_model_fn),
        (f"sample_{model_name}_hyperparams", params_trial_fn),
    )
    if helper is None
]
if missing_helpers:
    raise AttributeError(
        f"Utils.training_models does not define {missing_helpers} "
        f"required for model '{model_name}'"
    )

# Hyperparameter Tuning for the Chosen Model

In [5]:
# Seed the sampler so that re-running the notebook proposes the same sequence
# of hyperparameters (TPE is Optuna's default single-objective sampler; only
# the seed is new). The objective itself must also be deterministic for the
# scores to repeat exactly.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)

# Each trial scores one hyperparameter draw for the selected model via
# TM.objective_single_model, called with kfoldcv=5.
study.optimize(
    lambda trial: TM.objective_single_model(
        trial=trial,
        full_train_df=X_train_regular,
        target=target,
        train_model_fn=train_model_fn,
        params_trial_fn=params_trial_fn,
        kfoldcv=5,
    ),
    n_trials=5,
)

# The best trial carries its CV results in user_attrs['cv_results']; the
# decision threshold used later is the mean of its 'threshold' entries.
# NOTE(review): assumes 'cv_results' is a DataFrame-like object with one row
# per fold and a 'threshold' column — confirm in TM.objective_single_model.
best_threshold = study.best_trial.user_attrs['cv_results'].mean(axis=0)['threshold']
best_params = study.best_params

[I 2025-05-10 22:31:33,671] A new study created in memory with name: no-name-4e6a4cfc-78ef-4e45-8d79-94005fcbb32c
[I 2025-05-10 22:31:39,225] Trial 0 finished with value: 0.37475310906110126 and parameters: {'learning_rate': 0.06622381059946587, 'depth': 5, 'l2_leaf_reg': 0.0020135036948467724, 'colsample_bylevel': 0.31418920975214865}. Best is trial 0 with value: 0.37475310906110126.
[I 2025-05-10 22:31:51,514] Trial 1 finished with value: 0.364504054418293 and parameters: {'learning_rate': 0.08400979237453378, 'depth': 9, 'l2_leaf_reg': 5.3495874396548535, 'colsample_bylevel': 0.8351323783187699}. Best is trial 0 with value: 0.37475310906110126.
[I 2025-05-10 22:32:08,489] Trial 2 finished with value: 0.36839957027238557 and parameters: {'learning_rate': 0.011770087501584061, 'depth': 7, 'l2_leaf_reg': 0.0016724436807039214, 'colsample_bylevel': 0.690652704620185}. Best is trial 0 with value: 0.37475310906110126.
[I 2025-05-10 22:32:14,430] Trial 3 finished with value: 0.374641223939

# Test Set Prediction Using K-Fold CV

In [6]:
# Arguments for the final cross-validated run: the tuned hyperparameters,
# 20 folds, a fixed seed, and the test matrix plus predictor function.
cv_run_kwargs = {
    'X': X_train_regular,
    'y': target,
    'params': best_params,
    'train_model_fn': train_model_fn,
    'kfoldcv': 20,
    'test_df': X_test_regular,
    'predict_fn': predict_fn,
    'seed': 42,
}

# NOTE: this rebinds `models_list` to the third value returned by the helper.
cv_result, avg_probs, models_list = TM.run_cv_evaluation_single_model(**cv_run_kwargs)

# Save Datasets, Settings, Test Predictions to output_dir

In [None]:
# Hand the processed matrices, test ids/predictions, tuned parameters and the
# F1 threshold to TM.save_settings, targeting output_dir.
TM.save_settings(
    model_name=model_name,
    output_dir=output_dir,
    best_params=best_params,
    threshold_for_f1=best_threshold,
    train_df=X_train_regular,
    test_df=X_test_regular,
    test_id=test_id,
    test_pred=avg_probs,
)
print(f'Output Directory is at {output_dir}')