In [1]:
import numpy as np 
import pandas as pd 
import sys
import os
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
import matplotlib.pylab as plt
import warnings
from scipy.stats import skew, kurtosis
from datetime import datetime
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=10,20

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE
from Utils import training_models as TM
from tqdm import tqdm 
import json

  from .autonotebook import tqdm as notebook_tqdm


# Importing Data

In [2]:
# 2. Load the data
train_df = pd.read_csv('../Original_Data/train_2025.csv') 
test_df = pd.read_csv('../Original_Data/test_2025.csv')

train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)


# Preprocessing Data. Training and Testing Data Needs To Be Fully Numerical Before Proceeding.

In [9]:
updated_train_df = train_df
updated_test_df = test_df

high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month']
updated_train_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True, errors='ignore')
updated_test_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True, errors='ignore')

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = FE.fit_regular_transformer(updated_train_df, '_count')

# Step 2: Transform training set itself
X_train_regular = FE.transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform test set (call the same function on test_df)
X_test_regular = FE.transform_regular_set(updated_test_df, onehot, scaler, cat_cols, num_cols)


# Model Selection

In [10]:
models_list = ['lgb', 'xgb', 'cat', 'hgb']

##########################################################################
##########################################################################
#################### CHANGE THIS NUMBER TO SWAP MODEL ####################
##########################################################################
##########################################################################
model_name= models_list[0] 
output_dir = f'../Records/{model_name}_temp'

train_model_fn = getattr(TM, f"train_{model_name}", None)
params_trial_fn= getattr(TM, f"sample_{model_name}_hyperparams", None)
predict_fn = getattr(TM, f"predict_{model_name}", None)

# Hyper Parameter Tuning For the Chosen Model

In [11]:
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(lambda trial: TM.objective_single_model(trial=trial, 
                                       full_train_df=X_train_regular, 
                                       target=target, 
                                       train_model_fn= train_model_fn, 
                                       params_trial_fn = params_trial_fn, 
                                       kfoldcv= 5),
                n_trials=5)

best_threshold = study.best_trial.user_attrs['cv_results'].mean(axis = 0)['threshold']
best_params = study.best_params

[I 2025-05-14 17:37:53,473] A new study created in memory with name: no-name-799936d2-e73a-49bb-80d2-f2b81d17a83d
[I 2025-05-14 17:38:05,663] Trial 0 finished with value: 0.3632242734518843 and parameters: {'num_leaves': 64, 'feature_fraction': 0.4967467731585185, 'bagging_fraction': 0.7955674268308459, 'bagging_freq': 10, 'learning_rate': 0.06852914839727015, 'lambda_l1': 0.01651103615644541, 'lambda_l2': 0.008299320099156564, 'scale_pos_weight': 1.0}. Best is trial 0 with value: 0.3632242734518843.
[I 2025-05-14 17:38:19,051] Trial 1 finished with value: 0.3707006593664035 and parameters: {'num_leaves': 51, 'feature_fraction': 0.7134694147637359, 'bagging_fraction': 0.771307475747486, 'bagging_freq': 4, 'learning_rate': 0.09464607199085254, 'lambda_l1': 7.593000993722761, 'lambda_l2': 14.222373065655479, 'scale_pos_weight': 1.0}. Best is trial 1 with value: 0.3707006593664035.
[I 2025-05-14 17:38:33,271] Trial 2 finished with value: 0.35075535419723913 and parameters: {'num_leaves': 

# Test Set Prediction Using K-Fold CV

In [12]:
cv_result, avg_probs, models_list = TM.run_cv_evaluation_single_model(X=X_train_regular, 
                                  y=target, 
                                  params=best_params, 
                                  train_model_fn=train_model_fn, 
                                  kfoldcv=20,
                                  test_df=X_test_regular,
                                  predict_fn=predict_fn,
                                  seed=42)

In [20]:
import pandas as pd
import matplotlib.pyplot as plt

def get_lgb_feature_importance(models, feature_names=None, plot=False, top_n=20):
    """
    Aggregates feature importance from a list of LightGBM models.

    Args:
        models (list): List of trained lightgbm.Booster objects.
        feature_names (list, optional): List of feature names. Required if not stored in model.
        plot (bool): Whether to plot the top_n most important features.
        top_n (int): Number of top features to display in the plot.

    Returns:
        pd.DataFrame: DataFrame with feature importances (mean and std across models).
    """
    all_importances = []

    for model in models:
        imp = model.feature_importance(importance_type='gain')
        if feature_names is None:
            feature_names = model.feature_name()
        all_importances.append(pd.Series(imp, index=feature_names))

    # Combine into DataFrame
    imp_df = pd.concat(all_importances, axis=1)
    imp_df.columns = [f'model_{i}' for i in range(len(models))]
    imp_df['mean_gain'] = imp_df.mean(axis=1)
    imp_df['std_gain'] = imp_df.std(axis=1)
    imp_df = imp_df.sort_values('mean_gain', ascending=False).reset_index().rename(columns={'index': 'feature'})

    if plot:
        top_features = imp_df.head(top_n)
        plt.figure(figsize=(10, 6))
        plt.barh(top_features['feature'][::-1], top_features['mean_gain'][::-1])
        plt.xlabel('Mean Gain Importance')
        plt.title(f'Top {top_n} Feature Importances (LightGBM)')
        plt.tight_layout()
        plt.show()

    return imp_df




In [27]:
importance_df = get_lgb_feature_importance(models_list)
sub_imp_df = importance_df[['feature', 'mean_gain', 'std_gain']]

In [31]:
sub_imp_df[sub_imp_df['feature'] == 'lat']

Unnamed: 0,feature,mean_gain,std_gain
12,lat,312.399395,169.313432


# Save Datasets, Settings, Test Predictions to output_dir

In [7]:
TM.save_settings(train_df=X_train_regular, 
              test_df=X_test_regular, 
              test_id=test_id,
              test_pred=avg_probs, 
              best_params=best_params, 
              threshold_for_f1=best_threshold, 
              output_dir=output_dir, 
              model_name=model_name)
print(f'Output Directory is at {output_dir}')

Output Directory is at ../Records/xgb_temp
