In [12]:
import numpy as np 
import pandas as pd 
import sys
import os
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
import matplotlib.pylab as plt
import warnings
from scipy.stats import skew, kurtosis
from datetime import datetime
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=10,20

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE
from tqdm import tqdm 

In [2]:
# 2. Load the data
train_df = pd.read_csv('../Original_Data/train_2025.csv') 
test_df = pd.read_csv('../Original_Data/test_2025.csv')

train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)


In [3]:
def compute_row_statistics(df, prefix='row'):
    """
    Compute row-wise statistics: mean, std, skewness, kurtosis.

    Parameters:
    ----------
    df : pd.DataFrame
        Input DataFrame of features (numeric).
    prefix : str, optional
        Prefix to add to the new column names (default: 'row').

    Returns:
    -------
    pd.DataFrame
        DataFrame with 4 new columns: mean, std, skewness, kurtosis.
    """
    row_mean = df.mean(axis=1)
    row_std = df.std(axis=1)
    row_skew = df.apply(skew, axis=1)
    row_kurt = df.apply(kurtosis, axis=1)

    stats_df = pd.DataFrame({
        f'{prefix}_mean': row_mean,
        f'{prefix}_std': row_std,
        f'{prefix}_skew': row_skew,
        f'{prefix}_kurt': row_kurt,
    }).reset_index(drop=True)
    
    return stats_df

def drop_columns_by_missing_ratio(df, max_missing_ratio=0.3):
    """
    Drops columns where the percentage of NA entries exceeds the given threshold.

    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame to filter.
    max_missing_ratio : float
        Maximum fraction of missing values allowed per column (0 to 1).

    Returns:
    --------
    pd.DataFrame
        The cleaned DataFrame with fewer columns.
    """
    missing_ratio = df.isna().mean()
    return df.loc[:, missing_ratio <= max_missing_ratio]


In [21]:
def train_and_predict_legit_model_ensemble(X_train, y_train, X_test, clean_params, n_runs=5, seed_base=42, verbose_eval=100):
    """
    Trains multiple models using different seeds and returns averaged predictions.

    Parameters:
    ----------
    X_train : pd.DataFrame
        Training feature set.

    y_train : pd.Series
        Training labels.

    X_test : pd.DataFrame
        Test set to make predictions on.

    clean_params : dict
        XGBoost parameters (already cleaned).

    n_runs : int
        Number of different seeds/models to average.

    seed_base : int
        Starting seed; seeds will be seed_base + i.

    verbose_eval : int or bool
        Verbosity of training (default=100).

    Returns:
    -------
    np.array
        Averaged predicted probabilities for the X_test set.
    """
    all_preds = []

    for i in range(n_runs):
        run_seed = seed_base + i
        params = clean_params.copy()
        params['seed'] = run_seed

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dtrain, 'train')],
            early_stopping_rounds=50,
            verbose_eval=verbose_eval
        )

        preds = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
        all_preds.append(preds)

    avg_preds = np.mean(all_preds, axis=0)
    return avg_preds


In [4]:
level = 10
presence_info_df = pd.read_csv(f'logs/subset_info_{level}.csv')



def process_presence_features(train_df, level, presence_info_df=presence_info_df):

    presence_info_df_min_var = presence_info_df[presence_info_df['variance'] > 0.1]

    cols_diff_5_6_pos = presence_info_df_min_var[(presence_info_df_min_var['difference'] >= 0.05) & (presence_info_df_min_var['difference'] < 0.06)]
    cols_diff_6_7_pos = presence_info_df_min_var[(presence_info_df_min_var['difference'] >= 0.06) & (presence_info_df_min_var['difference'] < 0.07)]
    cols_diff_7_8_pos = presence_info_df_min_var[(presence_info_df_min_var['difference'] >= 0.07) & (presence_info_df_min_var['difference'] < 0.08)]
    cols_diff_5_6_neg = presence_info_df_min_var[(presence_info_df_min_var['difference'] <= -0.05) & (presence_info_df_min_var['difference'] > -0.06)]
    cols_diff_6_7_neg = presence_info_df_min_var[(presence_info_df_min_var['difference'] <= -0.06) & (presence_info_df_min_var['difference'] > -0.07)]
    cols_diff_7_8_meg = presence_info_df_min_var[(presence_info_df_min_var['difference'] <= -0.07) & (presence_info_df_min_var['difference'] > -0.08)]
    cols_diff_8_abv = presence_info_df_min_var[(np.abs(presence_info_df_min_var['difference']) >= 0.08)]

    train_df_56_pos = FE.add_presence_columns(train_df, cols_diff_5_6_pos, level, new_features_only=True)
    train_df_67_pos = FE.add_presence_columns(train_df, cols_diff_6_7_pos, level, new_features_only=True)
    train_df_78_pos = FE.add_presence_columns(train_df, cols_diff_7_8_pos, level, new_features_only=True)
    train_df_56_neg = FE.add_presence_columns(train_df, cols_diff_5_6_neg, level, new_features_only=True)
    train_df_67_neg = FE.add_presence_columns(train_df, cols_diff_6_7_neg, level, new_features_only=True)
    train_df_78_neg = FE.add_presence_columns(train_df, cols_diff_7_8_meg, level, new_features_only=True)
    train_df_8_abv = FE.add_presence_columns(train_df, cols_diff_8_abv, level, new_features_only=True)

    pos_56_stats = compute_row_statistics(train_df_56_pos, 'pos_56')
    pos_67_stats = compute_row_statistics(train_df_67_pos, 'pos_67')
    pos_78_stats = compute_row_statistics(train_df_78_pos, 'pos_78')
    neg_56_stats = compute_row_statistics(train_df_56_neg, 'neg_56')
    neg_67_stats = compute_row_statistics(train_df_67_neg, 'neg_67')
    neg_78_stats = compute_row_statistics(train_df_78_neg, 'neg_78')
    abs_8abv_stats = compute_row_statistics(train_df_8_abv, 'abs_8abv')

    summary_stats_full = pd.concat([pos_56_stats, pos_67_stats, pos_78_stats, neg_56_stats, neg_67_stats, neg_78_stats, abs_8abv_stats], axis= 1)
    presence_features_full = pd.concat([summary_stats_full, train_df_8_abv], axis = 1)
    
    return(presence_features_full)

In [5]:
presence_features_full_train = process_presence_features(train_df, level)
presence_features_full_test = process_presence_features(test_df, level)

In [6]:
missing_ratio = presence_features_full_train.isna().mean()
high_na_stats_col = presence_features_full_train.loc[:, missing_ratio > 0.1].columns.tolist()
count_cols = [col for col in presence_features_full_train.columns if col.endswith('_count')]

presence_features_full_train_final = presence_features_full_train.drop(columns = count_cols + high_na_stats_col)
presence_features_full_test_final = presence_features_full_test.drop(columns = count_cols + high_na_stats_col)

In [14]:
updated_train_df = train_df
updated_test_df = test_df

present_cols = [col for col in updated_train_df.columns if col.endswith('_count')]
# print(updated_train_df[non_present_cols].info())


high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month']
updated_train_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True, errors='ignore')
updated_test_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True, errors='ignore')

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = FE.fit_regular_transformer(updated_train_df, '_count')

# Step 2: Transform training set itself
X_train_regular = FE.transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform test set (call the same function on test_df)
X_test_regular = FE.transform_regular_set(updated_test_df, onehot, scaler, cat_cols, num_cols)


In [16]:
def objective(trial, full_train_df, target, kfoldcv=5, drop=[]):
    
    # Define hyperparameter space
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',  # or 'gpu_hist' if using GPU
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
        'eta': trial.suggest_float('learning_rate', 0.005, 0.1),
        'lambda': trial.suggest_float('lambda', 0.01, 25.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.01, 20.0, log=True),
        'seed': 69,
        'verbosity': 0  # quiet mode
    }

    skf = StratifiedKFold(n_splits=kfoldcv, shuffle=True, random_state=42)
    best_thresholds = []
    f1_scores = []

    for train_idx, val_idx in skf.split(full_train_df, target):
        X_train = full_train_df.iloc[train_idx].drop(columns=drop, errors='ignore')
        X_val = full_train_df.iloc[val_idx].drop(columns=drop, errors='ignore')
        y_train = target.iloc[train_idx]
        y_val = target.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False  # silent logs
        )

        # Predict + threshold tuning
        probs = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        thresholds = np.linspace(0.1, 0.9, 200)
        f1s = [f1_score(y_val, probs > t) for t in thresholds]

        best_f1 = max(f1s)
        best_threshold = thresholds[np.argmax(f1s)]

        f1_scores.append(best_f1)
        best_thresholds.append(best_threshold)

    mean_f1 = np.mean(f1_scores)
    mean_threshold = np.mean(best_thresholds)

    trial.set_user_attr('mean_threshold', mean_threshold)
    trial.set_user_attr('f1_per_fold', f1_scores)

    return mean_f1

In [17]:
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(lambda trial: objective(trial, X_train_regular, target, kfoldcv= 5),
                n_trials=350)



[I 2025-05-10 00:14:51,039] A new study created in memory with name: no-name-59c49123-854c-4614-92ed-c9f6b07bfe96
[I 2025-05-10 00:14:59,076] Trial 0 finished with value: 0.36329954542487225 and parameters: {'max_depth': 7, 'min_child_weight': 0.4342184624885491, 'subsample': 0.4868788584472137, 'colsample_bytree': 0.5416959278456943, 'learning_rate': 0.015256268423659763, 'lambda': 0.11623334589146023, 'alpha': 0.23038348370563538}. Best is trial 0 with value: 0.36329954542487225.
[I 2025-05-10 00:15:05,311] Trial 1 finished with value: 0.36370536863259834 and parameters: {'max_depth': 12, 'min_child_weight': 0.004464721272941384, 'subsample': 0.8562912703010037, 'colsample_bytree': 0.6779598443911453, 'learning_rate': 0.08680959574016894, 'lambda': 0.14018549376349249, 'alpha': 9.908358201284033}. Best is trial 1 with value: 0.36370536863259834.
[I 2025-05-10 00:15:09,621] Trial 2 finished with value: 0.37540359365569 and parameters: {'max_depth': 2, 'min_child_weight': 0.08847588469

In [19]:
best_threshold = study.best_trial.user_attrs['mean_threshold']
best_params = study.best_params
best_params.update({'mean_threshold': float(best_threshold)})

In [26]:
# Split train into final train and validation sets
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train_regular, target, test_size=0.05, stratify=target, random_state=42
)

# Clean best_params: keep only what XGBoost expects
xgb_param_keys = [
    'objective', 'eval_metric', 'tree_method', 'max_depth', 'min_child_weight',
    'subsample', 'colsample_bytree', 'eta', 'lambda', 'alpha', 'seed', 'verbosity'
]
clean_params = {k: best_params[k] for k in xgb_param_keys if k in best_params}

# Set required XGBoost parameters
clean_params.update({
    'objective': 'binary:logistic',
    'eval_metric': ['auc'],  # or just 'auc'
    'tree_method': 'gpu_hist',  # or 'gpu_hist' if using GPU
    'seed': 42,
    'verbosity': 0  # silent mode
})

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train_final, label=y_train_final)
dval = xgb.DMatrix(X_val_final, label=y_val_final)
dtest = xgb.DMatrix(X_test_regular)

# Train with early stopping
final_model = xgb.train(
    clean_params,
    dtrain,
    num_boost_round=5000,
    evals=[(dval, 'validation')],
    early_stopping_rounds=100,
    verbose_eval=100  # adjust or set to False for silence
)

# Predict on test set (probabilities)
probs_test = final_model.predict(dtest, iteration_range=(0, final_model.best_iteration + 1))


[0]	validation-auc:0.56941
[100]	validation-auc:0.74257
[200]	validation-auc:0.73939
[261]	validation-auc:0.73695


In [27]:
timestamp = datetime.now().strftime('%m%d_%H%M')

final_preds = (probs_test > best_threshold).astype(int)

submission = pd.DataFrame({
    'claim_number': test_id,
    'fraud': final_preds
})

submission.to_csv(f'../Records/xgb_temp/submission.csv', index=False)

In [None]:

xgb_param_keys = [
    'objective', 'eval_metric', 'tree_method', 'max_depth', 'min_child_weight',
    'subsample', 'colsample_bytree', 'eta', 'lambda', 'alpha', 'seed', 'verbosity'
]
clean_params = {k: best_params[k] for k in xgb_param_keys if k in best_params}
clean_params.update({
    'objective': 'binary:logistic',
    'eval_metric': ['auc'],
    'tree_method': 'gpu_hist',
    'verbosity': 0
})

from sklearn.model_selection import StratifiedKFold, train_test_split

# 1️⃣ Set up KFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, target)):
    print(f"\n===== Fold {fold + 1} =====")
    
    # 🔹 Full indices
    full_X = train_df.iloc[train_idx]
    full_y = target.iloc[train_idx]
    
    X_val_full = train_df.iloc[val_idx]
    y_val_full = target.iloc[val_idx]

    # 2️⃣ Split: 70% train, 30% (Val1 + Val2)
    X_train_legit, X_temp, y_train_legit, y_temp = train_test_split(
        full_X, full_y, test_size=0.3, stratify=full_y, random_state=42
    )

    # Split X_temp into Val1 and Val2 (each 15%)
    X_val1, X_val2, y_val1, y_val2 = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )

    # 3️⃣ Train legit model and get predictions
    legit_preds_val1 = train_and_predict_legit_model_ensemble(
        X_train_legit, y_train_legit, X_val1, clean_params, n_runs=5
    )

    # 4️⃣ Build features for stacked model
    # ➔ Create DataFrame of predictions
    legit_preds_df = pd.DataFrame({'legit_model_pred': legit_preds_val1}, index=X_val1.index)

    # ➔ Get presence-based + summary features for Val1
    presence_val1 = presence_features_full_train_final.loc[X_val1.index]

    # ➔ Concatenate legit preds + presence features
    X_stack_train = pd.concat([legit_preds_df, presence_val1], axis=1)
    y_stack_train = y_val1

    # ➔ For Val2 (final eval)
    legit_preds_val2 = train_and_predict_legit_model_ensemble(
        X_train_legit, y_train_legit, X_val2, clean_params, n_runs=5
    )
    legit_preds_val2_df = pd.DataFrame({'legit_model_pred': legit_preds_val2}, index=X_val2.index)
    presence_val2 = presence_features_full_train_final.loc[X_val2.index]
    X_stack_val = pd.concat([legit_preds_val2_df, presence_val2], axis=1)
    y_stack_val = y_val2

    # 5️⃣ Train stacked model (use XGBoost, LightGBM, or CatBoost)
    # Example with XGBoost:
    dtrain_stack = xgb.DMatrix(X_stack_train, label=y_stack_train)
    dval_stack = xgb.DMatrix(X_stack_val, label=y_stack_val)

    stacked_model = xgb.train(
        clean_params,  # You may want different params here!
        dtrain_stack,
        num_boost_round=2000,
        evals=[(dval_stack, 'validation')],
        early_stopping_rounds=100,
        verbose_eval=100
    )

    # Optionally: collect metrics, save models, etc.

