In [1]:
import numpy as np 
import pandas as pd 
import sys
import os
from sklearn.model_selection import train_test_split
import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
import warnings
from datetime import datetime
import itertools
from scipy import stats
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=10,20

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 2. Load the data
train_df = pd.read_csv('../Original_Data/train_2025.csv') 
test_df = pd.read_csv('../Original_Data/test_2025.csv')

train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)

presence_info_df_3 = pd.read_csv('logs/subset_info_3.csv')

In [4]:
def add_presence_columns(train_df, presence_info_df, verbose = False):
    """
    For each combo in presence_info_df, create a presence feature on train_df.
    
    Parameters:
    - train_df (pd.DataFrame): The training dataset.
    - presence_info_df (pd.DataFrame): DataFrame containing 'feature' column
      with names like 'feature1__feature2__feature3_present'.
    
    Returns:
    - pd.DataFrame: train_df with new presence columns added.
    """
    df_out = train_df.copy()
    
    for combo_str in presence_info_df['feature']:
        # Extract base combo name (strip trailing '_present')
        if combo_str.endswith('_present'):
            combo_base = combo_str[:-8]
        else:
            combo_base = combo_str
        
        # Split by '__' to get the individual features
        combo_features = combo_base.split('__')
        new_col_name = combo_base + '_present'  # keep consistent
        if verbose:
            print(f"Processing combo: {combo_features}")
        
        # Build tuple of feature values per row
        combo_tuples = train_df[combo_features].apply(tuple, axis=1)
        
        # Count how many times each tuple appears
        counts = combo_tuples.map(combo_tuples.value_counts())
        
        # Presence = appears more than once in the dataset
        df_out[new_col_name] = (counts > 1).astype(int)
    
    return df_out


def fit_presence_pca(train_df, present_cols, n_components=None, scale=True):
    """
    Fits PCA on presence feature columns in the training set.
    
    Parameters:
    - train_df (pd.DataFrame): Training dataset.
    - present_cols (list): List of column names for presence features.
    - n_components (int or None): Number of PCA components. If None, keep all.
    - scale (bool): Whether to standardize columns before PCA.
    
    Returns:
    - pca (PCA object): Fitted PCA object.
    - X_train_pca (np.ndarray): Transformed training set (PCA scores).
    - scaler (StandardScaler object or None): Fitted scaler if used, else None.
    """
    X = train_df[present_cols].values

    # Optionally scale features
    scaler = None
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    # Fit PCA
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X)

    print(f"PCA fitted. Explained variance (first 10 components): {pca.explained_variance_ratio_[:10]}")
    return pca, X_train_pca, scaler


import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def fit_regular_transformer(train_df, presence_suffix='_present'):
    # Identify regular columns
    regular_cols = [col for col in train_df.columns if not col.endswith(presence_suffix)]
    
    # Split regular into categorical and numerical
    categorical_cols = train_df[regular_cols].select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = train_df[regular_cols].select_dtypes(include=['number']).columns.tolist()
    if 'claim_number' in numerical_cols:
        numerical_cols.remove('claim_number')
    
    # Initialize transformers
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = StandardScaler()
    
    # Fit transformers
    onehot.fit(train_df[categorical_cols])
    scaler.fit(train_df[numerical_cols])
    
    # print(f"Fitted on {len(categorical_cols)} categorical and {len(numerical_cols)} numerical columns.")
    
    return onehot, scaler, categorical_cols, numerical_cols

def transform_regular_set(df, onehot, scaler, categorical_cols, numerical_cols):
    # Transform categorical
    cat_transformed = onehot.transform(df[categorical_cols])
    cat_df = pd.DataFrame(cat_transformed, columns=onehot.get_feature_names_out(categorical_cols), index=df.index)
    
    # Transform numerical
    num_transformed = scaler.transform(df[numerical_cols])
    num_df = pd.DataFrame(num_transformed, columns=numerical_cols, index=df.index)
    
    # Combine transformed parts
    transformed_df = pd.concat([num_df, cat_df], axis=1)
    
    # print(f"Transformed set shape: {transformed_df.shape}")
    return transformed_df


In [None]:
def objective(trial, train_df, presence_info_df, target, kfoldcv=5, drop=[]):
    # Dynamic feature selection thresholds
    difference_min = trial.suggest_float('difference_min', 0.03, 0.07)
    info_min = trial.suggest_float('info_min', 0.03, 0.05)
    presence_info_df = presence_info_df[
        (np.abs(presence_info_df['difference']) > difference_min) &
        (presence_info_df['info'] > info_min)
    ]

    trial.set_user_attr('difference_min', difference_min)
    trial.set_user_attr('info_min', info_min)

    # Add presence features
    updated_train_df = add_presence_columns(train_df, presence_info_df)
    present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]

    # Drop problematic high-cardinality categorical columns
    high_dim_cat_cols_to_drop = [
        'claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear',
        'claim_date.month', 'zero_payout'
    ]
    updated_train_df.drop(columns=high_dim_cat_cols_to_drop, inplace=True)

    # Transform regular features
    onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)
    X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

    # Combine with presence features
    full_train_df = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

    # Define hyperparameter space
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',  # or 'gpu_hist' if using GPU
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
        'eta': trial.suggest_float('learning_rate', 0.005, 0.1),
        'lambda': trial.suggest_float('lambda', 1e-3, 25.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 20.0, log=True),
        'seed': 69,
        'verbosity': 0  # quiet mode
    }

    skf = StratifiedKFold(n_splits=kfoldcv, shuffle=True, random_state=42)
    best_thresholds = []
    f1_scores = []

    for train_idx, val_idx in skf.split(full_train_df, target):
        X_train = full_train_df.iloc[train_idx].drop(columns=drop, errors='ignore')
        X_val = full_train_df.iloc[val_idx].drop(columns=drop, errors='ignore')
        y_train = target.iloc[train_idx]
        y_val = target.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False  # silent logs
        )

        # Predict + threshold tuning
        probs = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        thresholds = np.linspace(0.1, 0.9, 50)
        f1s = [f1_score(y_val, probs > t) for t in thresholds]

        best_f1 = max(f1s)
        best_threshold = thresholds[np.argmax(f1s)]

        f1_scores.append(best_f1)
        best_thresholds.append(best_threshold)

    mean_f1 = np.mean(f1_scores)
    mean_threshold = np.mean(best_thresholds)

    trial.set_user_attr('mean_threshold', mean_threshold)
    trial.set_user_attr('f1_per_fold', f1_scores)

    return mean_f1

In [7]:
presence_info_df_3 = pd.read_csv('logs/subset_info_3.csv')
# Run Optuna
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(lambda trial: objective(trial, train_df, presence_info_df_3, target, kfoldcv= 5),
                n_trials=350)


best_f1 = study.best_value

[I 2025-05-06 17:52:31,389] A new study created in memory with name: no-name-974eee69-baed-4995-bdd3-98bacc4f92d8
[I 2025-05-06 17:52:34,777] Trial 0 finished with value: 0.37031090981292497 and parameters: {'difference_min': 0.06017533968132904, 'info_min': 0.035524038592795855, 'max_depth': 5, 'min_child_weight': 2.380603085126726, 'subsample': 0.5118323651604653, 'colsample_bytree': 0.6983327679034905, 'learning_rate': 0.062945190210375, 'lambda': 5.6710389196128395, 'alpha': 8.08878268174038}. Best is trial 0 with value: 0.37031090981292497.
[I 2025-05-06 17:52:43,430] Trial 1 finished with value: 0.3725408257362706 and parameters: {'difference_min': 0.044975339428914185, 'info_min': 0.03147177877569223, 'max_depth': 5, 'min_child_weight': 0.21710128295789996, 'subsample': 0.36831725772675505, 'colsample_bytree': 0.6678037735237604, 'learning_rate': 0.09066422755673474, 'lambda': 0.01794156434460334, 'alpha': 4.856599124678148}. Best is trial 1 with value: 0.3725408257362706.
[I 20

In [11]:
top_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]  # For minimization
# or reverse=True if you're maximizing

# Print parameters of top trials
for i, trial in enumerate(top_trials, 1):
    print(f"Top {i}:")
    print(f"  Value: {trial.value}")
    print(f"  Params: {trial.params}")

Top 1:
  Value: 0.3818334954271173
  Params: {'difference_min': 0.05218960132104022, 'info_min': 0.03089648066623365, 'max_depth': 3, 'min_child_weight': 0.01436830166963303, 'subsample': 0.5535868944752692, 'colsample_bytree': 0.7846012921773652, 'learning_rate': 0.06806594650165576, 'lambda': 3.7018701051348804, 'alpha': 6.572640433297191}
Top 2:
  Value: 0.38176523111697397
  Params: {'difference_min': 0.05312839082120633, 'info_min': 0.030736703106280343, 'max_depth': 3, 'min_child_weight': 0.32370419594904, 'subsample': 0.59928822224624, 'colsample_bytree': 0.7678473689425825, 'learning_rate': 0.05211631620033941, 'lambda': 3.586250039728482, 'alpha': 10.884899562909776}
Top 3:
  Value: 0.3811044865717462
  Params: {'difference_min': 0.051872682321035646, 'info_min': 0.03099618801080478, 'max_depth': 3, 'min_child_weight': 0.17417128165061507, 'subsample': 0.6350551258507653, 'colsample_bytree': 0.8309610101172697, 'learning_rate': 0.06606518113744322, 'lambda': 0.6177716658941924

In [12]:
difference_min, info_min = study.best_trial.user_attrs['difference_min'], study.best_trial.user_attrs['info_min']

presence_info_df_3 = presence_info_df_3[(np.abs(presence_info_df_3['difference']) > difference_min) & (presence_info_df_3['info'] > info_min)]
presence_info_df = presence_info_df_3

updated_train_df = add_presence_columns(train_df, presence_info_df)
updated_test_df = add_presence_columns(test_df, presence_info_df)

present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]
# print(updated_train_df[non_present_cols].info())


high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month', 'zero_payout']
updated_train_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True)
updated_test_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True)

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)

# Step 2: Transform training set itself
X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform test set (call the same function on test_df)
X_test_regular = transform_regular_set(test_df, onehot, scaler, cat_cols, num_cols)

# present_cols = []
# Combine for train
updated_train_final = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

# Combine for test
updated_test_final = pd.concat([X_test_regular, updated_test_df[present_cols]], axis=1)

In [None]:
# Paths
import json
output_dir = '../Records/xgb_temp'
train_csv_path = os.path.join(output_dir, 'train_2025.csv')
test_csv_path = os.path.join(output_dir, 'test_2025.csv')
param_json_path = os.path.join(output_dir, 'param_xgb_temp.json')

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save DataFrames
updated_train_final.to_csv(train_csv_path, index=False)
print(f"Saved train DataFrame to {train_csv_path}")

updated_test_final.to_csv(test_csv_path, index=False)
print(f"Saved test DataFrame to {test_csv_path}")

# Save best_params as JSON
best_threshold = study.best_trial.user_attrs['mean_threshold']
best_params = study.best_params
best_params.update({'mean_threshold': float(best_threshold)})
with open(param_json_path, 'w') as f:
    json.dump(best_params, f, indent=4)
print(f"Saved best parameters to {param_json_path}")

Saved train DataFrame to ../Records/xgb_temp\train_2025.csv
Saved test DataFrame to ../Records/xgb_temp\test_2025.csv
Saved best parameters to ../Records/xgb_temp\param_xgb_temp.json


In [None]:
# Split train into final train and validation sets
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    updated_train_final, target, test_size=0.01, stratify=target, random_state=42
)

# Clean best_params: keep only what XGBoost expects
xgb_param_keys = [
    'objective', 'eval_metric', 'tree_method', 'max_depth', 'min_child_weight',
    'subsample', 'colsample_bytree', 'eta', 'lambda', 'alpha', 'seed', 'verbosity'
]
clean_params = {k: best_params[k] for k in xgb_param_keys if k in best_params}

# Set required XGBoost parameters
clean_params.update({
    'objective': 'binary:logistic',
    'eval_metric': ['auc'],  # or just 'auc'
    'tree_method': 'gpu_hist',  # or 'gpu_hist' if using GPU
    'seed': 42,
    'verbosity': 0  # silent mode
})

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train_final, label=y_train_final)
dval = xgb.DMatrix(X_val_final, label=y_val_final)
dtest = xgb.DMatrix(updated_test_final)

# Train with early stopping
final_model = xgb.train(
    clean_params,
    dtrain,
    num_boost_round=5000,
    evals=[(dval, 'validation')],
    early_stopping_rounds=100,
    verbose_eval=100  # adjust or set to False for silence
)

# Predict on test set (probabilities)
probs_test = final_model.predict(dtest, iteration_range=(0, final_model.best_iteration + 1))


[0]	validation-auc:0.72686
[100]	validation-auc:0.71828
[136]	validation-auc:0.69079


In [18]:
timestamp = datetime.now().strftime('%m%d_%H%M')

final_preds = (probs_test > best_threshold).astype(int)

submission = pd.DataFrame({
    'claim_number': test_id,
    'fraud': final_preds
})

submission.to_csv(f'../Submit/submission_xgb.csv', index=False)