In [2]:
import numpy as np 
import pandas as pd 
import sys
import os
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
import warnings
from datetime import datetime
import itertools
from scipy import stats

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Default matplotlib figure size, given as (width, height).
plt.rcParams['figure.figsize']=10,20

# Add the parent of the current working directory to sys.path
# (presumably where the project-local `Utils` package lives — verify).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE


In [11]:
# 2. Load the data
# Read both raw CSVs (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../Original_Data/train_2025.csv', '../Original_Data/test_2025.csv')
)

# Run the shared feature-engineering helper on each frame.
train_df, test_df = FE.add_features(train_df), FE.add_features(test_df)

# Set the identifiers and the label aside before any column is dropped.
target = train_df['fraud']
train_id = train_df['claim_number']
test_id = test_df['claim_number']

# Columns excluded from modelling; 'fraud' is the label captured above.
ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df, test_df = (
    FE.drop_ignored_columns(frame, ignore_var) for frame in (train_df, test_df)
)

In [12]:
def add_presence_columns(train_df, presence_info_df, verbose = False):
    """
    Append one binary "presence" column per feature combination.

    Each entry of presence_info_df['feature'] names a combination of columns
    joined by '__', optionally followed by the suffix '_present'. For every
    combination, a row is flagged 1 when its tuple of values over those
    columns occurs more than once in train_df, and 0 otherwise. Counts are
    taken within the frame that is passed in.

    Parameters:
    - train_df (pd.DataFrame): Frame to derive the flags from (not modified).
    - presence_info_df (pd.DataFrame): Must have a 'feature' column with names
      such as 'feature1__feature2_present'.
    - verbose (bool): When True, print the column list of each combination.

    Returns:
    - pd.DataFrame: Copy of train_df with one '<combo>_present' column per entry.
    """
    suffix = '_present'
    result = train_df.copy()

    for entry in presence_info_df['feature']:
        # Normalise the name so that it carries the suffix exactly once.
        base_name = entry[:-len(suffix)] if entry.endswith(suffix) else entry
        member_cols = base_name.split('__')
        if verbose:
            print(f"Processing combo: {member_cols}")

        # One hashable key per row, then look up how often each key occurs.
        row_keys = train_df[member_cols].apply(tuple, axis=1)
        occurrences = row_keys.map(row_keys.value_counts())

        result[base_name + suffix] = (occurrences > 1).astype(int)

    return result


def fit_presence_pca(train_df, present_cols, n_components=None, scale=True):
    """
    Fit a PCA on the presence columns of the training frame.

    Parameters:
    - train_df (pd.DataFrame): Training dataset.
    - present_cols (list): Names of the presence columns to decompose.
    - n_components (int or None): Passed straight to PCA; None keeps all components.
    - scale (bool): Standardize the columns with a StandardScaler before PCA.

    Returns:
    - pca (PCA): The fitted PCA object.
    - X_train_pca (np.ndarray): PCA scores of the training rows.
    - scaler (StandardScaler or None): The fitted scaler when scale is True, else None.
    """
    features = train_df[present_cols].values

    # The scaler is only created (and returned) when scaling was requested.
    scaler = StandardScaler() if scale else None
    if scaler is not None:
        features = scaler.fit_transform(features)

    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(features)

    print(f"PCA fitted. Explained variance (first 10 components): {pca.explained_variance_ratio_[:10]}")
    return pca, X_train_pca, scaler



def fit_regular_transformer(train_df, presence_suffix='_present'):
    """
    Fit a one-hot encoder and a standard scaler on the non-presence columns.

    Parameters:
    - train_df (pd.DataFrame): Training dataset.
    - presence_suffix (str): Columns whose name ends with this suffix are
      left out of both transformers.

    Returns:
    - onehot (OneHotEncoder): Fitted on the object/category columns; categories
      unseen at fit time are ignored at transform time.
    - scaler (StandardScaler): Fitted on the numeric columns.
    - categorical_cols (list): Column names the encoder was fitted on.
    - numerical_cols (list): Column names the scaler was fitted on
      ('claim_number' is excluded).
    """
    regular_df = train_df[[name for name in train_df.columns
                           if not name.endswith(presence_suffix)]]

    categorical_cols = list(regular_df.select_dtypes(include=['object', 'category']).columns)
    numerical_cols = list(regular_df.select_dtypes(include=['number']).columns)

    # The claim identifier is numeric but is not scaled as a feature.
    try:
        numerical_cols.remove('claim_number')
    except ValueError:
        pass

    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(train_df[categorical_cols])
    scaler = StandardScaler().fit(train_df[numerical_cols])

    return onehot, scaler, categorical_cols, numerical_cols

def transform_regular_set(df, onehot, scaler, categorical_cols, numerical_cols):
    """
    Apply already-fitted transformers to a frame and return the combined result.

    Parameters:
    - df (pd.DataFrame): Frame containing at least the listed columns.
    - onehot: Fitted encoder exposing transform() and get_feature_names_out().
    - scaler: Fitted scaler exposing transform().
    - categorical_cols (list): Columns handed to the encoder.
    - numerical_cols (list): Columns handed to the scaler.

    Returns:
    - pd.DataFrame: Scaled numeric columns followed by the encoded columns,
      indexed like df.
    """
    encoded_part = pd.DataFrame(
        onehot.transform(df[categorical_cols]),
        index=df.index,
        columns=onehot.get_feature_names_out(categorical_cols),
    )
    scaled_part = pd.DataFrame(
        scaler.transform(df[numerical_cols]),
        index=df.index,
        columns=numerical_cols,
    )

    # Numeric block first, encoded block second.
    return pd.concat([scaled_part, encoded_part], axis=1)

In [14]:
def objective(trial, train_df, presence_info_df, target, kfoldcv=5, drop=None):
    """
    Optuna objective: mean best-threshold F1 of a LightGBM model over stratified CV.

    Steps:
    1. Keep the combos of presence_info_df with |difference| > 0.05 and info > 0.03.
    2. Add the corresponding presence columns, drop the date-part / zero_payout
       columns, one-hot encode and standardize the remaining regular columns.
    3. For each stratified fold, train LightGBM with early stopping on the
       fold's AUC, then pick the probability threshold (50 values from 0.1 to
       0.9) that maximizes F1 on that same fold.

    Caveats:
    - The encoder/scaler and the presence counts are fitted on the whole
      train_df before the folds are split, so validation rows contribute to
      the preprocessing statistics.
    - The threshold is chosen on the fold it is scored on, so the returned F1
      is an optimistic estimate.

    Parameters:
    - trial (optuna.Trial): Supplies the LightGBM hyperparameters.
    - train_df (pd.DataFrame): Training features (not modified).
    - presence_info_df (pd.DataFrame): Needs 'feature', 'difference' and 'info' columns.
    - target (pd.Series): Binary labels, positionally aligned with train_df.
    - kfoldcv (int): Number of stratified folds.
    - drop (list or None): Extra model-input columns to exclude; names that
      are not present are ignored. None means no extra column is dropped.

    Returns:
    - float: Mean over folds of the best F1.

    Side effects:
    - Stores 'mean_threshold', 'f1_per_fold' and 'best_iterations' as trial
      user attributes.
    """
    # A None default avoids sharing one mutable list between calls.
    drop = [] if drop is None else drop

    # Combo-selection thresholds are fixed constants, not tuned by Optuna.
    difference_min = 0.05
    info_min = 0.03
    presence_info_df = presence_info_df[(np.abs(presence_info_df['difference']) > difference_min)
                                        & (presence_info_df['info'] > info_min)]

    updated_train_df = add_presence_columns(train_df, presence_info_df)
    present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]

    high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month', 'zero_payout']
    updated_train_df = updated_train_df.drop(columns=high_dim_cat_cols_to_drop)

    # Step 1: Fit on training data
    onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)

    # Step 2: Transform training set itself
    X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

    full_train_df = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

    # Loop-invariant work: drop the excluded columns and build the threshold grid once.
    model_df = full_train_df.drop(columns=drop, errors='ignore')
    thresholds = np.linspace(0.1, 0.9, 50)

    # Define hyperparameter space
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': True,
        'boosting': 'gbdt',
        # 'device': 'gpu',
        'num_leaves': trial.suggest_int('num_leaves', 50, 90),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 0.6),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 5, 25),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 20.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 25.0, log=True),
        'verbose': -1,
        'seed':69
    }

    skf = StratifiedKFold(n_splits=kfoldcv, shuffle=True, random_state=42)
    best_thresholds = []
    f1_scores = []
    best_iterations = []

    for train_idx, val_idx in skf.split(full_train_df, target):
        X_train = model_df.iloc[train_idx]
        X_val = model_df.iloc[val_idx]
        y_train = target.iloc[train_idx]
        y_val = target.iloc[val_idx]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        model = lgb.train(
                params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=[lgb_val],
                callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
            )

        # Predict + threshold tuning
        probs = model.predict(X_val, num_iteration=model.best_iteration)
        f1s = [f1_score(y_val, probs > t) for t in thresholds]

        best_idx = int(np.argmax(f1s))
        f1_scores.append(f1s[best_idx])
        best_thresholds.append(thresholds[best_idx])
        best_iterations.append(int(model.best_iteration))

    mean_f1 = np.mean(f1_scores)
    mean_threshold = np.mean(best_thresholds)

    trial.set_user_attr('mean_threshold', mean_threshold)
    trial.set_user_attr('f1_per_fold', f1_scores)
    # Kept so the number of boosting rounds found in CV can be inspected later.
    trial.set_user_attr('best_iterations', best_iterations)

    return mean_f1



In [15]:
presence_info_df_3 = pd.read_csv('logs/subset_info_3.csv')
# Run Optuna
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts (TPE is Optuna's default sampler; only the seed is new).
sampler = optuna.samplers.TPESampler(seed=42)
# NOTE: objective() never calls trial.report() / trial.should_prune(), so this
# pruner has no intermediate values to act on and no trial is pruned.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(lambda trial: objective(trial, train_df, presence_info_df_3, target, kfoldcv= 5),
                n_trials=350)

# Best mean cross-validated F1 found by the search.
best_f1 = study.best_value

[I 2025-05-06 23:16:33,605] A new study created in memory with name: no-name-df106f50-cd2a-4731-b039-c49fdb22c249
[I 2025-05-06 23:16:47,491] Trial 0 finished with value: 0.3590430370316729 and parameters: {'num_leaves': 63, 'feature_fraction': 0.36243835556914783, 'bagging_fraction': 0.35301386803461887, 'bagging_freq': 7, 'learning_rate': 0.06893248208142541, 'lambda_l1': 1.7262386475520366, 'lambda_l2': 0.16880017632158723}. Best is trial 0 with value: 0.3590430370316729.
[I 2025-05-06 23:17:02,381] Trial 1 finished with value: 0.3586454243766305 and parameters: {'num_leaves': 66, 'feature_fraction': 0.5443695948486299, 'bagging_fraction': 0.3863385071958297, 'bagging_freq': 14, 'learning_rate': 0.053677782933964795, 'lambda_l1': 0.0030624648983531753, 'lambda_l2': 0.001039954431997392}. Best is trial 0 with value: 0.3590430370316729.
[I 2025-05-06 23:17:13,538] Trial 2 finished with value: 0.3683525571730963 and parameters: {'num_leaves': 83, 'feature_fraction': 0.39677118211979145

In [16]:
# Rank trials by objective value, best first (the study maximizes, hence reverse=True).
# Trials without a value (e.g. failed ones) are skipped: sorting None against
# floats would raise a TypeError.
top_trials = sorted(
    (t for t in study.trials if t.value is not None),
    key=lambda t: t.value,
    reverse=True,
)[:5]

# Print parameters of top trials
for i, trial in enumerate(top_trials, 1):
    print(f"Top {i}:")
    print(f"  Value: {trial.value}")
    print(f"  Params: {trial.params}")

Top 1:
  Value: 0.3795695325525016
  Params: {'num_leaves': 71, 'feature_fraction': 0.31996887722138695, 'bagging_fraction': 0.5372418948817358, 'bagging_freq': 19, 'learning_rate': 0.01755632876200545, 'lambda_l1': 16.571562337276383, 'lambda_l2': 0.06314341983967049}
Top 2:
  Value: 0.3780331700561166
  Params: {'num_leaves': 71, 'feature_fraction': 0.32139089268605203, 'bagging_fraction': 0.5443715881760666, 'bagging_freq': 19, 'learning_rate': 0.01827703567233135, 'lambda_l1': 17.32469109675606, 'lambda_l2': 0.03236988976141566}
Top 3:
  Value: 0.3776318092000044
  Params: {'num_leaves': 72, 'feature_fraction': 0.3213604328748137, 'bagging_fraction': 0.5416808037070172, 'bagging_freq': 20, 'learning_rate': 0.020400694688207682, 'lambda_l1': 11.301622338125616, 'lambda_l2': 0.07493376850261091}
Top 4:
  Value: 0.3775918539284757
  Params: {'num_leaves': 72, 'feature_fraction': 0.3085805740796781, 'bagging_fraction': 0.5234130947028174, 'bagging_freq': 20, 'learning_rate': 0.01640640

In [19]:
# Combo-selection thresholds: the same fixed values used inside objective().
difference_min, info_min = 0.05, 0.03

presence_info_df_3 = presence_info_df_3[(np.abs(presence_info_df_3['difference']) > difference_min) & (presence_info_df_3['info'] > info_min)]
presence_info_df = presence_info_df_3

# NOTE: add_presence_columns counts tuples inside the frame it receives, so the
# test flags come from the test set's own counts, not from the training set.
updated_train_df = add_presence_columns(train_df, presence_info_df)
updated_test_df = add_presence_columns(test_df, presence_info_df)

present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]

# Drop the same columns from both frames; reassigning (rather than inplace=True)
# keeps this cell safe to re-run.
high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month', 'zero_payout']
updated_train_df = updated_train_df.drop(columns=high_dim_cat_cols_to_drop)
updated_test_df = updated_test_df.drop(columns=high_dim_cat_cols_to_drop)

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)

# Step 2: Transform training set itself
X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform the test set that went through the same preparation as
# train (updated_test_df, not the raw test_df).
X_test_regular = transform_regular_set(updated_test_df, onehot, scaler, cat_cols, num_cols)

# Combine for train
updated_train_final = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

# Combine for test
updated_test_final = pd.concat([X_test_regular, updated_test_df[present_cols]], axis=1)

In [20]:
# Output locations
import json
output_dir = '../Records/lgb_temp'
train_csv_path, test_csv_path, param_json_path = (
    os.path.join(output_dir, file_name)
    for file_name in ('train_2025.csv', 'test_2025.csv', 'param_lgb_temp.json')
)

# Create the target directory if it is missing
os.makedirs(output_dir, exist_ok=True)

# Write both frames without the index column
train_df.to_csv(train_csv_path, index=False)
print(f"Saved train DataFrame to {train_csv_path}")

test_df.to_csv(test_csv_path, index=False)
print(f"Saved test DataFrame to {test_csv_path}")

# Best hyperparameters plus the fold-averaged decision threshold, as JSON
best_threshold = study.best_trial.user_attrs['mean_threshold']
best_params = {**study.best_params, 'mean_threshold': float(best_threshold)}
with open(param_json_path, 'w') as f:
    json.dump(best_params, f, indent=4)
print(f"Saved best parameters to {param_json_path}")

Saved train DataFrame to ../Records/lgb_temp\train_2025.csv
Saved test DataFrame to ../Records/lgb_temp\test_2025.csv
Saved best parameters to ../Records/lgb_temp\param_lgb_temp.json


In [25]:
# Split train into final train and validation sets.
# Only 1% of the rows are held out; they serve solely as the early-stopping monitor.
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    updated_train_final, target, test_size=0.01, stratify=target, random_state=42
)

# Create LightGBM datasets
lgb_train_final = lgb.Dataset(X_train_final, y_train_final)
lgb_val_final = lgb.Dataset(X_val_final, y_val_final, reference=lgb_train_final)

# Ensure best_params includes all needed keys.
# NOTE: 'device': 'gpu' is set here, whereas the tuning objective does not set 'device'.
best_params.update({
    'objective': 'binary',
    'metric': ['auc'],
    'is_unbalance': True,
    'boosting': 'gbdt',
    'device': 'gpu', 
    'verbose': -1,
    'seed': 69
})

# 'mean_threshold' is the decision threshold stored next to the tuned
# hyperparameters; it is not a LightGBM parameter, so keep it out of the
# dict handed to lgb.train.
final_params = {key: value for key, value in best_params.items() if key != 'mean_threshold'}

# Train with early stopping
final_model = lgb.train(
    final_params,
    lgb_train_final,
    num_boost_round=5000,
    valid_sets=[lgb_val_final],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)  # Optional: adjust or silence logging
    ]
)

# Predict on test set
probs_test = final_model.predict(updated_test_final, num_iteration=final_model.best_iteration)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.694314
[200]	valid_0's auc: 0.700423
[300]	valid_0's auc: 0.703947
[400]	valid_0's auc: 0.708647
[500]	valid_0's auc: 0.709117
[600]	valid_0's auc: 0.708647
Early stopping, best iteration is:
[536]	valid_0's auc: 0.711936


In [26]:
# Time of this run; not part of the output path below, which is fixed.
timestamp = datetime.now().strftime('%m%d_%H%M')

# Turn probabilities into 0/1 labels with the fold-averaged threshold of the best trial.
final_preds = (probs_test > best_threshold).astype(int)

submission = pd.DataFrame({
    'claim_number': test_id,
    'fraud': final_preds
})

# Create the output directory first so to_csv cannot fail on a missing folder.
submission_path = '../Submit/submission_lgb.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)