In [2]:
import numpy as np 
import pandas as pd 
import sys
import os
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
import warnings
from datetime import datetime
import itertools
from scipy import stats

# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')
# Default matplotlib figure size, given as (width, height) in inches.
plt.rcParams['figure.figsize']=10,20

# Add the parent directory of the current working directory to sys.path
# so that the `Utils` package imported on the next line can be resolved.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE


In [3]:
# 2. Load the data
# Read both raw CSV splits, then run the shared feature-engineering helper on each.
train_df = pd.read_csv('../Original_Data/train_2025.csv') 
test_df = pd.read_csv('../Original_Data/test_2025.csv')
train_df, test_df = train_df.pipe(FE.add_features), test_df.pipe(FE.add_features)

# Capture the identifiers and the label before any column is removed.
train_id = train_df['claim_number']
test_id = test_df['claim_number']
target = train_df['fraud']

# Columns handed to the project helper for removal from both splits
# (this includes the label column 'fraud').
ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = train_df.pipe(FE.drop_ignored_columns, ignore_var)
test_df = test_df.pipe(FE.drop_ignored_columns, ignore_var)

In [4]:
def add_presence_columns(train_df, presence_info_df, verbose = False):
    """
    For each combo in presence_info_df, create a presence feature on train_df.

    A presence feature is 1 when the row's combination of values for the
    combo's columns occurs in at least one other row of the same DataFrame,
    and 0 when the combination is unique. Missing values are compared as
    equal to each other (pandas `DataFrame.duplicated` semantics).

    Parameters:
    - train_df (pd.DataFrame): The dataset to add features to. Not modified.
    - presence_info_df (pd.DataFrame): DataFrame containing a 'feature' column
      with names like 'feature1__feature2__feature3_present'. The trailing
      '_present' is optional; the parts separated by '__' must be column
      names of train_df.
    - verbose (bool): If True, print the columns of each combo as it is processed.

    Returns:
    - pd.DataFrame: Copy of train_df with one new '<combo>_present' int column
      per entry of presence_info_df['feature'].

    Raises:
    - KeyError: If a combo refers to a column that is not in train_df.
    """
    suffix = '_present'
    df_out = train_df.copy()

    for combo_str in presence_info_df['feature']:
        # Normalise the name so the output column always ends in '_present'.
        combo_base = combo_str[:-len(suffix)] if combo_str.endswith(suffix) else combo_str
        combo_features = combo_base.split('__')
        new_col_name = combo_base + suffix
        if verbose:
            print(f"Processing combo: {combo_features}")

        # keep=False marks *every* member of a repeated combination, which is
        # exactly "appears more than once". This replaces a row-wise
        # apply(tuple) + value_counts + map pass with one vectorised call.
        # The lookup is done on the input frame, so a combo never sees
        # presence columns created earlier in this loop.
        is_repeated = train_df.duplicated(subset=combo_features, keep=False)
        df_out[new_col_name] = is_repeated.astype(int)

    return df_out


def fit_presence_pca(train_df, present_cols, n_components=None, scale=True):
    """
    Fit a PCA on the presence-feature columns of the training set.

    Parameters:
    - train_df (pd.DataFrame): Training dataset.
    - present_cols (list): Names of the presence-feature columns to project.
    - n_components (int or None): Passed straight to PCA; None keeps all components.
    - scale (bool): If truthy, standardize the columns with StandardScaler first.

    Returns:
    - pca (PCA object): The fitted PCA.
    - X_train_pca (np.ndarray): PCA scores of the training rows.
    - scaler (StandardScaler object or None): The fitted scaler, or None when
      scaling was not requested.
    """
    # A scaler only exists when scaling was asked for; callers get it back
    # so the same standardization can be reused later.
    scaler = StandardScaler() if scale else None

    features = train_df[present_cols].values
    if scaler is not None:
        features = scaler.fit_transform(features)

    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(features)

    print(f"PCA fitted. Explained variance (first 10 components): {pca.explained_variance_ratio_[:10]}")
    return pca, X_train_pca, scaler



def fit_regular_transformer(train_df, presence_suffix='_present'):
    """
    Fit the encoders used for the non-presence ("regular") columns.

    Columns whose name ends with `presence_suffix` are skipped. Of the rest,
    object/category columns are one-hot encoded and numeric columns are
    standardized; 'claim_number' is excluded from the numeric set.

    Parameters:
    - train_df (pd.DataFrame): Training dataset to fit on.
    - presence_suffix (str): Name suffix that marks presence-feature columns.

    Returns:
    - onehot (OneHotEncoder): Encoder fitted on the categorical columns.
    - scaler (StandardScaler): Scaler fitted on the numerical columns.
    - categorical_cols (list): Names of the categorical columns used.
    - numerical_cols (list): Names of the numerical columns used.
    """
    # Everything that is not a presence feature
    regular_frame = train_df[
        [name for name in train_df.columns if not name.endswith(presence_suffix)]
    ]

    # Partition by dtype; the identifier column is not a model feature
    categorical_cols = list(regular_frame.select_dtypes(include=['object', 'category']).columns)
    numerical_cols = [
        name
        for name in regular_frame.select_dtypes(include=['number']).columns
        if name != 'claim_number'
    ]

    # Dense output keeps the result DataFrame-friendly; unseen categories
    # at transform time are encoded as all zeros instead of raising.
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(train_df[categorical_cols])
    scaler = StandardScaler().fit(train_df[numerical_cols])

    return onehot, scaler, categorical_cols, numerical_cols

def transform_regular_set(df, onehot, scaler, categorical_cols, numerical_cols):
    """
    Apply already-fitted transformers to the regular columns of `df`.

    Parameters:
    - df (pd.DataFrame): Dataset containing `categorical_cols` and `numerical_cols`.
    - onehot: Fitted encoder exposing `transform` and `get_feature_names_out`.
    - scaler: Fitted scaler exposing `transform`.
    - categorical_cols (list): Columns passed to the encoder.
    - numerical_cols (list): Columns passed to the scaler.

    Returns:
    - pd.DataFrame: Scaled numerical columns followed by the encoded
      categorical columns, sharing the index of `df`.
    """
    # Numerical part keeps its original column names
    num_df = pd.DataFrame(
        scaler.transform(df[numerical_cols]),
        columns=numerical_cols,
        index=df.index,
    )

    # Categorical part is named by the encoder (one column per category)
    cat_df = pd.DataFrame(
        onehot.transform(df[categorical_cols]),
        columns=onehot.get_feature_names_out(categorical_cols),
        index=df.index,
    )

    return pd.concat([num_df, cat_df], axis=1)

In [None]:
def objective(trial, train_df, presence_info_df, target, kfoldcv=5, drop=None):
    """
    Optuna objective: cross-validated F1 of a HistGradientBoostingClassifier.

    The presence features selected from `presence_info_df` are added to
    `train_df`, the regular columns are one-hot encoded / standardized, and a
    model with the trial's hyper-parameters is trained on each stratified
    fold. For every fold the decision threshold that maximises F1 on that
    fold's validation part is kept.

    Parameters:
    - trial (optuna.trial.Trial): Trial used to sample hyper-parameters.
    - train_df (pd.DataFrame): Training features (not modified).
    - presence_info_df (pd.DataFrame): Candidate presence combos with
      'feature', 'difference' and 'info' columns.
    - target (pd.Series): Binary labels aligned row-for-row with train_df.
    - kfoldcv (int): Number of stratified folds.
    - drop (list or None): Column names removed from the final feature matrix
      before fitting; names that are absent are ignored. None means nothing
      is dropped.

    Returns:
    - float: Mean over folds of the best per-fold F1.

    Side effects:
    - Stores 'mean_threshold' (mean of the per-fold best thresholds) and
      'f1_per_fold' as user attributes on `trial`.
    """
    # None sentinel instead of a shared mutable [] default.
    drop = [] if drop is None else drop

    # Feature selection thresholds
    difference_min = 0.05
    info_min = 0.03
    presence_info_df = presence_info_df[
        (np.abs(presence_info_df['difference']) > difference_min) &
        (presence_info_df['info'] > info_min)
    ]

    # NOTE: everything up to `full_train_df` does not depend on the trial, so
    # it is recomputed identically on every call. The transformers are also
    # fitted on all rows before the folds are split.

    # Add presence features (returns a copy, so the in-place drop below does
    # not touch the caller's frame)
    updated_train_df = add_presence_columns(train_df, presence_info_df)
    present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]

    # Drop high-cardinality categorical columns
    high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear',
                                 'claim_date.month']
    updated_train_df.drop(columns=high_dim_cat_cols_to_drop, inplace=True)

    # Transform regular features
    onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)
    X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

    # Combine with presence features
    full_train_df = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

    # Define hyperparameter space for HGBC
    params = {
        'max_iter': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20, 90),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-3, 25.0, log=True),
        'early_stopping': True,
        'validation_fraction': 0.1,
        'random_state': 69,
        'verbose': 0
    }

    skf = StratifiedKFold(n_splits=kfoldcv, shuffle=True, random_state=42)
    # The candidate thresholds are the same for every fold; build them once.
    thresholds = np.linspace(0.1, 0.9, 50)
    best_thresholds = []
    f1_scores = []

    for train_idx, val_idx in skf.split(full_train_df, target):
        X_train = full_train_df.iloc[train_idx].drop(columns=drop, errors='ignore')
        X_val = full_train_df.iloc[val_idx].drop(columns=drop, errors='ignore')
        y_train = target.iloc[train_idx]
        y_val = target.iloc[val_idx]

        # Instantiate and train the model
        model = HistGradientBoostingClassifier(**params)
        model.fit(X_train, y_train)

        # Predict + threshold tuning. zero_division=0 scores a threshold that
        # yields no positive predictions as 0 explicitly, instead of relying
        # on the globally silenced warning.
        probs = model.predict_proba(X_val)[:, 1]
        f1s = [f1_score(y_val, probs > t, zero_division=0) for t in thresholds]

        best_idx = int(np.argmax(f1s))
        f1_scores.append(f1s[best_idx])
        best_thresholds.append(thresholds[best_idx])

    mean_f1 = np.mean(f1_scores)
    mean_threshold = np.mean(best_thresholds)

    trial.set_user_attr('mean_threshold', mean_threshold)
    trial.set_user_attr('f1_per_fold', f1_scores)

    return mean_f1

In [8]:
presence_info_df_3 = pd.read_csv('logs/subset_info_3.csv')
# Run Optuna
# Seed the sampler so that a Restart & Run All reproduces the same search;
# the CV split and the model already use fixed random states.
sampler = optuna.samplers.TPESampler(seed=42)
# NOTE: objective() never calls trial.report()/trial.should_prune(), so this
# pruner has no intermediate values to act on and no trial is ever pruned.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(lambda trial: objective(trial, train_df, presence_info_df_3, target, kfoldcv= 5),
                n_trials=350)

best_f1 = study.best_value

[I 2025-05-07 10:29:39,412] A new study created in memory with name: no-name-1bbb7d41-7a08-47ed-996f-72338ab747f9
[I 2025-05-07 10:29:48,423] Trial 0 finished with value: 0.3619930822892148 and parameters: {'learning_rate': 0.029267251208555764, 'max_leaf_nodes': 77, 'min_samples_leaf': 50, 'l2_regularization': 6.841872624812442}. Best is trial 0 with value: 0.3619930822892148.
[I 2025-05-07 10:29:54,103] Trial 1 finished with value: 0.35807020759583996 and parameters: {'learning_rate': 0.08955473704257991, 'max_leaf_nodes': 54, 'min_samples_leaf': 24, 'l2_regularization': 4.748984380510635}. Best is trial 0 with value: 0.3619930822892148.
[I 2025-05-07 10:29:59,351] Trial 2 finished with value: 0.3606258850497981 and parameters: {'learning_rate': 0.06212496879082172, 'max_leaf_nodes': 36, 'min_samples_leaf': 18, 'l2_regularization': 0.001497202596797329}. Best is trial 0 with value: 0.3619930822892148.
[I 2025-05-07 10:30:04,742] Trial 3 finished with value: 0.35919746441001665 and pa

In [9]:
# Rank only trials that actually produced a score: failed or pruned trials
# have value None, which cannot be compared with floats while sorting.
scored_trials = [t for t in study.trials if t.value is not None]
# The study maximizes F1, so the best trials come first with reverse=True.
top_trials = sorted(scored_trials, key=lambda t: t.value, reverse=True)[:5]

# Print parameters of top trials
for i, trial in enumerate(top_trials, 1):
    print(f"Top {i}:")
    print(f"  Value: {trial.value}")
    print(f"  Params: {trial.params}")

Top 1:
  Value: 0.37253694641539103
  Params: {'learning_rate': 0.06394868766799867, 'max_leaf_nodes': 20, 'min_samples_leaf': 17, 'l2_regularization': 22.128602074859426}
Top 2:
  Value: 0.3721541645459441
  Params: {'learning_rate': 0.06299200878209602, 'max_leaf_nodes': 20, 'min_samples_leaf': 17, 'l2_regularization': 24.922621301696072}
Top 3:
  Value: 0.37201123643592593
  Params: {'learning_rate': 0.0625476138426423, 'max_leaf_nodes': 20, 'min_samples_leaf': 21, 'l2_regularization': 20.89855368117552}
Top 4:
  Value: 0.3713671359157743
  Params: {'learning_rate': 0.06328254626185774, 'max_leaf_nodes': 22, 'min_samples_leaf': 17, 'l2_regularization': 19.21836363704397}
Top 5:
  Value: 0.3711170498977394
  Params: {'learning_rate': 0.060548338554806486, 'max_leaf_nodes': 20, 'min_samples_leaf': 19, 'l2_regularization': 17.879915954185133}


In [11]:
# Selection thresholds for the presence combos. They are written out here
# because objective() hard-codes the same values and does not store them on
# the trial; keep the two places in sync.
difference_min, info_min = 0.05, 0.03

presence_info_df_3 = presence_info_df_3[(np.abs(presence_info_df_3['difference']) > difference_min) & (presence_info_df_3['info'] > info_min)]
presence_info_df = presence_info_df_3

# High-cardinality date parts are removed from both splits. drop() returns a
# new frame, so re-running this cell never fails on already-dropped columns.
high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month']

# NOTE(review): presence is counted within each split separately (train rows
# against train, test rows against test) - confirm this is intended.
updated_train_df = add_presence_columns(train_df, presence_info_df).drop(columns=high_dim_cat_cols_to_drop)
updated_test_df = add_presence_columns(test_df, presence_info_df).drop(columns=high_dim_cat_cols_to_drop)

present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)

# Step 2: Transform training set itself
X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform the test set from the same prepared frame as train
# (previously the raw test_df was passed here, unlike the train path)
X_test_regular = transform_regular_set(updated_test_df, onehot, scaler, cat_cols, num_cols)

# Combine for train
updated_train_final = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

# Combine for test
updated_test_final = pd.concat([X_test_regular, updated_test_df[present_cols]], axis=1)

# The model is fitted on the train matrix and applied to the test matrix, so
# both must expose the same features in the same order.
assert list(updated_train_final.columns) == list(updated_test_final.columns), \
    "train/test feature columns differ"

In [12]:
# Paths
import json
output_dir = '../Records/hgbc_temp'
train_csv_path, test_csv_path, param_json_path = (
    os.path.join(output_dir, file_name)
    for file_name in ('train_2025.csv', 'test_2025.csv', 'param_hgbc_temp.json')
)

# Create the target folder on first use; a no-op when it already exists
os.makedirs(output_dir, exist_ok=True)

# Persist the two frames exactly as they are held in memory at this point
train_df.to_csv(train_csv_path, index=False)
print(f"Saved train DataFrame to {train_csv_path}")

test_df.to_csv(test_csv_path, index=False)
print(f"Saved test DataFrame to {test_csv_path}")

# Store the winning hyper-parameters together with the decision threshold
# averaged over the CV folds of the best trial (cast to a plain float so it
# serialises as JSON)
best_threshold = study.best_trial.user_attrs['mean_threshold']
best_params = {**study.best_params, 'mean_threshold': float(best_threshold)}
with open(param_json_path, 'w') as f:
    f.write(json.dumps(best_params, indent=4))
print(f"Saved best parameters to {param_json_path}")

Saved train DataFrame to ../Records/hgbc_temp\train_2025.csv
Saved test DataFrame to ../Records/hgbc_temp\test_2025.csv
Saved best parameters to ../Records/hgbc_temp\param_hgbc_temp.json


In [22]:
# Split train into final train and validation sets
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    updated_train_final, target, test_size=0.01, stratify=target, random_state=42
)

# Combine train and validation again: HGBC draws its own validation subset
# (of size `validation_fraction`) internally, so the split above only fixes
# the row order and the size of that subset, not which rows are held out.
X_full = pd.concat([X_train_final, X_val_final])
y_full = pd.concat([y_train_final, y_val_final])

# HGBC parameters taken from the tuned study. The keys must be the names
# used in objective() ('max_leaf_nodes', 'min_samples_leaf',
# 'l2_regularization'); looking them up under LightGBM names such as
# 'num_leaves' / 'lambda_l2' silently falls back to defaults. Direct indexing
# makes a missing tuned value fail loudly instead.
hgbc_params = {
    'max_iter': 5000,               # upper bound; early stopping decides the real count
    'learning_rate': best_params['learning_rate'],
    'max_leaf_nodes': best_params['max_leaf_nodes'],
    'min_samples_leaf': best_params['min_samples_leaf'],
    'max_depth': best_params.get('max_depth'),            # not tuned -> None (unlimited)
    'l2_regularization': best_params['l2_regularization'],
    'early_stopping': True,
    'validation_fraction': len(X_val_final) / len(X_full),
    'n_iter_no_change': 100,       # patience of the early stopping
    'random_state': 69,
    'verbose': 0
}

# Initialize and fit the model
hgbc = HistGradientBoostingClassifier(**hgbc_params)
hgbc.fit(X_full, y_full)

# Predict probabilities on test set
probs_test = hgbc.predict_proba(updated_test_final)[:, 1]


In [24]:
# Time of this run as month-day_hour-minute; the output name below does not use it
timestamp = datetime.now().strftime('%m%d_%H%M')

# Turn the test probabilities into 0/1 labels with the stored threshold
final_preds = (probs_test > best_threshold).astype(int)

# One row per test claim: identifier plus predicted label
submission = pd.DataFrame({'claim_number': test_id}).assign(fraud=final_preds)

submission.to_csv(f'../Records/hgbc_temp/submission_temp.csv', index=False)