In [10]:
import numpy as np 
import pandas as pd 
import sys
import os
from sklearn.model_selection import train_test_split
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
import warnings
from datetime import datetime
import itertools
from scipy import stats
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize']=10,20

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from Utils import FE_helper as FE

In [11]:
# 2. Load the data
train_df = pd.read_csv('../Original_Data/train_2025.csv') 
test_df = pd.read_csv('../Original_Data/test_2025.csv')

train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)

presence_info_df_3 = pd.read_csv('logs/subset_info_3.csv')

In [17]:
def add_presence_columns(train_df, presence_info_df, verbose = False):
    """
    For each combo in presence_info_df, create a presence feature on train_df.
    
    Parameters:
    - train_df (pd.DataFrame): The training dataset.
    - presence_info_df (pd.DataFrame): DataFrame containing 'feature' column
      with names like 'feature1__feature2__feature3_present'.
    
    Returns:
    - pd.DataFrame: train_df with new presence columns added.
    """
    df_out = train_df.copy()
    
    for combo_str in presence_info_df['feature']:
        # Extract base combo name (strip trailing '_present')
        if combo_str.endswith('_present'):
            combo_base = combo_str[:-8]
        else:
            combo_base = combo_str
        
        # Split by '__' to get the individual features
        combo_features = combo_base.split('__')
        new_col_name = combo_base + '_present'  # keep consistent
        if verbose:
            print(f"Processing combo: {combo_features}")
        
        # Build tuple of feature values per row
        combo_tuples = train_df[combo_features].apply(tuple, axis=1)
        
        # Count how many times each tuple appears
        counts = combo_tuples.map(combo_tuples.value_counts())
        
        # Presence = appears more than once in the dataset
        df_out[new_col_name] = (counts > 1).astype(int)
    
    return df_out


def fit_presence_pca(train_df, present_cols, n_components=None, scale=True):
    """
    Fits PCA on presence feature columns in the training set.
    
    Parameters:
    - train_df (pd.DataFrame): Training dataset.
    - present_cols (list): List of column names for presence features.
    - n_components (int or None): Number of PCA components. If None, keep all.
    - scale (bool): Whether to standardize columns before PCA.
    
    Returns:
    - pca (PCA object): Fitted PCA object.
    - X_train_pca (np.ndarray): Transformed training set (PCA scores).
    - scaler (StandardScaler object or None): Fitted scaler if used, else None.
    """
    X = train_df[present_cols].values

    # Optionally scale features
    scaler = None
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    # Fit PCA
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X)

    print(f"PCA fitted. Explained variance (first 10 components): {pca.explained_variance_ratio_[:10]}")
    return pca, X_train_pca, scaler


import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def fit_regular_transformer(train_df, presence_suffix='_present'):
    # Identify regular columns
    regular_cols = [col for col in train_df.columns if not col.endswith(presence_suffix)]
    
    # Split regular into categorical and numerical
    categorical_cols = train_df[regular_cols].select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = train_df[regular_cols].select_dtypes(include=['number']).columns.tolist()
    if 'claim_number' in numerical_cols:
        numerical_cols.remove('claim_number')
    
    # Initialize transformers
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = StandardScaler()
    
    # Fit transformers
    onehot.fit(train_df[categorical_cols])
    scaler.fit(train_df[numerical_cols])
    
    # print(f"Fitted on {len(categorical_cols)} categorical and {len(numerical_cols)} numerical columns.")
    
    return onehot, scaler, categorical_cols, numerical_cols

def transform_regular_set(df, onehot, scaler, categorical_cols, numerical_cols):
    # Transform categorical
    cat_transformed = onehot.transform(df[categorical_cols])
    cat_df = pd.DataFrame(cat_transformed, columns=onehot.get_feature_names_out(categorical_cols), index=df.index)
    
    # Transform numerical
    num_transformed = scaler.transform(df[numerical_cols])
    num_df = pd.DataFrame(num_transformed, columns=numerical_cols, index=df.index)
    
    # Combine transformed parts
    transformed_df = pd.concat([num_df, cat_df], axis=1)
    
    # print(f"Transformed set shape: {transformed_df.shape}")
    return transformed_df


In [26]:
def objective(trial, train_df, presence_info_df, target, kfoldcv=5, drop=[]):
    # Dynamic feature selection thresholds
    # difference_min = trial.suggest_float('difference_min', 0.03, 0.07)
    # info_min = trial.suggest_float('info_min', 0.03, 0.05)
    difference_min = 0.05
    info_min = 0.03
    presence_info_df = presence_info_df[
        (np.abs(presence_info_df['difference']) > difference_min) &
        (presence_info_df['info'] > info_min)
    ]

    # trial.set_user_attr('difference_min', difference_min)
    # trial.set_user_attr('info_min', info_min)

    # Add presence features
    updated_train_df = add_presence_columns(train_df, presence_info_df)
    present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]

    # Drop problematic high-cardinality categorical columns
    high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear',
    'claim_date.month', 'zero_payout', 'zero_payout']
    updated_train_df.drop(columns=high_dim_cat_cols_to_drop, inplace=True)

    # Transform regular features
    onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)
    X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

    # Combine with presence features
    full_train_df = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

    # Define hyperparameter space
    params = {
        'iterations': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 25.0, log=True),
        'random_seed': 69,
        'loss_function': 'Logloss',
        'eval_metric': 'Logloss',
        'verbose': False,
        'task_type': 'GPU'  # change to 'GPU' if using GPU
    }


    skf = StratifiedKFold(n_splits=kfoldcv, shuffle=True, random_state=42)
    best_thresholds = []
    f1_scores = []

    for train_idx, val_idx in skf.split(full_train_df, target):
        X_train = full_train_df.iloc[train_idx].drop(columns=drop, errors='ignore')
        X_val = full_train_df.iloc[val_idx].drop(columns=drop, errors='ignore')
        y_train = target.iloc[train_idx]
        y_val = target.iloc[val_idx]
        
        train_pool = Pool(X_train, y_train)
        val_pool = Pool(X_val, y_val)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

        # Predict + threshold tuning
        probs = model.predict_proba(X_val)[:, 1]
        thresholds = np.linspace(0.1, 0.9, 50)
        f1s = [f1_score(y_val, probs > t) for t in thresholds]

        best_f1 = max(f1s)
        best_threshold = thresholds[np.argmax(f1s)]

        f1_scores.append(best_f1)
        best_thresholds.append(best_threshold)

    mean_f1 = np.mean(f1_scores)
    mean_threshold = np.mean(best_thresholds)

    trial.set_user_attr('mean_threshold', mean_threshold)
    trial.set_user_attr('f1_per_fold', f1_scores)

    return mean_f1


In [27]:
presence_info_df_3 = pd.read_csv('logs/subset_info_3.csv')
# Run Optuna
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(lambda trial: objective(trial, train_df, presence_info_df_3, target, kfoldcv= 5),
                n_trials=350)


best_f1 = study.best_value

[I 2025-05-06 22:03:27,517] A new study created in memory with name: no-name-424011bc-f1e2-42ca-b820-7adcf0aff4d0
[I 2025-05-06 22:07:53,272] Trial 0 finished with value: 0.3702738877237509 and parameters: {'learning_rate': 0.012151459447326506, 'depth': 9, 'l2_leaf_reg': 0.005596719677161326}. Best is trial 0 with value: 0.3702738877237509.
[I 2025-05-06 22:09:33,626] Trial 1 finished with value: 0.3748606214118541 and parameters: {'learning_rate': 0.05144740862482892, 'depth': 3, 'l2_leaf_reg': 12.799904108569352}. Best is trial 1 with value: 0.3748606214118541.
[I 2025-05-06 22:10:43,575] Trial 2 finished with value: 0.3728339452289898 and parameters: {'learning_rate': 0.08554421169762018, 'depth': 9, 'l2_leaf_reg': 0.6047961580819913}. Best is trial 1 with value: 0.3748606214118541.
[I 2025-05-06 22:11:26,390] Trial 3 finished with value: 0.38032983598759956 and parameters: {'learning_rate': 0.08982639660888696, 'depth': 3, 'l2_leaf_reg': 0.20168909054330955}. Best is trial 3 with 

In [28]:
top_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:5]  # For minimization
# or reverse=True if you're maximizing

# Print parameters of top trials
for i, trial in enumerate(top_trials, 1):
    print(f"Top {i}:")
    print(f"  Value: {trial.value}")
    print(f"  Params: {trial.params}")

Top 1:
  Value: 0.38272329859193543
  Params: {'learning_rate': 0.0894488970066411, 'depth': 3, 'l2_leaf_reg': 0.04861123719686134}
Top 2:
  Value: 0.3817964812636329
  Params: {'learning_rate': 0.08943465486972843, 'depth': 3, 'l2_leaf_reg': 0.01622233983575132}
Top 3:
  Value: 0.3816318379687743
  Params: {'learning_rate': 0.0942074610364229, 'depth': 3, 'l2_leaf_reg': 0.02262443859997675}
Top 4:
  Value: 0.3816004692220901
  Params: {'learning_rate': 0.09930570612937266, 'depth': 3, 'l2_leaf_reg': 0.020862748392985343}
Top 5:
  Value: 0.38159167374111747
  Params: {'learning_rate': 0.09099690675949326, 'depth': 3, 'l2_leaf_reg': 0.017749181003040317}


In [30]:
# difference_min, info_min = study.best_trial.user_attrs['difference_min'], study.best_trial.user_attrs['info_min']
difference_min, info_min = 0.05, 0.03

presence_info_df_3 = presence_info_df_3[(np.abs(presence_info_df_3['difference']) > difference_min) & (presence_info_df_3['info'] > info_min)]
presence_info_df = presence_info_df_3

updated_train_df = add_presence_columns(train_df, presence_info_df)
updated_test_df = add_presence_columns(test_df, presence_info_df)

present_cols = [col for col in updated_train_df.columns if col.endswith('_present')]
# print(updated_train_df[non_present_cols].info())


high_dim_cat_cols_to_drop = ['claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear', 'claim_date.month', 'zero_payout']
updated_train_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True)
updated_test_df.drop(columns = high_dim_cat_cols_to_drop, inplace=True)

# Step 1: Fit on training data
onehot, scaler, cat_cols, num_cols = fit_regular_transformer(updated_train_df)

# Step 2: Transform training set itself
X_train_regular = transform_regular_set(updated_train_df, onehot, scaler, cat_cols, num_cols)

# Step 3: Transform test set (call the same function on test_df)
X_test_regular = transform_regular_set(test_df, onehot, scaler, cat_cols, num_cols)

# present_cols = []
# Combine for train
updated_train_final = pd.concat([X_train_regular, updated_train_df[present_cols]], axis=1)

# Combine for test
updated_test_final = pd.concat([X_test_regular, updated_test_df[present_cols]], axis=1)

In [31]:
# Paths
import json
output_dir = '../Records/ctb_temp'
train_csv_path = os.path.join(output_dir, 'train_2025.csv')
test_csv_path = os.path.join(output_dir, 'test_2025.csv')
param_json_path = os.path.join(output_dir, 'param_ctb_temp.json')

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save DataFrames
train_df.to_csv(train_csv_path, index=False)
print(f"Saved train DataFrame to {train_csv_path}")

test_df.to_csv(test_csv_path, index=False)
print(f"Saved test DataFrame to {test_csv_path}")

# Save best_params as JSON
best_threshold = study.best_trial.user_attrs['mean_threshold']
best_params = study.best_params
best_params.update({'mean_threshold': float(best_threshold)})
with open(param_json_path, 'w') as f:
    json.dump(best_params, f, indent=4)
print(f"Saved best parameters to {param_json_path}")

Saved train DataFrame to ../Records/ctb_temp\train_2025.csv
Saved test DataFrame to ../Records/ctb_temp\test_2025.csv
Saved best parameters to ../Records/ctb_temp\param_ctb_temp.json


In [32]:

# Split train into final train and validation sets
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    updated_train_final, target, test_size=0.01, stratify=target, random_state=42
)

# Prepare CatBoost parameters (matching what you used before)
cat_params = {
    'iterations': 5000,
    'learning_rate': best_params.get('learning_rate', 0.05),  # fallback value if not in best_params
    'depth': best_params.get('depth', 6),
    'l2_leaf_reg': best_params.get('l2_leaf_reg', 3.0),
    'random_seed': 69,
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'verbose': 100,          # adjust or set False for silent
    'early_stopping_rounds': 100,
    'task_type': 'GPU'       # or 'CPU' if needed
}

# Set up Pools
train_pool = Pool(X_train_final, y_train_final)
val_pool = Pool(X_val_final, y_val_final)
test_pool = Pool(updated_test_final)

# Initialize and train CatBoost model
final_model = CatBoostClassifier(**cat_params)
final_model.fit(
    train_pool,
    eval_set=val_pool,
    verbose=100,  # or False for silent
)

# Predict on test set (probabilities)
probs_test = final_model.predict_proba(test_pool)[:, 1]

0:	learn: 0.6394335	test: 0.6386021	best: 0.6386021 (0)	total: 26.2ms	remaining: 2m 11s
100:	learn: 0.3910573	test: 0.3905850	best: 0.3905548 (99)	total: 2.58s	remaining: 2m 5s
200:	learn: 0.3836591	test: 0.3858616	best: 0.3857296 (194)	total: 5.12s	remaining: 2m 2s
300:	learn: 0.3803340	test: 0.3815168	best: 0.3815112 (299)	total: 7.67s	remaining: 1m 59s
400:	learn: 0.3760277	test: 0.3835581	best: 0.3812479 (305)	total: 10.2s	remaining: 1m 57s
bestTest = 0.3812479231
bestIteration = 305
Shrink model to first 306 iterations.


In [34]:
timestamp = datetime.now().strftime('%m%d_%H%M')

final_preds = (probs_test > best_threshold).astype(int)

submission = pd.DataFrame({
    'claim_number': test_id,
    'fraud': final_preds
})

submission.to_csv(f'../Submit/submission_ctb.csv', index=False)