In [1]:
import numpy as np 
import pandas as pd 
import sys
import os
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
import warnings
import itertools
from scipy import stats
warnings.filterwarnings('ignore')

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))
from Utils import FE_helper as FE


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. Load the data
# Directory holding the raw CSV files, relative to this notebook.
DATA_DIR = '../../Original_Data'

# Untouched copy of the raw training data.
train_df_original = pd.read_csv(f'{DATA_DIR}/train_2025.csv')
# Work on an independent copy instead of parsing the same CSV a second time.
train_df = train_df_original.copy()
test_df = pd.read_csv(f'{DATA_DIR}/test_2025.csv')

# Feature engineering via the project helper (Utils/FE_helper.py).
train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

# Capture identifiers and the target BEFORE 'fraud' is dropped below.
test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

# Columns removed from both frames; 'fraud' is the target and must not stay a feature.
# NOTE(review): presumably drop_ignored_columns tolerates columns that are absent
# (e.g. 'fraud' in the test set) - confirm in FE_helper.
ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)

In [163]:
def filter_low_cardinality_columns(df, threshold=20, dropna=False):
    """
    Return the names of the columns that have at most `threshold` distinct values.

    Parameters:
    - df (pd.DataFrame): Frame whose columns are inspected.
    - threshold (int): Largest distinct-value count a column may have to be kept.
    - dropna (bool): Passed to `Series.nunique`; when False, a missing value
      counts as one more distinct value.

    Returns:
    - list: Matching column labels, in the frame's column order.
    """
    selected = []
    for name in df.columns:
        distinct_values = df[name].nunique(dropna=dropna)
        if distinct_values <= threshold:
            selected.append(name)
    return selected

def generate_column_combinations(columns, sizes=[2, 3]):
    """
    Build every combination of `columns` for each requested combination size.

    Parameters:
    - columns (list): Column names to combine.
    - sizes (list): Combination lengths to generate; results are grouped by size,
      in the order the sizes are given.

    Returns:
    - list of tuples: All generated combinations.
    """
    # The list default is only read, never mutated, so sharing it across calls is harmless.
    per_size = (itertools.combinations(columns, size) for size in sizes)
    combos = list(itertools.chain.from_iterable(per_size))
    print(f"Generated {len(combos)} combinations (sizes {sizes}).")
    return combos


def add_presence_features(df, combos):
    """
    Adds one binary "<col1>__<col2>..._present" column per column combination.

    A row gets 1 when its tuple of values across the combination's columns occurs
    in more than one row of `df` (i.e. at least one other row shares it), and 0
    otherwise. A progress line is written to stdout while the combos are processed.

    The new columns are collected first and attached with a single concat rather
    than inserted one by one: repeated insertion of thousands of columns produces
    a heavily fragmented DataFrame (pandas emits a PerformanceWarning for it).

    Parameters:
    ----------
    df : pd.DataFrame
        Input DataFrame. It is not modified.
    combos : list of tuples
        Column-name combinations to create presence features from.

    Returns:
    -------
    pd.DataFrame
        A copy of `df` with the presence columns appended in `combos` order.
        A generated name that already exists in `df` overwrites that column in place.
    """
    total = len(combos)
    new_columns = {}

    for i, combo in enumerate(combos, 1):
        combo_name = "__".join(combo) + "_present"

        # Progress indicator
        sys.stdout.write(f"\rProcessing {i} / {total} combos ({100 * i / total:.2f}%)")
        sys.stdout.flush()

        # One tuple per row holding that row's values for the combo's columns.
        # NOTE(review): this row-wise apply is likely the dominant cost of the loop.
        combo_tuples = df[list(combo)].apply(tuple, axis=1)
        # Map every row to the number of rows sharing its tuple.
        counts = combo_tuples.map(combo_tuples.value_counts())
        # Presence = appears more than once. Stored as a plain array so that the
        # final assembly is positional and needs no index alignment.
        new_columns[combo_name] = (counts > 1).astype(int).to_numpy()

    # Final newline to clean up progress line
    sys.stdout.write("\nDone!\n")

    df_out = df.copy()
    # Names colliding with existing columns replace them in place, exactly as a
    # plain column assignment would.
    for name in [n for n in new_columns if n in df_out.columns]:
        df_out[name] = new_columns.pop(name)
    if new_columns:
        features = pd.DataFrame(new_columns, index=df_out.index)
        df_out = pd.concat([df_out, features], axis=1)

    return df_out

def compute_presence_counts(df, combos):
    """
    Computes, for each column combination, how many rows share each row's values.

    For every combo a "<col1>__<col2>..._count" column is produced whose value in
    a row is the number of rows of `df` (that row included) having the same tuple
    of values across the combo's columns. A progress line is written to stdout.

    The new columns are collected first and attached with a single concat rather
    than inserted one by one: repeated insertion of thousands of columns produces
    a heavily fragmented DataFrame (pandas emits a PerformanceWarning for it).

    Parameters:
    ----------
    df : pd.DataFrame
        Input DataFrame. It is not modified.
    combos : list of tuples
        List of column combinations to create count features from.

    Returns:
    -------
    pd.DataFrame
        A copy of `df` with one count column per combo appended in `combos` order.
        A generated name that already exists in `df` overwrites that column in place.
    """
    total = len(combos)
    new_columns = {}

    for i, combo in enumerate(combos, 1):
        combo_name = "__".join(combo) + "_count"

        # Progress indicator
        sys.stdout.write(f"\rProcessing {i} / {total} combos ({100 * i / total:.2f}%)")
        sys.stdout.flush()

        # One tuple per row holding that row's values for the combo's columns.
        # NOTE(review): this row-wise apply is likely the dominant cost of the loop.
        combo_tuples = df[list(combo)].apply(tuple, axis=1)
        # Map every row to the number of rows sharing its tuple.
        counts = combo_tuples.map(combo_tuples.value_counts())

        # Stored as a plain array so that the final assembly is positional and
        # needs no index alignment.
        new_columns[combo_name] = counts.to_numpy()

    sys.stdout.write("\n✅ Done computing counts matrix.\n")

    df_out = df.copy()
    # Names colliding with existing columns replace them in place, exactly as a
    # plain column assignment would.
    for name in [n for n in new_columns if n in df_out.columns]:
        df_out[name] = new_columns.pop(name)
    if new_columns:
        features = pd.DataFrame(new_columns, index=df_out.index)
        df_out = pd.concat([df_out, features], axis=1)

    return df_out

def generate_all_nonempty_subsets(features):
    """
    Enumerate every nonempty subset of `features` as a tuple.

    Subsets are listed by increasing size (1 up to len(features)); within one
    size they follow the order produced by `itertools.combinations`.

    Parameters:
    - features (list): List of feature names.

    Returns:
    - list of tuples: All nonempty subsets (2**len(features) - 1 of them).
    """
    subset_sizes = range(1, len(features) + 1)
    all_subsets = [
        subset
        for size in subset_sizes
        for subset in itertools.combinations(features, size)
    ]
    print(f"Generated {len(all_subsets)} total nonempty subsets.")
    return all_subsets

def compute_fraud_rate_differences(df, target_col='fraud', suffix='_present', variance_threshold=None):
    """
    Computes the difference in fraud rates between rows with feature == 1 and feature == 0
    for all columns ending with the given suffix.

    Columns that do not contain both a 0 group and a 1 group are skipped.

    Parameters:
    - df (pd.DataFrame): Input DataFrame
    - target_col (str): Name of the target binary column (e.g., 'fraud')
    - suffix (str): Suffix to identify newly added features
    - variance_threshold (float or None): When given, only features whose variance is
      strictly greater than this value are evaluated (and the number kept is printed).
      When None, no variance filtering is applied.

    Returns:
    - pd.DataFrame: One row per evaluated feature with the columns 'feature',
      'fraud_rate_at_0', 'fraud_rate_at_1', 'difference' (rate at 1 minus rate at 0)
      and 'variance', sorted by absolute difference, largest first. If no feature
      qualifies, an empty DataFrame with those columns is returned.
    """
    result_columns = ['feature', 'fraud_rate_at_0', 'fraud_rate_at_1', 'difference', 'variance']
    results = []

    # Select columns with the specified suffix
    feature_cols = [col for col in df.columns if col.endswith(suffix)]

    if variance_threshold is not None:
        variances = df[feature_cols].var()
        feature_cols = [col for col in feature_cols if variances[col] > variance_threshold]
        print(f"Kept {len(feature_cols)} higher-variance features.")

    for col in feature_cols:
        # Mean of the target within each value of the feature.
        grouped = df.groupby(col)[target_col].mean()
        rate_0 = grouped.get(0, None)
        rate_1 = grouped.get(1, None)
        if rate_0 is not None and rate_1 is not None:
            results.append({
                'feature': col,
                'fraud_rate_at_0': rate_0,
                'fraud_rate_at_1': rate_1,
                'difference': rate_1 - rate_0,
                'variance': df[col].var()
            })

    results_df = pd.DataFrame(results, columns=result_columns)
    if results_df.empty:
        # Without explicit columns, sorting an empty result by 'difference'
        # would raise KeyError; hand back the empty, correctly-shaped frame.
        return results_df
    return results_df.sort_values(by='difference', key=abs, ascending=False)

`compute_presence_counts` is very resource-intensive. Below is a quick test case you can run to confirm that it works.

In [174]:
# df = pd.DataFrame({
#     'gender': ['M', 'F', 'M', 'M', 'F', 'M'],
#     'age':    [25, 30, 25, 40, 30, 25],
#     'city':   ['NY', 'SF', 'NY', 'LA', 'SF', 'NY']
# })

# combos = [
#     ('gender', 'age'),
#     ('gender', 'city')
# ]

# counts_df = compute_presence_counts(df, combos)
# counts_df

This block takes a very long time to run; don't run it unless necessary.

In [None]:
# additional_drops = ['vehicle_price_categories', 'log_pop_bin', 'age_group', 'past_num_of_claims', 'age_of_vehicle', 'claim_date.weekofyear', 'claim_date.day', 'claim_date.quarter', 'zipcode_type']

# low_performing_col = train_df[filter_low_cardinality_columns(train_df, threshold=60)].drop(columns = additional_drops).columns.tolist()

# # Step 2: Generate every nonempty subset of the selected columns
# combos = generate_all_nonempty_subsets(low_performing_col)

# # Step 3: Add a count feature for each column combination
# df_with_features = compute_presence_counts(train_df, combos)

# print("New dataset shape:", df_with_features.shape)

# df_with_features.to_pickle('df_with_features.pkl')

Generated 32767 total nonempty subsets.
Processing 32767 / 32767 combos (100.00%)
✅ Done computing counts matrix.
New dataset shape: (18000, 32809)


In [None]:
# `df_with_features` is only created by the commented-out expensive cell above, which
# also caches it to 'df_with_features.pkl'. On a fresh kernel, reload that cache
# instead of failing with a NameError.
if 'df_with_features' not in globals():
    # Only unpickle files you created yourself; unpickling untrusted data can run arbitrary code.
    df_with_features = pd.read_pickle('df_with_features.pkl')

feature_cols = [col for col in df_with_features.columns if col.endswith('_count')]
# Binarise every count column: 1 when the row's value combination occurs more than 10 times.
temp = (df_with_features[feature_cols] > 10).astype(int)
temp['fraud'] = target
# Fraud rate at 1 vs. 0 for each binarised '_count' feature.
temp_diff_info = compute_fraud_rate_differences(temp, suffix='_count')

In [None]:
# Show features with variance above 0.1 whose fraud rate is more than 0.062 higher
# at 1 than at 0, listed by increasing variance.
temp_diff_info.loc[
    lambda d: (d['variance'] > 0.1) & (d['difference'] > 0.062)
].sort_values(by='variance')

Unnamed: 0,feature,fraud_rate_at_0,fraud_rate_at_1,difference,variance
803,witness_present_ind__vehicle_color__claim_date...,0.102854,0.165268,0.062414,0.100151
4438,gender__address_change_ind__witness_present_in...,0.100684,0.165601,0.064917,0.100752
7069,address_change_ind__living_status__witness_pre...,0.100175,0.166667,0.066492,0.110877
4427,gender__address_change_ind__witness_present_in...,0.10276,0.166571,0.063811,0.113722
3059,witness_present_ind__policy_report_filed_ind__...,0.097345,0.167467,0.070121,0.11446
3073,witness_present_ind__policy_report_filed_ind__...,0.096896,0.167585,0.070689,0.114909
3065,witness_present_ind__policy_report_filed_ind__...,0.10395,0.166592,0.062642,0.115766
2649,address_change_ind__witness_present_ind__vehic...,0.103599,0.168077,0.064478,0.129482
1448,gender__witness_present_ind__vehicle_color__cl...,0.105828,0.167862,0.062034,0.13125
2641,address_change_ind__witness_present_ind__vehic...,0.1,0.168993,0.068993,0.131748


In [408]:
# Base name (without extension) of the log file for this run.
newname = 'subset_info_10'
new_subset_name = f'{newname}.csv'

# Write the fraud-rate comparison table held in `temp_diff_info` to the logs folder,
# without the DataFrame index.
temp_diff_info.to_csv(f'../../Create_Tune_Models/logs/{new_subset_name}', index = False)

In [120]:
# Base name (without extension) of the log file for this run.
newname = 'subset_info_5'
new_subset_name = f'{newname}.csv'

# Attach the target so fraud rates can be computed per feature value.
df_with_features['fraud'] = target
# NOTE(review): the default suffix '_present' is used here, while compute_presence_counts
# produces '_count' columns - confirm which function built `df_with_features` for this cell.
fraud_diffs = compute_fraud_rate_differences(df_with_features, target_col='fraud', variance_threshold=0.05)
# Keep features whose absolute fraud-rate difference exceeds 0.03. The explicit copy makes
# the column assignment below act on an independent frame rather than a filtered slice.
temp_1 = fraud_diffs[np.abs(fraud_diffs['difference']) > 0.03].copy()

# Score = |difference| weighted by variance relative to 0.25, the largest (population)
# variance a 0/1 feature can have.
temp_1['info'] = (temp_1['variance']/0.25)*np.abs(temp_1['difference'])
temp_1 = temp_1.sort_values(by='info', ascending=False)
temp_1.to_csv(f'../../Create_Tune_Models/logs/{new_subset_name}', index = False)

Kept 14783 higher-variance features.


In [146]:
# Minimum absolute fraud-rate difference and minimum 'info' score a feature must exceed.
difference_min = 0.06
info_min = 0.043

# Keep only the rows of temp_1 that clear both thresholds.
presence_info_df = temp_1.loc[
    (temp_1['difference'].abs() > difference_min)
    & (temp_1['info'] > info_min)
]

In [None]:
# Columns deliberately left out of the combinations.
custom_cols = ['claim_date.weekofyear', 'claim_date.quarter', 'log_pop_bin', 'vehicle_price_categories', 'zero_payout']

# Step 1: Low-cardinality columns (at most 120 distinct values), minus the exclusions
low_card_col = [
    col
    for col in filter_low_cardinality_columns(train_df, threshold=120)
    if col not in custom_cols
]

# Step 2: Generate all 2-, 3-, 4- and 5-column combinations
combos_2345 = generate_column_combinations(low_card_col, sizes=[2, 3, 4, 5])

# Step 3: Add binary presence features
df_with_features_2345 = add_presence_features(train_df, combos_2345)

print("New dataset shape:", df_with_features_2345.shape)

In [None]:
# Attach the target so fraud rates can be computed per feature value.
df_with_features_2345['fraud'] = target
fraud_diffs_2345 = compute_fraud_rate_differences(df_with_features_2345, target_col='fraud', variance_threshold=0.1)
# Keep features whose absolute fraud-rate difference exceeds 0.03. The explicit copy makes
# the column assignment below act on an independent frame rather than a filtered slice.
temp_2 = fraud_diffs_2345[np.abs(fraud_diffs_2345['difference']) > 0.03].copy()

# Score = |difference| weighted by variance relative to 0.25, the largest (population)
# variance a 0/1 feature can have.
temp_2['info'] = (temp_2['variance']/0.25)*np.abs(temp_2['difference'])
temp_2 = temp_2.sort_values(by='info', ascending=False)
temp_2.to_csv('../../Create_Tune_Models/logs/subset_info_2.csv', index = False)