In [4]:
import numpy as np 
import pandas as pd 
import sys
import os
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
import warnings
import itertools
from scipy import stats
warnings.filterwarnings('ignore')

# Add the grandparent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))
from Utils import FE_helper as FE


In [5]:
# 2. Load the data
# The raw training file is parsed once; the working frame starts as a copy so
# the untouched original stays available for later comparison.
train_df_original = pd.read_csv('../../Original_Data/train_2025.csv')
train_df = train_df_original.copy()
test_df = pd.read_csv('../../Original_Data/test_2025.csv')

# Apply the shared feature-engineering step to both splits.
train_df = FE.add_features(train_df)
test_df = FE.add_features(test_df)

# Capture identifiers and the target before 'fraud' is dropped below.
test_id = test_df['claim_number']
train_id = train_df['claim_number']
target = train_df['fraud']

# Columns removed from both frames before the analysis.
# NOTE(review): 'fraud' is presumably absent from the test set, so
# FE.drop_ignored_columns is assumed to tolerate missing names -- confirm.
ignore_var = ['claim_date.is_weekend', 'claim_date.near_holiday', 'fraud']
train_df = FE.drop_ignored_columns(train_df, ignore_var)
test_df = FE.drop_ignored_columns(test_df, ignore_var)

In [6]:
def filter_low_cardinality_columns(df, threshold=20, dropna=False):
    """
    Return the names of the columns holding at most `threshold` distinct values.

    Each column is counted with `Series.nunique(dropna=dropna)`, so with the
    default `dropna=False` a missing value counts as one distinct value.
    The returned list follows the order of `df.columns`.
    """
    selected = []
    for name in df.columns:
        distinct_values = df[name].nunique(dropna=dropna)
        if distinct_values <= threshold:
            selected.append(name)
    return selected

def generate_column_combinations(columns, sizes=[2, 3]):
    """
    Build every combination of `columns` for each requested combination size.

    Results are grouped by size in the order given by `sizes`; within a size
    they follow the order produced by `itertools.combinations`. The number of
    generated combinations is printed, and the combinations are returned as a
    list of tuples.
    """
    combos = [
        combo
        for size in sizes
        for combo in itertools.combinations(columns, size)
    ]
    print(f"Generated {len(combos)} combinations (sizes {sizes}).")
    return combos


def add_presence_features(df, combos):
    """
    Add one binary "<col>__<col>..._present" column per column combination.

    A flag is 1 when at least one other row of `df` carries the same values in
    every column of the combination, and 0 when the row's value combination is
    unique. A progress indicator is written to stdout while processing.

    Parameters:
    - df (pd.DataFrame): Input DataFrame; it is not modified.
    - combos (sequence of tuples of str): Column-name combinations to evaluate.
      Must support `len()`.

    Returns:
    - pd.DataFrame: Copy of `df` with the flag columns appended in the order
      of `combos`. A flag whose name already exists in `df` overwrites that
      column in place.
    """
    total = len(combos)
    flag_columns = {}

    for i, combo in enumerate(combos, 1):
        combo_name = "__".join(combo) + "_present"

        # Progress indicator (carriage return keeps it on a single line)
        progress_msg = f"\rProcessing {i} / {total} combos ({100 * i / total:.2f}%)"
        sys.stdout.write(progress_msg)
        sys.stdout.flush()

        # keep=False marks every member of a repeated group, i.e. exactly the
        # rows whose combination occurs more than once. This is vectorised,
        # unlike building a Python tuple per row and counting them.
        repeated = df.duplicated(subset=list(combo), keep=False)
        flag_columns[combo_name] = repeated.to_numpy().astype(int)

    # Final newline to clean up progress line
    sys.stdout.write("\nDone!\n")

    df_out = df.copy()
    if not flag_columns:
        return df_out

    # Build all flags as one frame and attach them with a single concat;
    # inserting thousands of columns one at a time fragments the DataFrame.
    flags = pd.DataFrame(flag_columns, index=df.index)

    # Preserve overwrite semantics for names that already exist in df.
    clashes = [name for name in flags.columns if name in df.columns]
    for name in clashes:
        df_out[name] = flags[name].to_numpy()

    new_flags = flags.drop(columns=clashes)
    if new_flags.shape[1] == 0:
        return df_out
    return pd.concat([df_out, new_flags], axis=1)

def generate_all_nonempty_subsets(features):
    """
    Enumerate every nonempty subset of `features` as a tuple.

    Subsets are ordered by size (1 up to len(features)) and, within a size,
    in the order produced by `itertools.combinations`. The total count is
    printed before returning.

    Parameters:
    - features (list): List of feature names.

    Returns:
    - list of tuples: All nonempty subsets.
    """
    subset_sizes = range(1, len(features) + 1)
    all_subsets = list(
        itertools.chain.from_iterable(
            itertools.combinations(features, size) for size in subset_sizes
        )
    )
    print(f"Generated {len(all_subsets)} total nonempty subsets.")
    return all_subsets

def compute_fraud_rate_differences(df, target_col='fraud', suffix='_present', variance_threshold=None):
    """
    Computes the difference in fraud rates between rows with feature == 1 and feature == 0
    for all columns ending with the given suffix.

    Features that do not take both values 0 and 1 are skipped.

    Parameters:
    - df (pd.DataFrame): Input DataFrame
    - target_col (str): Name of the target binary column (e.g., 'fraud')
    - suffix (str): Suffix to identify newly added features
    - variance_threshold (float or None): If given, only features whose variance
      is strictly greater than this value are evaluated (and the number kept is
      printed). If None, no variance filtering is applied.

    Returns:
    - pd.DataFrame: One row per evaluated feature with columns 'feature',
      'fraud_rate_at_0', 'fraud_rate_at_1', 'difference' and 'variance', sorted
      by absolute difference (largest first). Empty, with the same columns,
      when no feature qualifies.
    """
    result_columns = ['feature', 'fraud_rate_at_0', 'fraud_rate_at_1', 'difference', 'variance']
    results = []

    # Select columns with the specified suffix
    feature_cols = [col for col in df.columns if col.endswith(suffix)]

    if variance_threshold is not None:
        variances = df[feature_cols].var()
        feature_cols = [col for col in feature_cols if variances[col] > variance_threshold]
        print(f"Kept {len(feature_cols)} higher-variance features.")

    for col in feature_cols:
        grouped = df.groupby(col)[target_col].mean()
        rate_0 = grouped.get(0, None)
        rate_1 = grouped.get(1, None)
        # A difference only exists when both groups are present.
        if rate_0 is not None and rate_1 is not None:
            results.append({
                'feature': col,
                'fraud_rate_at_0': rate_0,
                'fraud_rate_at_1': rate_1,
                'difference': rate_1 - rate_0,
                'variance': df[col].var()
            })

    # Without rows there is no 'difference' column to sort by; return an empty
    # frame with the documented schema instead of raising KeyError.
    if not results:
        return pd.DataFrame(columns=result_columns)

    results_df = pd.DataFrame(results, columns=result_columns).sort_values(
        by='difference', key=abs, ascending=False
    )
    return results_df

In [None]:
# Low-cardinality columns that are deliberately left out of the subset search.
additional_drops = ['vehicle_price_categories', 'zero_payout', 'log_pop_bin', 'age_group', 'past_num_of_claims', 'age_of_vehicle', 'claim_date.weekofyear', 'claim_date.day', 'claim_date.quarter', 'zipcode_type']

# Step 1: Keep columns with at most 60 unique values, minus the exclusions.
# Filtering the name list (rather than slicing a sub-DataFrame and calling
# .drop) avoids copying data and does not raise when an excluded column is
# not among the low-cardinality ones.
low_performing_col = [
    col
    for col in filter_low_cardinality_columns(train_df, threshold=60)
    if col not in additional_drops
]

# Step 2: Generate every nonempty subset of those columns (all sizes)
combos = generate_all_nonempty_subsets(low_performing_col)

# Step 3: Add binary presence features
df_with_features = add_presence_features(train_df, combos)

print("New dataset shape:", df_with_features.shape)

In [None]:
# Re-attach the target so fraud rates can be computed per presence flag.
df_with_features['fraud'] = target
fraud_diffs = compute_fraud_rate_differences(df_with_features, target_col='fraud', variance_threshold=0.1)

# Keep flags whose fraud rate differs by more than 0.03 between the 0 and 1
# groups. The explicit copy makes temp_1 an independent frame, so adding the
# 'info' column below is not a chained assignment into a slice of fraud_diffs.
temp_1 = fraud_diffs[np.abs(fraud_diffs['difference']) > 0.03].copy()

# 'info' weights the absolute fraud-rate gap by variance / 0.25.
# NOTE(review): 0.25 is presumably the maximum variance of a 0/1 column -- confirm.
temp_1['info'] = (temp_1['variance']/0.25)*np.abs(temp_1['difference'])
temp_1 = temp_1.sort_values(by='info', ascending=False)
temp_1.to_csv('../../Create_Tune_Models/logs/subset_info_3.csv', index = False)

In [None]:
# Columns left out of the combination search.
custom_cols = ['claim_date.weekofyear', 'claim_date.quarter', 'log_pop_bin', 'vehicle_price_categories', 'zero_payout']

# Step 1: Keep columns with at most 120 unique values, minus the exclusions
low_card_col = [
    col
    for col in filter_low_cardinality_columns(train_df, threshold=120)
    if col not in custom_cols
]


# Step 2: Generate all 2-, 3-, 4- and 5-column combinations
combos_2345 = generate_column_combinations(low_card_col, [2, 3, 4, 5])

# Step 3: Add binary presence features
df_with_features_2345 = add_presence_features(train_df, combos_2345)

print("New dataset shape:", df_with_features_2345.shape)

In [None]:
# Re-attach the target so fraud rates can be computed per presence flag.
df_with_features_2345['fraud'] = target
fraud_diffs_2345 = compute_fraud_rate_differences(df_with_features_2345, target_col='fraud', variance_threshold=0.1)

# Keep flags whose fraud rate differs by more than 0.03 between the 0 and 1
# groups. The explicit copy makes temp_2 an independent frame, so adding the
# 'info' column below is not a chained assignment into a slice of
# fraud_diffs_2345.
temp_2 = fraud_diffs_2345[np.abs(fraud_diffs_2345['difference']) > 0.03].copy()

# 'info' weights the absolute fraud-rate gap by variance / 0.25.
# NOTE(review): 0.25 is presumably the maximum variance of a 0/1 column -- confirm.
temp_2['info'] = (temp_2['variance']/0.25)*np.abs(temp_2['difference'])
temp_2 = temp_2.sort_values(by='info', ascending=False)
temp_2.to_csv('../../Create_Tune_Models/logs/subset_info_2.csv', index = False)