In [0]:
%sql
with product_lookup as (
  -- Product hierarchy lookup keyed by (source system, plan code, sub-plan code).
  -- Plan codes are normalised exactly as on the metrics side of the join in `base`:
  -- the REPLACE/LTRIM/REPLACE chain strips leading zeros (any remaining spaces come
  -- back as '0'), then the value is upper-cased and trimmed.
  -- NOTE(review): `distinct` only removes identical rows. If one key maps to several
  -- Product / sub_product rows, the left join in `base` fans out and inflates rn and
  -- total_policies_per_client -- confirm the key is unique in this table.
  select distinct
    upper(source_sys_id) as source_sys_id,
    trim(upper(REPLACE(LTRIM(REPLACE(idb_plan_cd,'0',' ')),' ','0'))) as idb_plan_cd,
    trim(upper(REPLACE(LTRIM(REPLACE(idb_sub_plan_cd,'0',' ')),' ','0'))) as idb_sub_plan_cd,
    trim(stmt_plan_typ_txt) as Product,
    sub_product_level_1,
    sub_product_level_2
  from dl_tenants_daas.us_wealth_management.wealth_management_sub_product_group
),
base as (
  -- Policies of the latest business_month, enriched with the product hierarchy and
  -- ranked per client by registration date.
  select
    r.axa_party_id,
    r.policy_no,
    r.register_date,
    r.trmn_eff_date,
    r.wti_lob_txt,
    r.prod_lob,
    r.agt_class,
    r.isrd_brth_date,
    r.psn_age,
    r.acct_val_amt,
    r.face_amt,
    r.cash_val_amt,
    r.wc_total_assets,
    r.wc_assetmix_stocks,
    r.wc_assetmix_bonds,
    r.wc_assetmix_mutual_funds,
    r.wc_assetmix_annuity,
    r.wc_assetmix_deposits,
    r.wc_assetmix_other_assets,
    r.division_name,
    r.mkt_prod_hier,
    r.policy_status,
    r.agent_segment,
    r.channel,
    r.client_seg,
    r.client_seg_1,
    r.aum_band,
    r.business_month,
    r.branchoffice_code,
    r.agt_no,
    -- Normalised plan codes, exposed for inspection. The join below has to repeat the
    -- expressions because a select alias is not visible in the ON clause of the same query.
    trim(upper(REPLACE(LTRIM(REPLACE(r.plan_code,'0',' ')),' ','0'))) as cleaned_plan_code,
    trim(upper(REPLACE(LTRIM(REPLACE(r.plan_subcd_code,'0',' ')),' ','0'))) as cleaned_plan_subcd_code,
    h.sub_product_level_1,
    h.sub_product_level_2,
    h.Product,
    -- rn = 1 is the earliest registered policy, rn = 2 the next one. policy_no breaks ties
    -- so that policies registered on the same date are ranked the same way on every run.
    -- NOTE(review): rows with a NULL register_date are ordered by the engine's default
    -- NULL placement -- confirm whether such policies should be eligible as "first" policy.
    row_number() over (
      partition by r.axa_party_id
      order by r.register_date asc, r.policy_no asc
    ) as rn,
    count(*) over (partition by r.axa_party_id) as total_policies_per_client
  from dl_tenants_daas.us_wealth_management.wealth_management_client_metrics r
  left join product_lookup h
    on upper(r.source_sys_id) = h.source_sys_id
    and trim(upper(REPLACE(LTRIM(REPLACE(r.plan_code,'0',' ')),' ','0'))) = h.idb_plan_cd
    and trim(upper(REPLACE(LTRIM(REPLACE(r.plan_subcd_code,'0',' ')),' ','0'))) = h.idb_sub_plan_cd
  -- Only the most recent snapshot month, and only rows that identify both client and policy
  where r.business_month = (select max(business_month) from dl_tenants_daas.us_wealth_management.wealth_management_client_metrics)
    and r.axa_party_id is not null
    and r.policy_no is not null
),
first_second as (
  -- One row per client: attributes of the first policy (rn = 1) under their original
  -- names, and selected attributes of the second policy (rn = 2) with a second_ prefix.
  -- max(case ...) simply picks the single matching row's value within each client group.
  select
    axa_party_id,
    max(total_policies_per_client) as total_policies_per_client,  -- policies counted in base for the client
    -- First policy fields
    max(case when rn = 1 then policy_no end) as policy_no,
    max(case when rn = 1 then register_date end) as register_date,
    max(case when rn = 1 then trmn_eff_date end) as trmn_eff_date,
    max(case when rn = 1 then wti_lob_txt end) as wti_lob_txt,
    max(case when rn = 1 then prod_lob end) as prod_lob,
    max(case when rn = 1 then agt_class end) as agt_class,
    max(case when rn = 1 then isrd_brth_date end) as isrd_brth_date,
    max(case when rn = 1 then psn_age end) as psn_age,
    max(case when rn = 1 then acct_val_amt end) as acct_val_amt,
    max(case when rn = 1 then face_amt end) as face_amt,
    max(case when rn = 1 then cash_val_amt end) as cash_val_amt,
    max(case when rn = 1 then wc_total_assets end) as wc_total_assets,
    max(case when rn = 1 then wc_assetmix_stocks end) as wc_assetmix_stocks,
    max(case when rn = 1 then wc_assetmix_bonds end) as wc_assetmix_bonds,
    max(case when rn = 1 then wc_assetmix_mutual_funds end) as wc_assetmix_mutual_funds,
    max(case when rn = 1 then wc_assetmix_annuity end) as wc_assetmix_annuity,
    max(case when rn = 1 then wc_assetmix_deposits end) as wc_assetmix_deposits,
    max(case when rn = 1 then wc_assetmix_other_assets end) as wc_assetmix_other_assets,
    max(case when rn = 1 then client_seg end) as client_seg,
    max(case when rn = 1 then client_seg_1 end) as client_seg_1,
    max(case when rn = 1 then aum_band end) as aum_band,
    max(case when rn = 1 then sub_product_level_1 end) as sub_product_level_1,
    max(case when rn = 1 then sub_product_level_2 end) as sub_product_level_2,
    max(case when rn = 1 then Product end) as Product,
    max(case when rn = 1 then business_month end) as business_month,
    max(case when rn = 1 then branchoffice_code end) as branchoffice_code,
    max(case when rn = 1 then agt_no end) as agt_no,
    max(case when rn = 1 then division_name end) as division_name,
    max(case when rn = 1 then mkt_prod_hier end) as mkt_prod_hier,
    max(case when rn = 1 then policy_status end) as policy_status,
    max(case when rn = 1 then channel end) as channel,
    max(case when rn = 1 then agent_segment end) as agent_segment,
    -- Second policy fields
    max(case when rn = 2 then policy_no end) as second_policy_no,
    max(case when rn = 2 then register_date end) as second_register_date,
    max(case when rn = 2 then trmn_eff_date end) as second_trmn_eff_date,
    max(case when rn = 2 then wti_lob_txt end) as second_wti_lob_txt,
    max(case when rn = 2 then prod_lob end) as second_prod_lob,
    max(case when rn = 2 then sub_product_level_1 end) as second_sub_product_level_1,
    max(case when rn = 2 then sub_product_level_2 end) as second_sub_product_level_2,
    max(case when rn = 2 then Product end) as second_Product
  from base
  where rn <= 2
  group by axa_party_id
)
-- Final feature set: every first_second column (one row per client) plus derived
-- timing, allocation-ratio, product-category and seasonality features.
select
  fs.*,
  -- Bucketed gap between the first and second policy; NULL when the gap is unknown
  -- (for example, the client has no second policy)
  CASE
    WHEN fs.days_between_policies IS NULL THEN NULL
    WHEN fs.days_between_policies <= 90 THEN 'IMMEDIATE'
    WHEN fs.days_between_policies <= 365 THEN 'WITHIN_YEAR'
    WHEN fs.days_between_policies <= 1095 THEN 'WITHIN_3_YEARS'
    ELSE 'LONG_TERM'
  END AS cross_sell_timing_category,
  -- Allocation ratios; NULLIF turns a zero asset total into NULL instead of dividing by zero
  wc_assetmix_stocks / NULLIF(wc_total_assets, 0) AS stock_allocation_ratio,
  wc_assetmix_bonds / NULLIF(wc_total_assets, 0) AS bond_allocation_ratio,
  wc_assetmix_annuity / NULLIF(wc_total_assets, 0) AS annuity_allocation_ratio,
  wc_assetmix_mutual_funds / NULLIF(wc_total_assets, 0) AS mutual_fund_allocation_ratio,
  acct_val_amt / NULLIF(wc_total_assets, 0) AS aum_to_asset_ratio,
  face_amt / NULLIF(wc_total_assets, 0) AS policy_value_to_assets_ratio,

  -- Category of the FIRST policy. Branches are evaluated top-down and the first match wins,
  -- so the order of the WHEN clauses is part of the rule.
  -- NOTE(review): the IN lists are upper-case while some LIKE patterns are mixed-case
  -- ('%Investment%', '%Network%', ...) -- confirm the intended case handling of the source values.
  CASE 
    WHEN prod_lob = 'LIFE' THEN 'LIFE_INSURANCE'
    WHEN sub_product_level_1 IN ('VLI', 'WL', 'UL/IUL', 'TERM', 'PROTECTIVE PRODUCT') THEN 'LIFE_INSURANCE'
    WHEN sub_product_level_2 LIKE '%LIFE%' THEN 'LIFE_INSURANCE'
    WHEN sub_product_level_2 IN ('VARIABLE UNIVERSAL LIFE', 'WHOLE LIFE', 'UNIVERSAL LIFE', 
                                'INDEX UNIVERSAL LIFE', 'TERM PRODUCT', 'VARIABLE LIFE', 
                                'SURVIVORSHIP WHOLE LIFE', 'MONY PROTECTIVE PRODUCT') THEN 'LIFE_INSURANCE'
    WHEN prod_lob IN ('GROUP RETIREMENT', 'INDIVIDUAL RETIREMENT') THEN 'RETIREMENT'
    WHEN sub_product_level_1 IN ('EQUIVEST', 'RETIREMENT 401K', 'ACCUMULATOR', 
                                'RETIREMENT CORNERSTONE', 'SCS', 'INVESTMENT EDGE') THEN 'RETIREMENT'
    WHEN sub_product_level_2 LIKE '%403B%' OR sub_product_level_2 LIKE '%401%' 
         OR sub_product_level_2 LIKE '%IRA%' OR sub_product_level_2 LIKE '%SEP%' THEN 'RETIREMENT'
    WHEN Product LIKE '%IRA%' OR Product LIKE '%401%' OR Product LIKE '%403%' 
         OR Product LIKE '%SEP%' OR Product LIKE '%Accumulator%' 
         OR Product LIKE '%Retirement%' THEN 'RETIREMENT'
    WHEN prod_lob = 'BROKER DEALER' THEN 'INVESTMENT'
    WHEN sub_product_level_1 IN ('INVESTMENT PRODUCT - DIRECT', 'INVESTMENT PRODUCT - BROKERAGE', 
                                'INVESTMENT PRODUCT - ADVISORY', 'DIRECT', 'BROKERAGE', 
                                'ADVISORY', 'CASH SOLICITOR') THEN 'INVESTMENT'
    WHEN sub_product_level_2 LIKE '%Investment%' OR sub_product_level_2 LIKE '%Brokerage%' 
         OR sub_product_level_2 LIKE '%Advisory%' THEN 'INVESTMENT'
    WHEN prod_lob = 'NETWORK' THEN 'NETWORK_PRODUCTS'
    WHEN sub_product_level_1 = 'NETWORK PRODUCTS' OR sub_product_level_2 = 'NETWORK PRODUCTS' THEN 'NETWORK_PRODUCTS'
    WHEN Product LIKE '%Network%' THEN 'NETWORK_PRODUCTS'
    WHEN prod_lob = 'OTHERS' AND sub_product_level_1 = 'HAS' THEN 'DISABILITY'
    WHEN sub_product_level_2 = 'HAS - DISABILITY' THEN 'DISABILITY'
    WHEN Product LIKE '%Disability%' OR Product LIKE '%DI -%' THEN 'DISABILITY'
    WHEN prod_lob = 'OTHERS' THEN 'HEALTH'
    WHEN sub_product_level_2 = 'GROUP HEALTH PRODUCTS' THEN 'HEALTH'
    WHEN Product LIKE '%Health%' OR Product LIKE '%Medical%' OR Product LIKE '%Hospital%' THEN 'HEALTH'
    ELSE 'OTHER'
  END AS product_category,
  -- Category of the SECOND policy: same rule set applied to the second_* columns,
  -- except that a missing/blank second_prod_lob yields NULL instead of 'OTHER'.
  CASE 
    WHEN second_prod_lob IS NULL OR second_prod_lob = '' THEN NULL
    WHEN second_prod_lob = 'LIFE' THEN 'LIFE_INSURANCE'
    WHEN second_sub_product_level_1 IN ('VLI', 'WL', 'UL/IUL', 'TERM', 'PROTECTIVE PRODUCT') THEN 'LIFE_INSURANCE'
    WHEN second_sub_product_level_2 LIKE '%LIFE%' THEN 'LIFE_INSURANCE'
    WHEN second_sub_product_level_2 IN ('VARIABLE UNIVERSAL LIFE', 'WHOLE LIFE', 'UNIVERSAL LIFE', 
                                'INDEX UNIVERSAL LIFE', 'TERM PRODUCT', 'VARIABLE LIFE', 
                                'SURVIVORSHIP WHOLE LIFE', 'MONY PROTECTIVE PRODUCT') THEN 'LIFE_INSURANCE'
    WHEN second_prod_lob IN ('GROUP RETIREMENT', 'INDIVIDUAL RETIREMENT') THEN 'RETIREMENT'
    WHEN second_sub_product_level_1 IN ('EQUIVEST', 'RETIREMENT 401K', 'ACCUMULATOR', 
                                'RETIREMENT CORNERSTONE', 'SCS', 'INVESTMENT EDGE') THEN 'RETIREMENT'
    WHEN second_sub_product_level_2 LIKE '%403B%' OR second_sub_product_level_2 LIKE '%401%' 
         OR second_sub_product_level_2 LIKE '%IRA%' OR second_sub_product_level_2 LIKE '%SEP%' THEN 'RETIREMENT'
    WHEN second_Product LIKE '%IRA%' OR second_Product LIKE '%401%' OR second_Product LIKE '%403%' 
         OR second_Product LIKE '%SEP%' OR second_Product LIKE '%Accumulator%' 
         OR second_Product LIKE '%Retirement%' THEN 'RETIREMENT'
    WHEN second_prod_lob = 'BROKER DEALER' THEN 'INVESTMENT'
    WHEN second_sub_product_level_1 IN ('INVESTMENT PRODUCT - DIRECT', 'INVESTMENT PRODUCT - BROKERAGE', 
                                'INVESTMENT PRODUCT - ADVISORY', 'DIRECT', 'BROKERAGE', 
                                'ADVISORY', 'CASH SOLICITOR') THEN 'INVESTMENT'
    WHEN second_sub_product_level_2 LIKE '%Investment%' OR second_sub_product_level_2 LIKE '%Brokerage%' 
         OR second_sub_product_level_2 LIKE '%Advisory%' THEN 'INVESTMENT'
    WHEN second_prod_lob = 'NETWORK' THEN 'NETWORK_PRODUCTS'
    WHEN second_sub_product_level_1 = 'NETWORK PRODUCTS' OR second_sub_product_level_2 = 'NETWORK PRODUCTS' THEN 'NETWORK_PRODUCTS'
    WHEN second_Product LIKE '%Network%' THEN 'NETWORK_PRODUCTS'
    WHEN second_prod_lob = 'OTHERS' AND second_sub_product_level_1 = 'HAS' THEN 'DISABILITY'
    WHEN second_sub_product_level_2 = 'HAS - DISABILITY' THEN 'DISABILITY'
    WHEN second_Product LIKE '%Disability%' OR second_Product LIKE '%DI -%' THEN 'DISABILITY'
    WHEN second_prod_lob = 'OTHERS' THEN 'HEALTH'
    WHEN second_sub_product_level_2 = 'GROUP HEALTH PRODUCTS' THEN 'HEALTH'
    WHEN second_Product LIKE '%Health%' OR second_Product LIKE '%Medical%' OR second_Product LIKE '%Hospital%' THEN 'HEALTH'
    ELSE 'OTHER'
  END AS second_product_category,
  -- Calendar quarter in which the first policy was registered
  CASE
    WHEN MONTH(register_date) BETWEEN 1 AND 3 THEN 'Q1'
    WHEN MONTH(register_date) BETWEEN 4 AND 6 THEN 'Q2'
    WHEN MONTH(register_date) BETWEEN 7 AND 9 THEN 'Q3'
    WHEN MONTH(register_date) BETWEEN 10 AND 12 THEN 'Q4'
    ELSE 'Unknown'
  END AS season_of_first_policy
from (
  -- first_second is already one row per client and does not expose rn, so the two
  -- registration dates are read directly; no aggregation is needed (or valid) here.
  -- The day gap is computed once and reused by the timing bucket above.
  select
    first_second.*,
    datediff(day, register_date, second_register_date) as days_between_policies
  from first_second
) fs

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Bring the result of the preceding %sql cell into pandas.
df = _sqldf.toPandas()

# Parse the dates the age / gap features are built from; unparseable values become NaT.
for date_col in ('register_date', 'isrd_brth_date', 'second_register_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


def _years_between(later, earlier):
    """Whole days between two datetime Series, expressed in 365.25-day years."""
    return (later - earlier).dt.days / 365.25


# Insured's age when the first / second policy was registered, and the gap between the two.
df['age_at_first_policy'] = _years_between(df['register_date'], df['isrd_brth_date'])
df['age_at_second_policy'] = _years_between(df['second_register_date'], df['isrd_brth_date'])
df['years_to_second'] = _years_between(df['second_register_date'], df['register_date'])

# Exact duplicate rows are removed.
df = df.drop_duplicates()

# Rows without a first-product category cannot be used downstream.
critical_cols = ['product_category']
df = df.dropna(subset=critical_cols)

# Quick look at which columns pandas treats as numeric vs. object.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns
display(num_cols)
display(cat_cols)

import scipy.stats as stats
import numpy as np

# Monetary columns: every wc_* column plus the three policy amounts that are present.
financial_cols = [col for col in df.columns if col.startswith('wc_')]
financial_cols += [col for col in ('face_amt', 'cash_val_amt', 'acct_val_amt') if col in df.columns]

# Skewness of the raw values (NaNs excluded), displayed as a single-row frame.
skewness_dict = {}
for col in financial_cols:
    skewness_dict[col] = stats.skew(df[col].dropna())
skew_df = pd.DataFrame([skewness_dict])
display(skew_df)

# log(1 + x) compresses the long right tail; the skewness of each new log_<col> column
# is collected so it can be compared with the raw figures above.
# NOTE(review): np.log1p gives NaN below -1 and -inf at -1; log_total_assets elsewhere in
# this notebook clips at 0 first -- confirm whether these columns can be negative.
log_skewness_dict = {}
for col in financial_cols:
    log_col = f'log_{col}'
    df[log_col] = np.log1p(df[col])
    log_skewness_dict[log_col] = stats.skew(df[log_col].dropna())
log_skew_df = pd.DataFrame([log_skewness_dict])
display(log_skew_df)

# Every date column as datetime64; values that cannot be parsed become NaT.
date_cols = ['register_date', 'second_register_date', 'isrd_brth_date', 'trmn_eff_date', 'second_trmn_eff_date']
for date_col in date_cols:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Keep ages in [0, 100] (both ends included); rows with an unknown age drop out as well
# because a NaN never satisfies the range test.
df = df[df['age_at_first_policy'].between(0, 100)]

# Categorical model inputs, plus the target, are cast to plain strings.
# Note that astype(str) also turns missing values into the literal strings 'nan' / 'None'.
cat_cols = [
    'product_category', 'prod_lob', 'client_seg', 'aum_band', 'agt_class', 'season_of_first_policy', 'client_seg_1', 'division_name','mkt_prod_hier', 'policy_status', 'channel', 'agent_segment']
for name in [*cat_cols, 'second_product_category']:
    if name not in df.columns:
        continue
    df[name] = df[name].astype(str)

# Columns earmarked for target encoding. Only the names are recorded at this point;
# the encoding itself is deferred until after the train/test split to avoid leakage.
high_cardinality_cols = [
    name
    for name in ('client_seg', 'client_seg_1', 'division_name', 'mkt_prod_hier')
    if name in df.columns
]
print(f"Columns identified for target encoding: {high_cardinality_cols}")
# Interaction / ratio features. Each one is derived only when its inputs exist and is
# otherwise created as an all-NaN column; infinities are always mapped to NaN.
def _finite(series):
    """Return ``series`` with +inf / -inf replaced by NaN."""
    return series.replace([np.inf, -np.inf], np.nan)


present = set(df.columns)

# age x total assets
if {"age_at_first_policy", "wc_total_assets"} <= present:
    df["age_assets"] = _finite(df["age_at_first_policy"] * df["wc_total_assets"])
else:
    df["age_assets"] = np.nan

# age x share of assets held in stocks
if {"age_at_first_policy", "stock_allocation_ratio"} <= present:
    df["age_equity_ratio"] = _finite(df["age_at_first_policy"] * df["stock_allocation_ratio"])
else:
    df["age_equity_ratio"] = np.nan

# Placeholder only: the real value needs a reference date taken from the training
# rows, so it is filled in after the train/test split.
df["days_since_first_policy"] = np.nan
if "register_date" in present:
    print("days_since_first_policy will be calculated after train/test split")

# log(1 + assets), with negative totals clipped to 0 before the log
if "wc_total_assets" in present:
    df["log_total_assets"] = _finite(np.log1p(df["wc_total_assets"].clip(lower=0)))
else:
    df["log_total_assets"] = np.nan

# stocks : bonds; a zero bond share is treated as missing so the division cannot blow up
if {"stock_allocation_ratio", "bond_allocation_ratio"} <= present:
    bond_ratio = df["bond_allocation_ratio"].replace(0, np.nan)
    df["equity_to_bond_ratio"] = _finite(df["stock_allocation_ratio"] / bond_ratio)
else:
    df["equity_to_bond_ratio"] = np.nan

# if {"premium_amount", "income_estimate"}.issubset(df.columns):
#     income = df["income_estimate"].replace(0, np.nan)
#     df["premium_to_income"] = (df["premium_amount"] / income).replace([np.inf, -np.inf], np.nan)
# else:
#     df["premium_to_income"] = np.nan


# NOTE: Clustering will be done AFTER train/test split to avoid data leakage.
# This cell only records which features will be used for clustering.
cluster_features = [
    "age_at_first_policy",
    "wc_total_assets",
    "stock_allocation_ratio",
    "bond_allocation_ratio",
    "annuity_allocation_ratio",
    "mutual_fund_allocation_ratio",
    "aum_to_asset_ratio",
    "policy_value_to_assets_ratio",
    "age_assets",
    "age_equity_ratio",
    "log_total_assets",
    "equity_to_bond_ratio",
    "days_since_first_policy"
]
available_cluster_features = [col for col in cluster_features if col in df.columns]
print(f"Features identified for clustering: {available_cluster_features}")

# Drop the log-transformed columns that are not carried forward.
# errors='ignore': a log_<col> column only exists when its source column was present
# (the log transform is guarded by `if col in df.columns`), so a missing source
# column must not abort the notebook here with a KeyError.
df = df.drop(
    columns=[
        'log_wc_assetmix_stocks',
        'log_wc_assetmix_bonds',
        'log_wc_assetmix_mutual_funds',
        'log_wc_assetmix_deposits',
        'log_wc_assetmix_other_assets',
        'log_acct_val_amt'
    ],
    errors='ignore'
)


# Install required packages
!pip install catboost scikit-learn imbalanced-learn
# Modeling frame: start from rows whose second-product category is not null.
print(f"Total rows: {len(df)}")
has_second_product = df['second_product_category'].notna()
df_trainable = df.loc[has_second_product].copy()
print(f"Rows with second_product_category: {len(df_trainable)}")

# Keep recognised labels only; this is what removes stringified missing values
# such as 'None' / 'nan'.
valid_targets = ['DISABILITY', 'HEALTH', 'INVESTMENT', 'LIFE_INSURANCE', 'NETWORK_PRODUCTS', 'RETIREMENT', 'OTHER']
is_valid_target = df_trainable['second_product_category'].isin(valid_targets)
df_trainable = df_trainable.loc[is_valid_target].copy()
print(f"Rows after filtering valid targets: {len(df_trainable)}")

# ===== CLASS MERGING =====
# DISABILITY and HEALTH are folded into a single OTHER_HEALTH class to soften the
# class imbalance; counts are printed before and after for reference.
print("\n=== Merging Rare Classes ===")
target_before = df_trainable['second_product_category']
print(f"Before merging - DISABILITY: {(target_before == 'DISABILITY').sum()}")
print(f"Before merging - HEALTH: {(target_before == 'HEALTH').sum()}")

rare_class_map = {
    'DISABILITY': 'OTHER_HEALTH',
    'HEALTH': 'OTHER_HEALTH'
}
df_trainable['second_product_category'] = df_trainable['second_product_category'].replace(rare_class_map)

target_after = df_trainable['second_product_category']
print(f"After merging - OTHER_HEALTH: {(target_after == 'OTHER_HEALTH').sum()}")

# Label set as it stands after the merge.
valid_targets = ['OTHER_HEALTH', 'INVESTMENT', 'LIFE_INSURANCE', 'NETWORK_PRODUCTS', 'RETIREMENT', 'OTHER']

# Class balance after merging, as counts and as percentages.
print("\n=== Class Distribution (After Merging) ===")
class_dist = df_trainable['second_product_category'].value_counts()
print(class_dist)
print(f"\nClass percentages:\n{class_dist / len(df_trainable) * 100}")

# Rows lacking any of the must-have features are removed.
critical_cols = ['age_at_first_policy', 'years_to_second', 'product_category']
df_trainable = df_trainable.dropna(subset=critical_cols)
print(f"\nRows after dropping missing critical features: {len(df_trainable)}")

# NOTE: DO NOT create features using second_product_category (the target variable).
# This would cause data leakage. Features should only use information from the FIRST product.
# Examples of what NOT to do:
# - is_same_category = (product_category == second_product_category)  # LEAKAGE!
# - product_transition = product_category + '_TO_' + second_product_category  # LEAKAGE!

# ===== ADDITIONAL FEATURE ENGINEERING (NO LEAKAGE) =====
# Calendar features taken from the FIRST policy's register_date only.
if 'register_date' in df_trainable.columns:
    df_trainable['register_date'] = pd.to_datetime(df_trainable['register_date'], errors='coerce')
    df_trainable['register_month'] = df_trainable['register_date'].dt.month
    df_trainable['register_quarter'] = df_trainable['register_date'].dt.quarter
    df_trainable['register_day_of_week'] = df_trainable['register_date'].dt.dayofweek  # Monday=0 ... Sunday=6
    df_trainable['register_year'] = df_trainable['register_date'].dt.year
    print("Temporal features added from register_date")

# Age bands as a categorical companion to the raw age.
if 'age_at_first_policy' in df_trainable.columns:
    df_trainable['age_band'] = pd.cut(
        df_trainable['age_at_first_policy'],
        bins=[0, 30, 40, 50, 60, 70, 100],
        labels=['<30', '30-40', '40-50', '50-60', '60-70', '70+'],
        # Bins are right-closed, so without this an age of exactly 0 (which the
        # earlier 0..100 age filter keeps) would fall outside the first bin and
        # end up as the string 'nan' after the cast below.
        include_lowest=True
    )
    # Cast to plain strings so the band can be used like the other categorical features.
    df_trainable['age_band'] = df_trainable['age_band'].astype(str)
    print("Age bands created")


In [0]:
# Train/Test split BEFORE any feature engineering to prevent data leakage
from sklearn.model_selection import train_test_split

# Size of the rarest target class decides whether the split can be stratified.
class_counts = df_trainable['second_product_category'].value_counts()
min_class_count = class_counts.min()
print(f"Minimum class count: {min_class_count}")

# Stratify on the target when every class has at least 2 rows; otherwise fall back
# to a plain random split (stratify=None is train_test_split's default).
if min_class_count >= 2:
    stratify_labels = df_trainable['second_product_category']
else:
    stratify_labels = None

train_df, val_df = train_test_split(
    df_trainable,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)
if stratify_labels is None:
    print("Warning: Some classes too small for stratification, using random split")

# Report the outcome of the 80/20 split.
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")
print(f"\nTrain class distribution:\n{train_df['second_product_category'].value_counts()}")
print(f"\nVal class distribution:\n{val_df['second_product_category'].value_counts()}")

# days_since_first_policy: days from each first-policy registration to the latest
# registration date seen in the TRAINING rows. The same reference date is applied to
# the validation rows, so nothing from validation feeds into the feature.
if "register_date" in train_df.columns:
    reference_date = train_df["register_date"].max()  # taken from train only
    for frame in (train_df, val_df):
        elapsed_days = (reference_date - frame["register_date"]).dt.days
        frame["days_since_first_policy"] = elapsed_days.replace([np.inf, -np.inf], np.nan)
    print("days_since_first_policy calculated using train data reference date")

# Median imputation for allocation ratio columns (fit on train, apply to val)
from sklearn.impute import SimpleImputer

# Ratio columns that receive median imputation, limited to those actually present.
allocation_cols = [
    name
    for name in (
        'stock_allocation_ratio', 'bond_allocation_ratio', 'annuity_allocation_ratio',
        'mutual_fund_allocation_ratio', 'aum_to_asset_ratio', 'policy_value_to_assets_ratio',
    )
    if name in train_df.columns
]

if allocation_cols:
    # Medians are learned from the training rows only and then applied to both sets.
    imputer = SimpleImputer(strategy='median')
    imputer.fit(train_df[allocation_cols])
    train_df[allocation_cols] = imputer.transform(train_df[allocation_cols])
    val_df[allocation_cols] = imputer.transform(val_df[allocation_cols])
    print(f"Imputed {len(allocation_cols)} allocation ratio columns")


# Create propensity features from TRAIN data only (to prevent leakage)
def create_propensity_features(train_data):
    """Build cross-sell lookup tables from the training rows.

    Args:
        train_data: DataFrame with the columns ``product_category``,
            ``second_product_category``, ``agt_no`` and ``branchoffice_code``.

    Returns:
        A 4-tuple of DataFrames:
        * ``prod_counts``   - rows per first product (``p1_cross_sell_popularity``)
        * ``most_common``   - per first product, the second product with the highest
          row count (``p1_most_common_next_prod``)
        * ``agent_counts``  - rows per (agent, first product)
          (``agent_p1_cross_sell_count``)
        * ``branch_counts`` - rows per (branch, first product)
          (``branch_p1_cross_sell_count``)
    """
    def _group_sizes(keys, count_name):
        # Number of rows per key combination, as a flat frame with a named count column.
        return train_data.groupby(keys).size().reset_index(name=count_name)

    prod_counts = _group_sizes('product_category', 'p1_cross_sell_popularity')

    # Rank (first, second) product pairs by frequency and keep the top pair per first product.
    pair_counts = _group_sizes(['product_category', 'second_product_category'], 'count')
    top_pairs = pair_counts.sort_values('count', ascending=False).drop_duplicates('product_category')
    most_common = top_pairs[['product_category', 'second_product_category']].rename(
        columns={'second_product_category': 'p1_most_common_next_prod'}
    )

    agent_counts = _group_sizes(['agt_no', 'product_category'], 'agent_p1_cross_sell_count')
    branch_counts = _group_sizes(['branchoffice_code', 'product_category'], 'branch_p1_cross_sell_count')

    return prod_counts, most_common, agent_counts, branch_counts

def add_propensity_features(df, prod_counts, most_common, agent_counts, branch_counts):
    """Left-join the propensity lookup tables onto ``df`` and fill the gaps.

    Args:
        df: DataFrame with ``product_category``, ``agt_no`` and ``branchoffice_code``.
        prod_counts: lookup keyed by ``product_category``.
        most_common: lookup keyed by ``product_category``.
        agent_counts: lookup keyed by ``agt_no`` + ``product_category``.
        branch_counts: lookup keyed by ``branchoffice_code`` + ``product_category``.

    Returns:
        A new DataFrame (``df`` itself is not modified) with the four propensity
        columns added. Keys with no match get 0 for the count columns and
        ``'UNKNOWN'`` for ``p1_most_common_next_prod``.
    """
    lookups = (
        (prod_counts, 'product_category'),
        (most_common, 'product_category'),
        (agent_counts, ['agt_no', 'product_category']),
        (branch_counts, ['branchoffice_code', 'product_category']),
    )
    enriched = df
    for table, keys in lookups:
        enriched = enriched.merge(table, on=keys, how='left')

    # Defaults for rows whose key was not present in a lookup table.
    defaults = (
        ('p1_cross_sell_popularity', 0),
        ('agent_p1_cross_sell_count', 0),
        ('branch_p1_cross_sell_count', 0),
        ('p1_most_common_next_prod', 'UNKNOWN'),
    )
    for column, default in defaults:
        enriched[column] = enriched[column].fillna(default)

    return enriched

# Propensity lookups are built from the training rows and joined onto both sets.
# NOTE(review): the lookups are also joined back onto the very rows they were computed
# from, so each training row's own target contributes to p1_most_common_next_prod
# (and to agent_most_common_cross_sell below) -- confirm this is acceptable.
prod_counts, most_common, agent_counts, branch_counts = create_propensity_features(train_df)
train_df, val_df = [
    add_propensity_features(frame, prod_counts, most_common, agent_counts, branch_counts)
    for frame in (train_df, val_df)
]
print("Propensity features added")

# ===== Per-product averages (computed on train only) =====
product_interactions = (
    train_df.groupby('product_category')
    .agg(
        avg_assets_by_product=('wc_total_assets', 'mean'),
        avg_age_by_product=('age_at_first_policy', 'mean'),
        avg_stock_ratio_by_product=('stock_allocation_ratio', 'mean'),
    )
    .reset_index()
)
train_df = train_df.merge(product_interactions, on='product_category', how='left')
val_df = val_df.merge(product_interactions, on='product_category', how='left')
print("Product category interaction features added")


# ===== Per-agent statistics (computed on train only) =====
def _most_frequent_label(labels):
    """Most frequent value in ``labels``; 'UNKNOWN' when there are no rows."""
    return labels.value_counts().index[0] if len(labels) > 0 else 'UNKNOWN'


agent_stats = (
    train_df.groupby('agt_no')
    .agg(
        agent_most_common_cross_sell=('second_product_category', _most_frequent_label),
        agent_avg_assets=('wc_total_assets', 'mean'),
    )
    .reset_index()
)
# Agents that only occur in validation get NaN for both columns after the left join.
train_df = train_df.merge(agent_stats, on='agt_no', how='left')
val_df = val_df.merge(agent_stats, on='agt_no', how='left')
print("Agent performance features added")

# ===== First product x AUM band, as one combined categorical =====
for frame in (train_df, val_df):
    frame['product_aum_interaction'] = frame['product_category'].astype(str) + '_' + frame['aum_band'].astype(str)
print("Product × AUM interaction feature added")

# Target encoding with 5-fold regularization (ONLY on train data)
from sklearn.model_selection import KFold

def target_encode_with_kfold(train_data, val_data, cols_to_encode, target_col, smoothing=20):
    """Smoothed one-vs-rest target encoding, out-of-fold on train and full-train on val.

    For every column in ``cols_to_encode`` that exists in ``train_data`` and every
    non-null class of ``target_col``, a column ``te_<col>_<class>`` is added. Its value
    is the share of rows in the category whose target equals the class, shrunk towards
    the class's overall rate in ``train_data`` with ``smoothing`` pseudo-observations.

    * Train rows are encoded out-of-fold (5 shuffled folds, ``random_state=42``), so a
      row's encoding is computed from the other four folds only.
    * Validation rows are encoded from statistics over all of ``train_data``.
    * Anything left unmapped (category unseen in the fitting rows, or a train row with
      a null target) is filled with the class's overall rate.

    Args:
        train_data: training DataFrame; its index labels are used for row selection.
        val_data: validation DataFrame.
        cols_to_encode: names of categorical columns to encode; names missing from
            ``train_data`` are skipped.
        target_col: name of the multi-class target column.
        smoothing: weight of the overall rate in the smoothed estimate.

    Returns:
        ``(train_encoded, val_encoded)`` - copies of the inputs with the new columns.
    """
    def _smoothed_rates(flags, categories, prior):
        # Per-category mean of the 0/1 flags, pulled towards `prior` by `smoothing`.
        per_category = flags.groupby(categories).agg(['mean', 'count'])
        weighted = per_category['mean'] * per_category['count'] + prior * smoothing
        return weighted / (per_category['count'] + smoothing)

    train_encoded, val_encoded = train_data.copy(), val_data.copy()
    class_labels = sorted(train_data[target_col].dropna().unique())
    # Index labels of the rows that have a target; only these take part in the folds.
    labelled_rows = train_data.index[train_data[target_col].notnull()].to_numpy()

    for col in (name for name in cols_to_encode if name in train_data.columns):
        for class_val in class_labels:
            te_col = f"te_{col}_{class_val}"
            train_encoded[te_col] = np.nan

            # 1 where the row belongs to this class, 0 otherwise.
            is_class = (train_data[target_col] == class_val).astype(int)
            global_mean = is_class.mean()

            # Out-of-fold pass over the training rows.
            splitter = KFold(n_splits=5, shuffle=True, random_state=42)
            for fit_pos, holdout_pos in splitter.split(labelled_rows):
                fit_rows = labelled_rows[fit_pos]
                holdout_rows = labelled_rows[holdout_pos]
                fold_rates = _smoothed_rates(
                    is_class.loc[fit_rows], train_data.loc[fit_rows, col], global_mean
                )
                train_encoded.loc[holdout_rows, te_col] = train_data.loc[holdout_rows, col].map(fold_rates)

            train_encoded[te_col] = train_encoded[te_col].fillna(global_mean)

            # Validation rows use rates estimated on the complete training set.
            full_rates = _smoothed_rates(is_class, train_data[col], global_mean)
            val_encoded[te_col] = val_data[col].map(full_rates).fillna(global_mean)

    return train_encoded, val_encoded

# Target-encode the high-cardinality columns that exist in the training frame.
# smoothing=50 (the function's default is 20) pulls sparse categories harder towards
# the overall class rate, which limits overfitting on the minority classes.
high_cardinality_cols = [name for name in high_cardinality_cols if name in train_df.columns]
if len(high_cardinality_cols) > 0:
    train_df, val_df = target_encode_with_kfold(
        train_data=train_df,
        val_data=val_df,
        cols_to_encode=high_cardinality_cols,
        target_col='second_product_category',
        smoothing=50,
    )
    print(f"Target encoding applied to {len(high_cardinality_cols)} columns with smoothing=50")


# Customer clustering (fit on train, apply to val).
# Adds a string-valued 'client_cluster' column to both frames: the KMeans
# cluster id when cluster features are available, otherwise 'UNKNOWN'.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

available_cluster_features = [col for col in available_cluster_features if col in train_df.columns]

if available_cluster_features:
    # Imputation values come from train only and are computed once, then reused
    # for val. A feature that is entirely NaN in train has a NaN median; fall
    # back to 0 for it so no NaN reaches KMeans (which rejects NaN input).
    cluster_fill_values = train_df[available_cluster_features].median().fillna(0)
    cluster_train = train_df[available_cluster_features].fillna(cluster_fill_values)
    cluster_val = val_df[available_cluster_features].fillna(cluster_fill_values)

    # Standardize with statistics learned on train only
    scaler = StandardScaler()
    cluster_train_scaled = scaler.fit_transform(cluster_train)
    cluster_val_scaled = scaler.transform(cluster_val)

    # Cluster; ids are cast to str so the column is treated as categorical
    kmeans = KMeans(n_clusters=8, random_state=42, n_init=10)
    train_df['client_cluster'] = kmeans.fit_predict(cluster_train_scaled).astype(str)
    val_df['client_cluster'] = kmeans.predict(cluster_val_scaled).astype(str)
    print("Clustering applied")
else:
    train_df['client_cluster'] = 'UNKNOWN'
    val_df['client_cluster'] = 'UNKNOWN'

# Per-class weights for the classifier.
# weight = sqrt(total_samples / (num_classes * class_count)), then clamped to
# [min_weight_floor, max_weight_cap]. The square root and the clamp both damp
# the effect of extreme class imbalance.
class_counts = train_df['second_product_category'].value_counts()
total_samples = len(train_df)
num_classes = len(class_counts)

max_weight_cap = 10.0   # upper bound on any class weight
min_weight_floor = 0.5  # lower bound on any class weight

class_weights_dict = {
    class_name: min(
        max(np.sqrt(total_samples / (num_classes * count)), min_weight_floor),
        max_weight_cap,
    )
    for class_name, count in class_counts.items()
}

print("Class weights (capped and sqrt-scaled):")
_weights_descending = sorted(class_weights_dict.items(), key=lambda item: item[1], reverse=True)
for k, v in _weights_descending:
    print(f"  {k}: {v:.4f} (count: {class_counts[k]}, ratio: {class_counts[k]/total_samples:.4f})")

# The dict is handed to the model's class_weights parameter later; it is not
# attached to the rows as per-sample weights.
print("\nClass weights computed (will be used in model, not as sample weights)")

# Snapshot of every column currently in train_df
feature_cols = list(train_df.columns)

# Categorical candidates that survived so far, plus two engineered categoricals.
# NOTE: categorical_feature_cols is reassigned from an explicit list further
# down in this file, so this value is only a preliminary one.
categorical_feature_cols = [col for col in cat_cols if col in feature_cols]
categorical_feature_cols.extend(['p1_most_common_next_prod', 'client_cluster'])


# print(f"Total features: {len(feature_cols)}")
# print(f"  - Categorical: {len(categorical_feature_cols)}")
# print(f"  - Numeric: {len(numeric_feature_cols)}")
# print(f"  - Target encoded: {len(target_encoded_cols)}")
# Columns dropped from both frames before modelling.
cols_to_be_removed = [
    # identifier-like and date columns
    'branchoffice_code',
    'agt_no',
    'axa_party_id',
    'policy_no',
    'register_date',
    'trmn_eff_date',
    # raw amount / asset-mix columns
    'acct_val_amt',
    'face_amt',
    'cash_val_amt',
    'wc_total_assets',
    'wc_assetmix_stocks',
    'wc_assetmix_bonds',
    'wc_assetmix_mutual_funds',
    'wc_assetmix_annuity',
    'wc_assetmix_deposits',
    'wc_assetmix_other_assets',
    'business_month',
    'Product',
    # columns describing the second policy (second_* prefix)
    'second_policy_no',
    'second_register_date',
    'second_trmn_eff_date',
    'second_wti_lob_txt',
    'second_prod_lob',
    'second_sub_product_level_1',
    'second_sub_product_level_2',
    'second_Product',
    'age_at_second_policy',
    'isrd_brth_date',
]

# NOTE(review): every te_* column is added to the drop list as well, so the
# target-encoded features appear to be discarded before modelling - confirm
# this is intended.
target_encoded_cols = [col for col in train_df.columns if col.startswith("te_")]
cols_to_be_removed.extend(target_encoded_cols)

# errors='ignore' skips names that are absent from a given frame
train_df = train_df.drop(columns=cols_to_be_removed, errors='ignore')
val_df = val_df.drop(columns=cols_to_be_removed, errors='ignore')

display(train_df.shape)



# Candidate categorical features; only those present in train_df are kept.
# NOTE(review): 'cross_sell_timing_category' (here) and 'years_to_second' /
# 'days_between_policies' (numerical list) are presumably derived from the
# timing of the second policy - confirm they do not leak the target.
_categorical_candidates = (
    'wti_lob_txt',
    'prod_lob',
    'agt_class',
    'client_seg',
    'client_seg_1',
    'aum_band',
    'sub_product_level_1',
    'sub_product_level_2',
    'division_name',
    'mkt_prod_hier',
    'policy_status',
    'channel',
    'agent_segment',
    'product_category',
    'season_of_first_policy',
    'p1_most_common_next_prod',
    'client_cluster',
    'age_band',
    'register_month',
    'register_quarter',
    'register_day_of_week',
    'cross_sell_timing_category',
    'agent_most_common_cross_sell',
    'product_aum_interaction',
)
categorical_feature_cols = [name for name in _categorical_candidates if name in train_df.columns]

# Candidate numerical features. This list is NOT filtered against train_df;
# downstream code checks column membership before using each entry.
numerical_feature_cols = [
    'psn_age',
    'stock_allocation_ratio',
    'bond_allocation_ratio',
    'annuity_allocation_ratio',
    'mutual_fund_allocation_ratio',
    'aum_to_asset_ratio',
    'policy_value_to_assets_ratio',
    'age_at_first_policy',
    'years_to_second',
    'log_wc_total_assets',
    'log_wc_assetmix_annuity',
    'log_face_amt',
    'log_cash_val_amt',
    'age_assets',
    'age_equity_ratio',
    'days_since_first_policy',
    'log_total_assets',
    'equity_to_bond_ratio',
    'p1_cross_sell_popularity',
    'agent_p1_cross_sell_count',
    'branch_p1_cross_sell_count',
    # from the SQL query
    'total_policies_per_client',
    'days_between_policies',
    # product interaction aggregates
    'avg_assets_by_product',
    'avg_age_by_product',
    'avg_stock_ratio_by_product',
    # agent statistics
    'agent_avg_assets',
]

print("Categorical columns:", categorical_feature_cols)
print("Numerical columns:", numerical_feature_cols)

# ===== IMPROVED MISSING VALUE HANDLING =====
# Numerical features: record a 0/1 "<col>_is_missing" flag first, then impute
# with the TRAIN median (the same value is applied to val).
missing_indicator_cols = []
for col in numerical_feature_cols:
    if col not in train_df.columns:
        continue

    indicator_name = f'{col}_is_missing'
    train_df[indicator_name] = train_df[col].isna().astype(int)
    val_df[indicator_name] = val_df[col].isna().astype(int)
    missing_indicator_cols.append(indicator_name)

    median_val = train_df[col].median()
    train_df[col] = train_df[col].fillna(median_val)
    val_df[col] = val_df[col].fillna(median_val)

# The indicator flags are themselves treated as numerical features
numerical_feature_cols += missing_indicator_cols

# Categorical features: impute with the TRAIN mode, or 'UNKNOWN' when the
# column has no mode at all (e.g. it is entirely NaN).
for col in categorical_feature_cols:
    if col not in train_df.columns:
        continue

    _col_modes = train_df[col].mode()
    mode_val = _col_modes[0] if len(_col_modes) > 0 else 'UNKNOWN'
    train_df[col] = train_df[col].fillna(mode_val)
    val_df[col] = val_df[col].fillna(mode_val)

print(f"Missing values handled. Added {len(missing_indicator_cols)} missing indicators.")

# Model input columns: everything left in train_df except target and weight
_non_feature_columns = ('second_product_category', 'class_weight')
pool_columns = [col for col in train_df.columns if col not in _non_feature_columns]

# ===== ACTIVATE SMOTE for minority classes =====
# Oversamples rare target classes in train_df (val_df is never touched) and
# then recomputes class_weights_dict on the resampled data.
#
# Categorical features are handled with SMOTENC rather than plain SMOTE on
# label-encoded codes: SMOTE interpolates numerically between neighbours, so a
# synthetic row could end up with an arbitrary category that merely lies
# "between" two codes in label order. SMOTENC interpolates only the continuous
# features and assigns each categorical feature from the nearest neighbours.
from imblearn.over_sampling import SMOTE, SMOTENC
from collections import Counter
from sklearn.preprocessing import LabelEncoder  # no longer needed here; kept in case other cells rely on it
import pandas as pd

print("\n=== Before SMOTE ===")
print(f"Train class distribution:\n{train_df['second_product_category'].value_counts()}")
print(f"Train size: {len(train_df)}")

# Calculate target sample sizes for SMOTE:
# classes smaller than 10% of the majority class are upsampled to 15% of it.
smote_class_counts = train_df['second_product_category'].value_counts()
majority_class_size = smote_class_counts.max()
target_sizes = {}
for class_name, count in smote_class_counts.items():
    if count < majority_class_size * 0.1:  # If less than 10% of majority
        if count < 2:
            # SMOTE interpolates between a sample and a same-class neighbour,
            # so a class with a single sample cannot be oversampled.
            print(f"  {class_name}: {count} -> skipped (needs at least 2 samples for SMOTE)")
            continue
        target_sizes[class_name] = int(majority_class_size * 0.15)  # Upsample to 15% of majority
        print(f"  {class_name}: {count} -> {target_sizes[class_name]} (upsampling)")

if target_sizes:
    # Store original train_df before SMOTE (for reference)
    train_df_original = train_df.copy()

    # Prepare data for SMOTE
    X_train_smote = train_df[pool_columns].copy()
    y_train_smote = train_df['second_product_category'].copy()

    # Separate numerical and categorical columns
    numerical_cols_smote = [col for col in pool_columns if col in numerical_feature_cols]
    categorical_cols_smote = [col for col in pool_columns if col in categorical_feature_cols]

    # Categoricals are passed as strings; any remaining NaN becomes 'UNKNOWN'
    # (a valid category of its own). Numerical columns stay as-is.
    X_train_smote_encoded = X_train_smote.copy()
    for col in categorical_cols_smote:
        mask = X_train_smote_encoded[col].isna()
        X_train_smote_encoded[col] = X_train_smote_encoded[col].astype(str)
        X_train_smote_encoded.loc[mask, col] = 'UNKNOWN'

    # k_neighbors must be smaller than the smallest class being resampled.
    # Every class in target_sizes has at least 2 samples, so this is >= 1.
    smallest_resampled_class = int(min(smote_class_counts[name] for name in target_sizes))
    safe_k_neighbors = min(3, smallest_resampled_class - 1)

    # Positional indices of the categorical columns within the feature matrix
    categorical_positions = [pool_columns.index(col) for col in categorical_cols_smote]
    if categorical_positions:
        smote = SMOTENC(
            categorical_features=categorical_positions,
            sampling_strategy=target_sizes,
            random_state=42,
            k_neighbors=safe_k_neighbors
        )
    else:
        # No categorical features present: plain SMOTE is appropriate
        smote = SMOTE(
            sampling_strategy=target_sizes,
            random_state=42,
            k_neighbors=safe_k_neighbors
        )

    print("\nApplying SMOTE...")
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote_encoded, y_train_smote)

    print("\n=== After SMOTE ===")
    print(f"Train class distribution:\n{Counter(y_train_resampled)}")
    print(f"Original train size: {len(train_df)}, After SMOTE: {len(X_train_resampled)}")

    # Reconstruct train_df from resampled data; categorical columns already
    # hold real category values, so no decoding step is required.
    train_df_resampled = pd.DataFrame(X_train_resampled, columns=pool_columns)

    # Add target
    train_df_resampled['second_product_category'] = y_train_resampled

    # Replace train_df with resampled version
    train_df = train_df_resampled.copy()
    print("SMOTE applied successfully - train_df updated with resampled data")

    # Recalculate class weights after SMOTE (same sqrt-scaled, capped formula
    # as before, now based on the resampled class counts)
    class_counts = train_df['second_product_category'].value_counts()
    total_samples = len(train_df)
    num_classes = len(class_counts)

    class_weights_dict = {}
    for class_name, count in class_counts.items():
        balanced_weight = total_samples / (num_classes * count)
        sqrt_scaled_weight = np.sqrt(balanced_weight)
        capped_weight = min(max(sqrt_scaled_weight, min_weight_floor), max_weight_cap)
        class_weights_dict[class_name] = capped_weight

    print("\nClass weights recalculated after SMOTE:")
    for k, v in sorted(class_weights_dict.items(), key=lambda x: x[1], reverse=True):
        print(f"  {k}: {v:.4f} (count: {class_counts[k]})")
else:
    print("No classes need SMOTE upsampling - all classes are sufficiently represented")


In [None]:
# Additional recommendations for improving F1 score to >80%:
# 
# 1. **SMOTE/ADASYN for minority classes**: SMOTE oversampling is already active (see the list of
#    implemented improvements below); consider ADASYN if the minority classes
#    (DISABILITY, HEALTH) remain problematic after the current improvements
#
# 2. **Ensemble methods**: Combine multiple models (CatBoost + XGBoost + LightGBM) for better performance
#
# 3. **Feature engineering (NO LEAKAGE)**: 
#    - Add temporal features from FIRST policy only: month, day of week, quarter
#    - Add interaction features: product_category * age, product_category * total_assets
#    - Add aggregated features from FIRST product: average assets by product_category (from train only)
#    - Add client behavior features: time since first policy, age bands, asset allocation patterns
#    - NEVER use second_product_category in feature creation (data leakage!)
#
# 4. **Hyperparameter tuning**: Use Optuna or GridSearchCV to find optimal hyperparameters
#
# 5. **Threshold tuning**: For each class, find optimal probability threshold instead of using default 0.5
#
# 6. **Cost-sensitive learning**: Adjust class weights based on business value of each prediction
#
# 7. **Remove or merge rare classes**: DISABILITY and HEALTH are already merged into a single
#    "OTHER_HEALTH" class (see below); consider removing or merging any other rare classes
#
# 8. **More data**: If possible, collect more samples for minority classes
#
# Current improvements implemented:
# - Removed "None" class
# - CLASS MERGING: Merged DISABILITY + HEALTH → OTHER_HEALTH (active)
# - SMOTE OVERSAMPLING: Activated for minority classes (active)
# - Better class weight calculation with smoothing (sqrt-scaled, capped)
# - Removed data leakage features (product_transition, is_same_category)
# - Improved target encoding smoothing (smoothing=50)
# - Optimized CatBoost hyperparameters for imbalanced data
# - Added feature importance and confusion matrix analysis
# - Added temporal features (month, quarter, day_of_week from register_date)
# - Added age bands (categorical age groups)
# - Added missing value indicators (before imputation)
# - Removed max_leaves parameter (incompatible with SymmetricTree)
# - Added ensemble model code (commented, ready to use)
# - SQL Query improvements: pre-cleaned lookup, total policy count, time-based features
# - Fixed data leakage: days_since_first_policy now uses train data only
# - Added product category interaction features
# - Added agent performance features
# - Added product × AUM interaction features
# - Added threshold tuning code (commented, ready to use)


In [None]:
# ===== ENSEMBLE MODELS (OPTIONAL - UNCOMMENT TO USE) =====
# Ensemble can improve performance by 5-10% over single model
# Uncomment below to train ensemble of CatBoost + XGBoost + LightGBM
#
# Run this only AFTER the CatBoost cell: it relies on cat_model_regularized,
# val_pool and f1_macro defined there.
#
# The soft vote is computed by hand (weighted average of predict_proba) rather
# than with sklearn's VotingClassifier: VotingClassifier must be fit before it
# can predict, and fitting it would retrain CatBoost on encoded labels without
# its categorical-feature configuration.

"""
# Install additional libraries
!pip install xgboost lightgbm

import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Integer-encode the target for XGBoost and LightGBM
le = LabelEncoder()
y_train_encoded = le.fit_transform(train_df['second_product_category'])
y_val_encoded = le.transform(val_df['second_product_category'])

# Get feature columns (exclude target)
X_train = train_df[pool_columns].copy()
X_val = val_df[pool_columns].copy()

# Encode categorical features for XGBoost/LightGBM (they cannot consume raw
# strings). Categories unseen in train are mapped to -1.
ensemble_cat_cols = [col for col in pool_columns if col in categorical_feature_cols]
cat_indices = [pool_columns.index(col) for col in ensemble_cat_cols]
if ensemble_cat_cols:
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train[ensemble_cat_cols] = ordinal_encoder.fit_transform(X_train[ensemble_cat_cols].astype(str))
    X_val[ensemble_cat_cols] = ordinal_encoder.transform(X_val[ensemble_cat_cols].astype(str))
    X_train[ensemble_cat_cols] = X_train[ensemble_cat_cols].astype(int)
    X_val[ensemble_cat_cols] = X_val[ensemble_cat_cols].astype(int)

# 1. XGBoost
xgb_model = XGBClassifier(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.05,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
    early_stopping_rounds=100,
    use_label_encoder=False
)

# 2. LightGBM
lgb_model = LGBMClassifier(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.05,
    objective='multiclass',
    random_state=42,
    verbose=-1,
    early_stopping_rounds=100
)

# Train individual models
print("Training XGBoost...")
xgb_model.fit(
    X_train, y_train_encoded,
    eval_set=[(X_val, y_val_encoded)],
    verbose=False
)

print("Training LightGBM...")
lgb_model.fit(
    X_train, y_train_encoded,
    eval_set=[(X_val, y_val_encoded)],
    categorical_feature=cat_indices
)

# Weighted soft vote; CatBoost is weighted more (handles categoricals better)
model_weights = np.array([2.0, 1.0, 1.0])

# CatBoost was trained on the original string labels: reorder its probability
# columns so they line up with the LabelEncoder's class order.
catboost_classes = list(cat_model_regularized.classes_)
catboost_column_order = [catboost_classes.index(label) for label in le.classes_]
catboost_proba = cat_model_regularized.predict_proba(val_pool)[:, catboost_column_order]
xgb_proba = xgb_model.predict_proba(X_val)
lgb_proba = lgb_model.predict_proba(X_val)

ensemble_proba = (
    model_weights[0] * catboost_proba
    + model_weights[1] * xgb_proba
    + model_weights[2] * lgb_proba
) / model_weights.sum()

# Evaluate ensemble
ensemble_pred = ensemble_proba.argmax(axis=1)
ensemble_pred_decoded = le.inverse_transform(ensemble_pred)

f1_macro_ensemble = f1_score(val_df['second_product_category'], ensemble_pred_decoded, average='macro')
print(f"\n=== Ensemble Performance ===")
print(f"Ensemble Macro F1: {f1_macro_ensemble:.4f}")
print(f"Single CatBoost Macro F1: {f1_macro:.4f}")
print(f"Improvement: {f1_macro_ensemble - f1_macro:.4f}")
"""


In [None]:
# ===== OPTIONAL: Per-class threshold tuning =====
# Uncomment to find optimal probability thresholds for each class
# This can improve Macro F1 by 5-15% for imbalanced datasets
#
# How the disabled code below works:
#   1. find_optimal_thresholds: for each class, treats the problem as
#      one-vs-rest and scans thresholds 0.10, 0.15, ... (np.arange(0.1, 0.9, 0.05)),
#      keeping the threshold with the highest binary F1 (0.5 if none beats 0).
#   2. Prediction: among the classes whose probability reaches their own
#      threshold, pick the one with the highest probability; if no class
#      qualifies, fall back to the plain argmax.
#
# Prerequisites: must run after the CatBoost cell, since it uses
# cat_model_regularized, val_probabilities and f1_macro.
#
# NOTE(review): thresholds are both selected and scored on the same data
# (val_df / val_probabilities), so the reported "Tuned Macro F1" is an
# optimistic estimate - consider tuning on a separate hold-out split.

"""
from sklearn.metrics import f1_score
import numpy as np

# Find optimal thresholds for each class
def find_optimal_thresholds(y_true, y_proba, classes):
    optimal_thresholds = {}
    for i, class_name in enumerate(classes):
        y_binary = (y_true == class_name).astype(int)
        proba_class = y_proba[:, i]
        
        best_threshold = 0.5
        best_f1 = 0
        
        for threshold in np.arange(0.1, 0.9, 0.05):
            y_pred_binary = (proba_class >= threshold).astype(int)
            f1 = f1_score(y_binary, y_pred_binary, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        
        optimal_thresholds[class_name] = best_threshold
        print(f"{class_name}: optimal threshold = {best_threshold:.3f}, F1 = {best_f1:.3f}")
    
    return optimal_thresholds

# Get class order from model
classes = cat_model_regularized.classes_
optimal_thresholds = find_optimal_thresholds(
    val_df['second_product_category'], 
    val_probabilities, 
    classes
)

# Apply optimal thresholds
val_predictions_tuned = []
for i in range(len(val_probabilities)):
    probs = val_probabilities[i]
    # Find class with highest probability above its threshold
    best_class_idx = -1
    best_score = -1
    for j, class_name in enumerate(classes):
        threshold = optimal_thresholds[class_name]
        if probs[j] >= threshold and probs[j] > best_score:
            best_score = probs[j]
            best_class_idx = j
    
    if best_class_idx == -1:
        # If no class meets threshold, use default prediction
        best_class_idx = np.argmax(probs)
    
    val_predictions_tuned.append(classes[best_class_idx])

val_predictions_tuned = np.array(val_predictions_tuned)
f1_macro_tuned = f1_score(val_df['second_product_category'], val_predictions_tuned, average='macro')
print(f"\n=== Threshold Tuning Results ===")
print(f"Original Macro F1: {f1_macro:.4f}")
print(f"Tuned Macro F1: {f1_macro_tuned:.4f}")
print(f"Improvement: {f1_macro_tuned - f1_macro:.4f}")
"""


In [0]:
%pip install catboost

from catboost import CatBoostClassifier, Pool

# Categorical features passed to CatBoost: only those that are model inputs
# (pool_columns already excludes the target and the weight column).
cat_feature_names = [col for col in categorical_feature_cols if col in pool_columns]


def _build_pool(frame):
    """Wrap a frame's feature columns and target in a CatBoost Pool.

    No per-row weights are attached: class weighting is done through the
    model's class_weights parameter, and supplying both would weight twice.
    """
    return Pool(
        data=frame[pool_columns],
        label=frame["second_product_category"],
        cat_features=cat_feature_names,
    )


train_pool = _build_pool(train_df)
val_pool = _build_pool(val_df)

from catboost import CatBoostClassifier

# CatBoost configuration for imbalanced multi-class classification.
# NOTE(review): per the CatBoost docs, "TotalF1" averages per-class F1 with
# the 'Weighted' scheme by default; the notebook reports Macro F1 - confirm
# whether "TotalF1:average=Macro" is what early stopping should track.
_catboost_params = {
    # boosting schedule
    "iterations": 2000,
    "learning_rate": 0.05,
    "early_stopping_rounds": 100,   # stop when the eval metric stops improving
    "use_best_model": True,
    # objective / metric
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1",
    "class_weights": class_weights_dict,  # capped, sqrt-scaled weights computed earlier
    # tree shape (max_leaves is omitted: it only applies to Lossguide)
    "depth": 7,
    "grow_policy": "SymmetricTree",
    "boosting_type": "Plain",
    "min_data_in_leaf": 50,
    # regularization / sampling
    "l2_leaf_reg": 3.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.85,
    "colsample_bylevel": 0.85,
    # runtime
    "random_seed": 42,
    "task_type": "CPU",
    "verbose": 100,
}
cat_model_regularized = CatBoostClassifier(**_catboost_params)

# Validation pool drives early stopping and best-iteration selection
cat_model_regularized.fit(train_pool, eval_set=val_pool, use_best_model=True)

print("Regularized CatBoost model trained")

# Score the trained CatBoost model on the validation set
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score

val_predictions = cat_model_regularized.predict(val_pool)
val_probabilities = cat_model_regularized.predict_proba(val_pool)

_val_target = val_df['second_product_category']

# F1 under both averaging schemes, plus accuracy and macro precision/recall
f1_macro = f1_score(_val_target, val_predictions, average='macro')
f1_weighted = f1_score(_val_target, val_predictions, average='weighted')
accuracy = accuracy_score(_val_target, val_predictions)
precision = precision_score(_val_target, val_predictions, average='macro')
recall = recall_score(_val_target, val_predictions, average='macro')

print(f"\n=== Model Performance ===")
print(f"Macro F1: {f1_macro:.4f}")
print(f"Weighted F1: {f1_weighted:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro Precision: {precision:.4f}")
print(f"Macro Recall: {recall:.4f}")

print("\n=== Classification Report ===")
print(classification_report(_val_target, val_predictions))

# Rank model inputs by CatBoost feature importance and show the top 20
print("\n=== Top 20 Feature Importances ===")
feature_importance = cat_model_regularized.get_feature_importance()
feature_names = pool_columns  # importances are paired with the Pool's columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)
print(importance_df.head(20))

# Per-class performance analysis: confusion matrix with labelled rows (true
# class) and columns (predicted class).
print("\n=== Per-Class Performance Analysis ===")
from sklearn.metrics import confusion_matrix

# Use the union of true and predicted labels and pass it explicitly. The model
# can predict a class that never occurs in the validation targets; labelling
# the matrix with the validation classes only would then mismatch its shape.
_val_predicted_labels = val_predictions.flatten()
classes = sorted(set(val_df['second_product_category'].unique()) | set(_val_predicted_labels))
cm = confusion_matrix(val_df['second_product_category'], _val_predicted_labels, labels=classes)
cm_df = pd.DataFrame(cm, index=classes, columns=classes)
print("\nConfusion Matrix:")
print(cm_df)

# Analyze prediction distribution: per-class true vs predicted counts.
print("\n=== Prediction Distribution ===")
pred_dist = pd.Series(val_predictions.flatten()).value_counts()
true_dist = val_df['second_product_category'].value_counts()

# Align both counts on the union of labels so a class that is never predicted
# (or never present in the validation targets) shows 0 instead of NaN.
_all_labels = true_dist.index.union(pred_dist.index)
pred_dist = pred_dist.reindex(_all_labels, fill_value=0)
true_dist = true_dist.reindex(_all_labels, fill_value=0)

_count_difference = pred_dist - true_dist
comparison_df = pd.DataFrame({
    'True Count': true_dist,
    'Predicted Count': pred_dist,
    'Difference': _count_difference,
    # A zero true count has no meaningful relative difference: report NaN
    # rather than dividing by zero.
    'Difference %': (_count_difference / true_dist.where(true_dist != 0) * 100).round(2)
})
print(comparison_df)

# If model is still over-predicting minority classes, consider:
# 1. Further reducing class weights (lower max_weight_cap)
# 2. Using SMOTE for oversampling
# 3. Merging very rare classes (DISABILITY + HEALTH)
# 4. Using cost-sensitive threshold tuning per class
# Preview of validation predictions next to the first product and true target
def _confidence_label(probability):
    """Bucket the winning-class probability: >0.7 High, >0.5 Medium, else Low."""
    if probability > 0.7:
        return 'High'
    if probability > 0.5:
        return 'Medium'
    return 'Low'


predictions_df = val_df[['product_category', 'second_product_category']].copy()
predictions_df['predicted_second_product_category'] = val_predictions
# Probability assigned to the most likely class of each row
predictions_df['max_probability'] = val_probabilities.max(axis=1)
predictions_df['prediction_confidence'] = predictions_df['max_probability'].apply(_confidence_label)

display(predictions_df.head(20))


Index(['psn_age', 'wc_total_assets', 'wc_assetmix_stocks', 'wc_assetmix_bonds',
       'wc_assetmix_mutual_funds', 'wc_assetmix_annuity',
       'wc_assetmix_deposits', 'wc_assetmix_other_assets',
       'stock_allocation_ratio', 'bond_allocation_ratio',
       'annuity_allocation_ratio', 'mutual_fund_allocation_ratio',
       'aum_to_asset_ratio', 'policy_value_to_assets_ratio',
       'age_at_first_policy', 'age_at_second_policy', 'years_to_second'],
      dtype='object')

Index(['axa_party_id', 'policy_no', 'trmn_eff_date', 'wti_lob_txt', 'prod_lob',
       'agt_class', 'client_seg', 'client_seg_1', 'aum_band',
       'sub_product_level_1', 'sub_product_level_2', 'Product',
       'branchoffice_code', 'agt_no', 'division_name', 'mkt_prod_hier',
       'policy_status', 'channel', 'agent_segment', 'second_policy_no',
       'second_trmn_eff_date', 'second_wti_lob_txt', 'second_prod_lob',
       'second_sub_product_level_1', 'second_sub_product_level_2',
       'second_Product', 'product_category', 'second_product_category',
       'season_of_first_policy'],
      dtype='object')