In [26]:
import pandas as pd

pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth',None)
pd.set_option('display.max_rows',None)

import  warnings
warnings.filterwarnings('ignore')

In [None]:

df_claim = pd.read_parquet("claims_data.parquet")

In [22]:
df_claim.shape

(1105696, 32)

In [5]:
df_claim.columns

Index(['Client Name', 'Client Identifier', 'Scheme Category/ Section Name',
       'Scheme Category/ Section Name Identifier', 'Status of Member',
       'Claimant Unique ID', 'Claimant Year of Birth', 'Claimant Gender',
       'Short Post Code', 'Unique Member Reference', 'Contract Start Date',
       'Contract End Date', 'Provider Type', 'Claim_Year', 'Claim_Month',
       'Claim ID', 'Incurred Date', 'Paid Date', 'Condition Code',
       'Impairment Code', 'Condition Category', 'Treatment Type', 'Claim Type',
       'Ancillary Service Type', 'Treatment Location', 'Admission Date',
       'Discharge Date', 'Calculate Length of Service', 'Claim Amount',
       'Amount Paid', 'year', 'month'],
      dtype='object')

In [None]:
df_claim.sample(5)

Unnamed: 0,Client Name,Client Identifier,Scheme Category/ Section Name,Scheme Category/ Section Name Identifier,Status of Member,Claimant Unique ID,Claimant Year of Birth,Claimant Gender,Short Post Code,Unique Member Reference,...,Claim Type,Ancillary Service Type,Treatment Location,Admission Date,Discharge Date,Calculate Length of Service,Claim Amount,Amount Paid,year,month
692150,Westfield Health,CLI0013,Gold Plan,SCH0007,Dependent,MEM00033005-03,2005,Female,EC4,MEM00033005,...,,,,,,,0.0,,,
978179,Westfield Health,CLI0013,Premier Cover,SCH0003,Partner,MEM00046761-02,1987,Male,E14,MEM00046761,...,,,,,,,0.0,,,
191703,Aviva Health,CLI0004,Standard Plan,SCH0005,Partner,MEM00008882-02,1971,Female,CB4,MEM00008882,...,Inpatient,Diagnostic Imaging,Spire Manchester Hospital,2023-11-02,2023-11-25,23.0,73023.12,71400.79,2023.0,11.0
206210,WPA Healthcare,CLI0005,Standard Plan,SCH0005,Member,MEM00009554-01,1965,Male,G2,MEM00009554,...,,,,,,,0.0,,,
819118,Healix Health Services,CLI0009,Complete Care,SCH0010,Partner,MEM00039109-02,1983,Female,BR1,MEM00039109,...,,,,,,,0.0,,,


In [14]:
import numpy as np
from scipy.stats import skew, entropy

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, entropy


monthly_df = df_claim.copy()

required_cols = ["Claimant Unique ID", "Claim_Year", "Claim_Month", "Claim Amount"]
missing = [c for c in required_cols if c not in monthly_df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")


monthly_df["Claim Amount"] = monthly_df["Claim Amount"].fillna(0)
monthly_df["Claim_Month"] = monthly_df["Claim_Month"].fillna(0).astype(int)


monthly_df["Quarter"] = ((monthly_df["Claim_Month"] - 1) // 3) + 1

# --- 4️⃣ Helper: Function to compute yearly features from monthly data ---
def yearly_features(g):
    x = g["Claim Amount"].values
    probs = np.abs(x) / (x.sum() + 1e-6)
    qsum = g.groupby("Quarter")["Claim Amount"].sum().to_dict()

    return pd.Series({
        "total_claim_amount": x.sum(),
        "mean_monthly_amount": x.mean(),
        "std_monthly_amount": x.std(),
        "coef_var": x.std() / (x.mean() + 1e-6),
        "skew_monthly_amount": skew(x),
        "claim_month_entropy": entropy(probs),
        "months_with_claims": np.sum(x > 0),
        "max_monthly_amount": x.max(),
        "min_monthly_amount": x.min(),
        "Q1_amount": qsum.get(1, 0),
        "Q2_amount": qsum.get(2, 0),
        "Q3_amount": qsum.get(3, 0),
        "Q4_amount": qsum.get(4, 0)
    })


yearly = (
    monthly_df
    .groupby(["Claimant Unique ID", "Claim_Year"])
    .apply(yearly_features)
    .reset_index()
)


yearly = yearly.sort_values(["Claimant Unique ID", "Claim_Year"]).reset_index(drop=True)
yearly["lag1_total"] = yearly.groupby("Claimant Unique ID")["total_claim_amount"].shift(1)
yearly["yoy_change"] = (
    (yearly["total_claim_amount"] - yearly["lag1_total"]) / (yearly["lag1_total"] + 1e-6)
)
yearly["lag1_total"].fillna(0, inplace=True)
yearly["yoy_change"].fillna(0, inplace=True)

# ---Helper function to make share tables for categorical variables ---
def make_share_table(df, feature):
    if feature not in df.columns:
        return None
    pivot = (
        df.groupby(["Claimant Unique ID", "Claim_Year", feature])["Claim Amount"]
        .sum()
        .unstack(fill_value=0)
    )
    pivot = pivot.div(pivot.sum(axis=1) + 1e-6, axis=0)
    pivot.columns = [f"{feature.lower().replace(' ', '_')}_share_{c}" for c in pivot.columns]
    return pivot.reset_index()

# ---Generate share features for important categorical columns ---
cat_features = [
    "Condition Category",
    "Treatment Type",
    "Ancillary Service type",
    "Treatment Location"
]

share_tables = []
for feat in cat_features:
    share_tbl = make_share_table(monthly_df, feat)
    if share_tbl is not None:
        share_tables.append(share_tbl)

# --Merge all share tables into yearly dataset ---
for share_tbl in share_tables:
    yearly = yearly.merge(share_tbl, on=["Claimant Unique ID", "Claim_Year"], how="left")

# Fill missing proportions with 0
share_cols = [c for c in yearly.columns if "_share_" in c]
yearly[share_cols] = yearly[share_cols].fillna(0)

# --Add static member features (if available) ---
static_cols = [
    "Client Name", "Client Identifier", "Claimant Gender", 
    "Claimant Year of Birth", "Scheme Category/ Section Name", 
    "Provider Type"
]
static_cols = [c for c in static_cols if c in monthly_df.columns]

if static_cols:
    static_features = (
        monthly_df.groupby("Claimant Unique ID")[static_cols]
        .agg(lambda x: x.dropna().mode().iloc[0] if len(x.dropna()) else np.nan)
        .reset_index()
    )
    yearly = yearly.merge(static_features, on="Claimant Unique ID", how="left")

# ---Compute dynamic age if birth year available ---
if "Claimant Year of Birth" in yearly.columns:
    yearly["age"] = yearly["Claim_Year"] - yearly["Claimant Year of Birth"]

# ---Optional: Next-year target for prediction ---
yearly["target_next_year"] = yearly.groupby("Claimant Unique ID")["total_claim_amount"].shift(-1)
yearly["is_high_cost_next_year"] = (
    yearly["target_next_year"] > yearly["target_next_year"].quantile(0.95)
).astype(int)

# --- ✅ DONE ---
print("✅ Yearly feature-rich dataset created successfully.")
print("Shape:", yearly.shape)
print("Columns:", len(yearly.columns))
print(yearly.head(10))
 


  .apply(yearly_features)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  yearly["lag1_total"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  yearly["yoy_change"].fillna(0, inplace=True)


✅ Yearly feature-rich dataset created successfully.
Shape: (83961, 75)
Columns: 75
  Claimant Unique ID  Claim_Year  total_claim_amount  mean_monthly_amount  \
0     MEM00000001-02        2020                0.00             0.000000   
1     MEM00000001-02        2021             1014.43            84.535833   
2     MEM00000003-02        2024           189387.29         15782.274167   
3     MEM00000003-02        2025             1921.29           160.107500   
4     MEM00000005-02        2022              270.13            22.510833   
5     MEM00000005-02        2023                0.00             0.000000   
6     MEM00000006-01        2020            13958.46          1163.205000   
7     MEM00000006-01        2021            21932.03          1687.079231   
8     MEM00000007-01        2024             2932.09           244.340833   
9     MEM00000007-01        2025             2155.12           179.593333   

   std_monthly_amount  coef_var  skew_monthly_amount  claim_month_ent

In [29]:
yearly.columns

Index(['Claimant Unique ID', 'Claim_Year', 'total_claim_amount',
       'mean_monthly_amount', 'std_monthly_amount', 'coef_var',
       'skew_monthly_amount', 'claim_month_entropy', 'months_with_claims',
       'max_monthly_amount', 'min_monthly_amount', 'Q1_amount', 'Q2_amount',
       'Q3_amount', 'Q4_amount', 'lag1_total', 'yoy_change',
       'condition_category_share_Cardiovascular',
       'condition_category_share_Endocrine',
       'condition_category_share_Gastroenterology',
       'condition_category_share_General',
       'condition_category_share_General Surgery',
       'condition_category_share_Mental Health',
       'condition_category_share_Musculoskeletal',
       'condition_category_share_Oncology',
       'condition_category_share_Ophthalmology',
       'condition_category_share_Preventive',
       'condition_category_share_Respiratory',
       'treatment_type_share_Chemotherapy',
       'treatment_type_share_Consultation',
       'treatment_type_share_Day Case Surge

In [27]:
yearly.sample(5)

Unnamed: 0,Claimant Unique ID,Claim_Year,total_claim_amount,mean_monthly_amount,std_monthly_amount,coef_var,skew_monthly_amount,claim_month_entropy,months_with_claims,max_monthly_amount,min_monthly_amount,Q1_amount,Q2_amount,Q3_amount,Q4_amount,lag1_total,yoy_change,condition_category_share_Cardiovascular,condition_category_share_Endocrine,condition_category_share_Gastroenterology,condition_category_share_General,condition_category_share_General Surgery,condition_category_share_Mental Health,condition_category_share_Musculoskeletal,condition_category_share_Oncology,condition_category_share_Ophthalmology,condition_category_share_Preventive,condition_category_share_Respiratory,treatment_type_share_Chemotherapy,treatment_type_share_Consultation,treatment_type_share_Day Case Surgery,treatment_type_share_Diagnostic Imaging,treatment_type_share_Diagnostic Tests,treatment_type_share_Emergency Treatment,treatment_type_share_Inpatient Surgery,treatment_type_share_Mental Health Therapy,treatment_type_share_Outpatient Surgery,treatment_type_share_Physiotherapy,treatment_type_share_Preventive Care,treatment_type_share_Radiotherapy,treatment_type_share_Rehabilitation,treatment_type_share_Specialist Consultation,treatment_location_share_Addenbrookes Hospital,treatment_location_share_BMI The Alexandra Hospital,treatment_location_share_BMI The Blackheath Hospital,treatment_location_share_BMI The Bristol Hospital,treatment_location_share_BMI The Priory Hospital,treatment_location_share_BMI The Ross Hall Hospital,treatment_location_share_Day Surgery Unit,treatment_location_share_Diagnostic Centre,treatment_location_share_Guy's Hospital,treatment_location_share_Kings College Hospital,treatment_location_share_London Bridge Hospital,treatment_location_share_Nuffield Health Birmingham,treatment_location_share_Nuffield Health Glasgow,treatment_location_share_Outpatient Clinic,treatment_location_share_Royal Berkshire Hospital,treatment_location_share_Spire Edinburgh Hospital,treatment_location_share_Spire Leeds Hospital,treatment_location_share_Spire Liverpool Hospital,treatment_location_share_Spire Manchester Hospital,treatment_location_share_Spire Thames Valley Hospital,treatment_location_share_St Thomas Hospital,treatment_location_share_The Churchill Hospital Oxford,treatment_location_share_The Portland Hospital,treatment_location_share_The Wellington Hospital,Client Name,Client Identifier,Claimant Gender,Claimant Year of Birth,Scheme Category/ Section Name,Provider Type,age,target_next_year,is_high_cost_next_year
80226,MEM00050744-04,2022,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19112.62,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Medicash,CLI0014,Other,2005,Advanced Cover,Private Hospital,17,,0
68885,MEM00043501-01,2022,1147012.63,5143.554395,16710.943011,3.24891,6.913453,3.789733,219.0,155938.49,0.0,512018.57,545755.43,89238.63,0.0,1569401.66,-0.2691402,0.159119,0.010752,0.032239,0.00667,0.04911,0.010969,0.226316,0.477681,0.01718,0.006704,0.00326,0.058589,0.026618,0.010837,0.003853,0.003733,0.017107,0.41121,0.011927,0.022336,0.281982,0.028874,0.029704,0.060561,0.032669,0.028454,0.002711,0.012912,0.042137,0.199421,0.255196,0.010354,0.01252,0.011452,0.031255,0.029398,0.006295,0.009387,0.011359,0.025462,0.0,0.05947,0.012018,0.033323,0.060991,0.023066,0.041653,0.060665,0.020502,HSF Health Plan,CLI0012,Female,1971,Platinum Plan,Private Hospital,51,,0
13079,MEM00008172-04,2021,1347.54,112.295,372.440381,3.316625,3.015113,0.0,1.0,1347.54,0.0,0.0,1347.54,0.0,0.0,0.0,1347540000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bupa UK,CLI0001,Male,2005,Standard Plan,NHS Hospital,16,,0
82692,MEM00052308-04,2023,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,529.55,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Westfield Health,CLI0013,Female,2005,Complete Care,Rehabilitation Centre,18,,0
23142,MEM00014473-01,2021,298.72,24.893333,82.561846,3.316625,3.015113,0.0,1.0,298.72,0.0,0.0,0.0,298.72,0.0,0.0,298720000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Simplyhealth,CLI0011,Male,1998,Standard Plan,Diagnostic Centre,23,,0


In [30]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.linear_model import LinearRegression

# === Input: member_year DataFrame (one row per member-year) ===
# Required base columns (expected): ['member_id', 'Claim_Year', 'total_claim_amount', 'claim_count']
# If your column names differ, adapt the variables below.

df = yearly.copy()  # working copy

# === Column name config (change if your names differ) ===
member_col = 'Claimant Unique ID' if 'Claimant Unique ID' in df.columns else 'member_id'
year_col = 'Claim_Year' if 'Claim_Year' in df.columns else 'year'
total_col = 'total_claim_amount' if 'total_claim_amount' in df.columns else 'total_amount'
count_col = 'claim_count' if 'claim_count' in df.columns else 'months_with_claims'
age_col = 'age' if 'age' in df.columns else None
tenure_col = 'tenure_years' if 'tenure_years' in df.columns else None

# Ensure expected columns exist
for c in [member_col, year_col, total_col, count_col]:
    if c not in df.columns:
        raise KeyError(f"Required column missing: {c}")

# Sort
df = df.sort_values([member_col, year_col]).reset_index(drop=True)

# === 1) Lag features (lag1..lag5) ===
max_lag = 5
for lag in range(1, max_lag + 1):
    df[f'lag{lag}_total'] = df.groupby(member_col)[total_col].shift(lag).fillna(0.0)
    df[f'lag{lag}_count'] = df.groupby(member_col)[count_col].shift(lag).fillna(0).astype(int)

# === 2) Rolling trailing windows (use prior years only) ===
def add_rolling(group, window):
    # group is member's rows sorted by year
    s_total = group[total_col]
    s_count = group[count_col]
    # trailing: window years, exclude current year -> shift(1)
    group[f'rolling_{window}yr_sum'] = s_total.rolling(window=window, min_periods=1).sum().shift(1).fillna(0.0)
    group[f'rolling_{window}yr_avg'] = s_total.rolling(window=window, min_periods=1).mean().shift(1).fillna(0.0)
    group[f'rolling_{window}yr_max'] = s_total.rolling(window=window, min_periods=1).max().shift(1).fillna(0.0)
    group[f'rolling_{window}yr_count'] = s_count.rolling(window=window, min_periods=1).sum().shift(1).fillna(0).astype(int)
    return group

df = df.groupby(member_col, group_keys=False).apply(lambda g: add_rolling(g, 3))
df = df.groupby(member_col, group_keys=False).apply(lambda g: add_rolling(g, 5))

# === 3) Volatility / consistency features ===
# rolling std (trailing)
df['std_3yr'] = df.groupby(member_col)[total_col].rolling(window=3, min_periods=1).std().shift(1).reset_index(level=0, drop=True).fillna(0.0)
df['std_5yr'] = df.groupby(member_col)[total_col].rolling(window=5, min_periods=1).std().shift(1).reset_index(level=0, drop=True).fillna(0.0)

# coefficient of variation (std/mean) using rolling 3yr avg
df['cv_3yr'] = df['std_3yr'] / (df['rolling_3yr_avg'].replace(0, np.nan))
df['cv_3yr'] = df['cv_3yr'].fillna(0.0)

# === 4) Years since last claim & claim-free counts ===
def years_since_last_claim(series_total, years):
    # series_total is a pd.Series of total per year (sorted)
    last_claim = None
    out = []
    for y, v in zip(years, series_total):
        if v > 0:
            last_claim = y
            out.append(0)
        else:
            out.append((y - last_claim) if last_claim is not None else np.nan)
    return out

def add_years_since(group):
    years = group[year_col].values
    totals = group[total_col].values
    group['years_since_last_claim'] = years_since_last_claim(totals, years)
    # claim_free_years_in_last_5: number of years with zero in previous 5 years (exclude current)
    group['claim_free_years_last_5'] = group[total_col].rolling(window=5, min_periods=1).apply(lambda x: np.sum(x.shift(0)==0) if len(x)>0 else 0).shift(1).fillna(0).astype(int)
    return group

df = df.groupby(member_col, group_keys=False).apply(add_years_since)

# Replace NaN years_since_last_claim with large number (e.g., 99)
df['years_since_last_claim'] = df['years_since_last_claim'].fillna(99).astype(int)

# === 5) Longest claim streak (consecutive years with claim>0) per member (same for each year) ===
def longest_streak(arr):
    cur = 0
    best = 0
    for v in arr:
        if v > 0:
            cur += 1
            best = max(best, cur)
        else:
            cur = 0
    return best

streak = df.groupby(member_col)[total_col].apply(lambda s: longest_streak(s.values)).reset_index().rename(columns={total_col:'longest_claim_streak'})
df = df.merge(streak, on=member_col, how='left')

# === 6) Percent active years (proportion of years with any claim up to current year) ===
def pct_active_years(group):
    cum_nonzero = (group[total_col] > 0).cumsum()
    yrs = np.arange(1, len(group)+1)
    return ( (group[total_col] > 0).cumsum() / yrs )

df['pct_active_years'] = df.groupby(member_col, group_keys=False).apply(lambda g: ((g[total_col] > 0).cumsum() / np.arange(1, len(g)+1))).explode().values
# above returns a float per row

# === 7) Average per claim and claim severity metrics ===
df['avg_amount_per_claim'] = df.apply(lambda r: (r[total_col] / r[count_col]) if r[count_col] > 0 else 0.0, axis=1)
df['max_to_mean_ratio'] = df.apply(lambda r: (r['max_claim_amount'] / (r['mean_claim_amount']+1e-9)) if 'max_claim_amount' in r.index and 'mean_claim_amount' in r.index else 0.0, axis=1)

# === 8) Inpatient/outpatient rates if present (robust) ===
inpatient_cols = [c for c in df.columns if 'inpatient' in str(c).lower()]
outpatient_cols = [c for c in df.columns if 'outpatient' in str(c).lower()]
# If you have count_type_<value> columns from earlier, compute rates
if inpatient_cols:
    # sum inpatient-type counts (if multiple labels)
    df['inpatient_count'] = df[inpatient_cols].sum(axis=1)
    df['inpatient_rate'] = df['inpatient_count'] / (df[count_col].replace(0, np.nan))
    df['inpatient_rate'] = df['inpatient_rate'].fillna(0.0)
if outpatient_cols:
    df['outpatient_count'] = df[outpatient_cols].sum(axis=1)
    df['outpatient_rate'] = df['outpatient_count'] / (df[count_col].replace(0, np.nan))
    df['outpatient_rate'] = df['outpatient_rate'].fillna(0.0)

# === 9) Entropy across share features (if you have *_share_ columns) ===
share_cols = [c for c in df.columns if '_share_' in c]
if share_cols:
    def row_entropy(row):
        vals = row[share_cols].values.astype(float)
        # clip small negatives and normalize
        vals = np.clip(vals, 0, None)
        s = vals.sum()
        if s <= 0:
            return 0.0
        probs = vals / s
        return entropy(probs)
    df['share_entropy'] = df.apply(row_entropy, axis=1)
else:
    df['share_entropy'] = 0.0

# === 10) Trend slope over last 3 years (linear regression slope for total_col) ===
def slope_last_k(series, years, k=3):
    # use last k observations excluding current year
    if len(series) < 2:
        return 0.0
    x = np.array(years).reshape(-1,1)
    y = np.array(series)
    # take trailing k points (exclude current by shifting in caller if needed)
    if len(y) > k:
        x = x[-k:]
        y = y[-k:]
    try:
        lr = LinearRegression()
        lr.fit(x, y)
        return float(lr.coef_[0])
    except Exception:
        return 0.0

def add_slope(group):
    yrs = group[year_col].values
    totals = group[total_col].values
    slopes = []
    for i in range(len(group)):
        # use previous 3 years only -> indices up to i-1
        if i == 0:
            slopes.append(0.0)
        else:
            end = i  # exclusive current
            start = max(0, end-3)
            s = totals[start:end]
            yrs_s = yrs[start:end]
            slopes.append(slope_last_k(s, yrs_s, k=3))
    group['slope_3yr'] = slopes
    return group

df = df.groupby(member_col, group_keys=False).apply(add_slope).reset_index(drop=True)

# === 11) Interaction features (age × prior cost, tenure × prior cost) ===
if age_col and age_col in df.columns:
    df['age_x_lag1'] = df[age_col] * df['lag1_total']
if tenure_col and tenure_col in df.columns:
    df['tenure_x_lag1'] = df[tenure_col] * df['lag1_total']

# === 12) Metadata: first_claim_year, last_claim_year, years_active_total ===
first_year = df[df[total_col] > 0].groupby(member_col)[year_col].min().reset_index().rename(columns={year_col:'first_claim_year'})
last_year = df[df[total_col] > 0].groupby(member_col)[year_col].max().reset_index().rename(columns={year_col:'last_claim_year'})
df = df.merge(first_year, on=member_col, how='left').merge(last_year, on=member_col, how='left')
df['years_active_total'] = (df[year_col] - df['first_claim_year']).clip(lower=0)

# fill NaNs for metadata for members with no claims at all
df['first_claim_year'] = df['first_claim_year'].fillna(9999).astype(int)
df['last_claim_year'] = df['last_claim_year'].fillna(0).astype(int)
df['years_active_total'] = df['years_active_total'].fillna(0).astype(int)

# === Final cleanup: fill NaNs for new numeric cols ===
new_num_cols = [c for c in df.columns if c not in [member_col, year_col] and df[c].dtype.kind in 'fiu']
for c in new_num_cols:
    df[c] = df[c].fillna(0)

# === Output: df (enhanced member-year) ===
enhanced_member_year = df.copy()
print("Enhanced member-year shape:", enhanced_member_year.shape)
#print("New columns added:", [c for c in enhanced_member_year.columns if c not in member_year.columns][:50])
# assign back if you want
#enhanced_member_year


Enhanced member-year shape: (83961, 111)


In [31]:
enhanced_member_year.columns

Index(['Claimant Unique ID', 'Claim_Year', 'total_claim_amount',
       'mean_monthly_amount', 'std_monthly_amount', 'coef_var',
       'skew_monthly_amount', 'claim_month_entropy', 'months_with_claims',
       'max_monthly_amount',
       ...
       'inpatient_count', 'inpatient_rate', 'outpatient_count',
       'outpatient_rate', 'share_entropy', 'slope_3yr', 'age_x_lag1',
       'first_claim_year', 'last_claim_year', 'years_active_total'],
      dtype='object', length=111)

In [32]:
enhanced_member_year.sample(5)

Unnamed: 0,Claimant Unique ID,Claim_Year,total_claim_amount,mean_monthly_amount,std_monthly_amount,coef_var,skew_monthly_amount,claim_month_entropy,months_with_claims,max_monthly_amount,min_monthly_amount,Q1_amount,Q2_amount,Q3_amount,Q4_amount,lag1_total,yoy_change,condition_category_share_Cardiovascular,condition_category_share_Endocrine,condition_category_share_Gastroenterology,condition_category_share_General,condition_category_share_General Surgery,condition_category_share_Mental Health,condition_category_share_Musculoskeletal,condition_category_share_Oncology,condition_category_share_Ophthalmology,condition_category_share_Preventive,condition_category_share_Respiratory,treatment_type_share_Chemotherapy,treatment_type_share_Consultation,treatment_type_share_Day Case Surgery,treatment_type_share_Diagnostic Imaging,treatment_type_share_Diagnostic Tests,treatment_type_share_Emergency Treatment,treatment_type_share_Inpatient Surgery,treatment_type_share_Mental Health Therapy,treatment_type_share_Outpatient Surgery,treatment_type_share_Physiotherapy,treatment_type_share_Preventive Care,treatment_type_share_Radiotherapy,treatment_type_share_Rehabilitation,treatment_type_share_Specialist Consultation,treatment_location_share_Addenbrookes Hospital,treatment_location_share_BMI The Alexandra Hospital,treatment_location_share_BMI The Blackheath Hospital,treatment_location_share_BMI The Bristol Hospital,treatment_location_share_BMI The Priory Hospital,treatment_location_share_BMI The Ross Hall Hospital,treatment_location_share_Day Surgery Unit,treatment_location_share_Diagnostic Centre,treatment_location_share_Guy's Hospital,treatment_location_share_Kings College Hospital,treatment_location_share_London Bridge Hospital,treatment_location_share_Nuffield Health Birmingham,treatment_location_share_Nuffield Health Glasgow,treatment_location_share_Outpatient Clinic,treatment_location_share_Royal Berkshire Hospital,treatment_location_share_Spire Edinburgh Hospital,treatment_location_share_Spire Leeds Hospital,treatment_location_share_Spire Liverpool Hospital,treatment_location_share_Spire Manchester Hospital,treatment_location_share_Spire Thames Valley Hospital,treatment_location_share_St Thomas Hospital,treatment_location_share_The Churchill Hospital Oxford,treatment_location_share_The Portland Hospital,treatment_location_share_The Wellington Hospital,Client Name,Client Identifier,Claimant Gender,Claimant Year of Birth,Scheme Category/ Section Name,Provider Type,age,target_next_year,is_high_cost_next_year,lag1_count,lag2_total,lag2_count,lag3_total,lag3_count,lag4_total,lag4_count,lag5_total,lag5_count,rolling_3yr_sum,rolling_3yr_avg,rolling_3yr_max,rolling_3yr_count,rolling_5yr_sum,rolling_5yr_avg,rolling_5yr_max,rolling_5yr_count,std_3yr,std_5yr,cv_3yr,years_since_last_claim,claim_free_years_last_5,longest_claim_streak,pct_active_years,avg_amount_per_claim,max_to_mean_ratio,inpatient_count,inpatient_rate,outpatient_count,outpatient_rate,share_entropy,slope_3yr,age_x_lag1,first_claim_year,last_claim_year,years_active_total
50054,MEM00031471-02,2023,1251.81,104.3175,345.982007,3.316625,3.015113,0.0,1.0,1251.81,0.0,0.0,0.0,0.0,1251.81,0.0,1251810000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Aviva Health,CLI0004,Female,1968,Executive Cover,NHS Hospital,55,0.0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1,1,0.5,1251.81,0.0,0.0,0.0,0.0,0.0,1.098612,0.0,0.0,2023,2023,0
30854,MEM00019330-02,2021,1323.14,110.261667,365.696577,3.316625,3.015113,0.0,1.0,1323.14,0.0,0.0,1323.14,0.0,0.0,0.0,1323140000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benenden Health,CLI0007,Female,1979,Advanced Cover,Outpatient Clinic,42,0.0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1,1,0.5,1323.14,0.0,0.0,0.0,1.0,1.0,1.098612,0.0,0.0,2021,2021,0
70800,MEM00044730-03,2021,458.66,38.221667,126.766927,3.316625,3.015113,0.0,1.0,458.66,0.0,0.0,0.0,0.0,458.66,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Westfield Health,CLI0013,Male,2005,Gold Plan,Outpatient Clinic,16,0.0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,2404.629747,2404.629747,0.0,0,0,1,1.0,458.66,0.0,0.0,0.0,0.0,0.0,1.098612,0.0,0.0,2021,2021,0
10630,MEM00006616-02,2023,596.64,49.72,164.902585,3.316625,3.015113,0.0,1.0,596.64,0.0,0.0,0.0,596.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,HSF Health Plan,CLI0012,Male,1986,Elite Plan,Private Hospital,37,0.0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,3737.738161,3737.738161,0.0,0,0,1,1.0,596.64,0.0,0.0,0.0,0.0,0.0,1.098612,0.0,0.0,2023,2023,0
3532,MEM00002206-01,2023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,495.11,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AXA Health,CLI0002,Male,1993,Platinum Plan,Mental Health Facility,30,0.0,0,1,0.0,0,0.0,0,0.0,0,0.0,0,495.11,495.11,495.11,1,495.11,495.11,495.11,1,0.0,0.0,0.0,1,0,1,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14853.3,2022,2022,1


In [45]:
enhanced_member_year.drop('target_next_year',axis=1,inplace=True)

In [35]:
enhanced_member_year.to_parquet("enhanced_emem_year_111.parquet")

In [37]:
member_col = 'Claimant Unique ID'  # or your member ID col
enhanced_member_year = enhanced_member_year.sort_values([member_col, 'Claim_Year']).reset_index(drop=True)
enhanced_member_year['target_next_year_total'] = enhanced_member_year.groupby(member_col)['total_claim_amount'].shift(-1).fillna(0)


In [46]:
import lightgbm as lgb
import xgboost as xgb
from tensorflow import keras
from tensorflow.keras import layers

In [47]:
# === dependencies ===
# pip install lightgbm xgboost tensorflow scikit-learn joblib

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge
import joblib



# -------------------------
# CONFIG / assumptions
# -------------------------
# Input: member_year DataFrame with features and target
# Must set these names:
DF = enhanced_member_year  # your dataframe
TARGET = 'target_next_year_total'  # numeric target (>=0)
ID_COL = 'Claimant Unique ID'      # unique member id col (if present)
YEAR_COL = 'Claim_Year'            # year col (if present)

# temporal split year (validate on this year). e.g., train years <= 2022, validate 2023
SPLIT_YEAR = 2023  # adjust as desired

USE_LOG_TARGET = True  # True => model on log1p(target), predictions will be inverse-transformed

RANDOM_STATE = 42
MODEL_DIR = './models'
os.makedirs(MODEL_DIR, exist_ok=True)

# -------------------------
# 1. Prepare data (features / target)
# -------------------------
df = DF.copy()

# drop rows where target is NaN
df = df[~df[TARGET].isna()].reset_index(drop=True)

# optionally drop identifier/year columns from X
drop_cols = [TARGET]
if ID_COL in df.columns:
    drop_cols.append(ID_COL)
if YEAR_COL in df.columns:
    drop_cols.append(YEAR_COL)

# Candidate feature columns = all numeric + selected categoricals
all_cols = [c for c in df.columns if c not in drop_cols]

# Auto-detect categorical vs numeric
# Treat object and category dtype as categorical
cat_cols = [c for c in all_cols if df[c].dtype == 'object' or str(df[c].dtype).startswith('category')]
num_cols = [c for c in all_cols if c not in cat_cols]

print(f"Detected {len(num_cols)} numeric cols and {len(cat_cols)} categorical cols.")

# Prepare X,y and a temporal split
if YEAR_COL in df.columns:
    train_mask = df[YEAR_COL] < SPLIT_YEAR
    val_mask = df[YEAR_COL] == SPLIT_YEAR
    X_train_df = df.loc[train_mask, all_cols].copy()
    y_train = df.loc[train_mask, TARGET].copy()
    X_val_df = df.loc[val_mask, all_cols].copy()
    y_val = df.loc[val_mask, TARGET].copy()
    # If val set is empty, fallback to random split
    if len(X_val_df) == 0 or X_val_df.shape[0] < 100:
        print("Temporal split produced too-small validation set; using random split (80/20).")
        X_train_df, X_val_df, y_train, y_val = train_test_split(df[all_cols], df[TARGET], test_size=0.2, random_state=RANDOM_STATE)
else:
    X_train_df, X_val_df, y_train, y_val = train_test_split(df[all_cols], df[TARGET], test_size=0.2, random_state=RANDOM_STATE)

# Optionally log-transform the target to stabilize variance
if USE_LOG_TARGET:
    y_train_trans = np.log1p(y_train)
    y_val_trans = np.log1p(y_val)
else:
    y_train_trans = y_train.copy()
    y_val_trans = y_val.copy()

# -------------------------
# 2. Preprocessing pipelines
# -------------------------
# Numeric pipeline: impute median + standard scale
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Categorical pipeline: impute constant + one-hot (sparse disabled to keep dense arrays)
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='__MISSING__')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop', sparse_threshold=1)

# Fit transformer on training set and transform both
preprocessor.fit(X_train_df)
X_train = preprocessor.transform(X_train_df)
X_val = preprocessor.transform(X_val_df)

# feature names (optional)
# get feature names after transformation (approximate)
ohe_feature_names = []
if cat_cols:
    ohe = preprocessor.named_transformers_['cat'].named_steps['ohe']
    cat_names = ohe.get_feature_names_out(cat_cols)
    ohe_feature_names = list(cat_names)
num_feature_names = num_cols
feature_names = num_feature_names + ohe_feature_names

print("Feature matrix shapes:", X_train.shape, X_val.shape)

# -------------------------
# 3. Train LightGBM (Tweedie)
# -------------------------
lgb_train = lgb.Dataset(X_train, label=y_train_trans)
lgb_val = lgb.Dataset(X_val, label=y_val_trans, reference=lgb_train)

lgb_params = {
    'objective': 'tweedie',           # tweedie is appropriate for insurance-like data
    'tweedie_variance_power': 1.5,    # between 1 (Poisson) and 2 (Gamma) — tune this
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': -1,
    'seed': RANDOM_STATE
}

print("Training LightGBM (tweedie)...")
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[
        lgb.early_stopping(100),
        lgb.log_evaluation(100)
    ]
)

# predict and inverse transform if needed
lgb_pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
if USE_LOG_TARGET:
    lgb_pred_val = np.expm1(lgb_pred_val)  # inverse log1p

# -------------------------
# 4. Train XGBoost (regression)
# -------------------------
# XGBoost expects 2D numpy arrays
dtrain = xgb.DMatrix(X_train, label=(y_train_trans if not USE_LOG_TARGET else np.log1p(y_train)))
dval = xgb.DMatrix(X_val, label=(y_val_trans if not USE_LOG_TARGET else np.log1p(y_val)))

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': RANDOM_STATE,
    'verbosity': 0
}

print("Training XGBoost...")
xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=2000,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=100,
    verbose_eval=100
)

# --- Predict using XGBoost (compatible with all versions) ---
try:
    # XGBoost >= 2.0: use best_iteration instead of best_ntree_limit
    n_trees = getattr(xgb_model, "best_iteration", None)
    if n_trees is not None:
        xgb_pred_val = xgb_model.predict(dval, iteration_range=(0, n_trees))
    else:
        # fallback (older versions)
        xgb_pred_val = xgb_model.predict(dval)
except Exception as e:
    print("⚠️ Warning: couldn't use best_iteration, falling back to full model:", e)
    xgb_pred_val = xgb_model.predict(dval)
if USE_LOG_TARGET:
    xgb_pred_val = np.expm1(xgb_pred_val)

# -------------------------
# 5. Train Neural Network (Keras)
# -------------------------
# Simple dense NN with dropout; tune architecture/hyperparams as needed
input_dim = X_train.shape[1]

def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='linear')
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
                  loss='mse',
                  metrics=['mae'])
    return model

print("Training Keras NN (may take time)...")
nn = build_nn(input_dim)
# use early stopping
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
history = nn.fit(X_train, (y_train_trans if not USE_LOG_TARGET else np.log1p(y_train)),
                 validation_data=(X_val, (y_val_trans if not USE_LOG_TARGET else np.log1p(y_val))),
                 epochs=200, batch_size=1024, callbacks=[es], verbose=2)

nn_pred_val = nn.predict(X_val).reshape(-1)
if USE_LOG_TARGET:
    nn_pred_val = np.expm1(nn_pred_val)

# -------------------------
# 6. Evaluation function
# -------------------------
def evaluate(y_true, y_pred, name="model"):
    mae = mean_absolute_error(y_true, y_pred)
    try:
        rmse = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
    # fallback for older sklearn versions
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"=== {name} ===")
    print(f"MAE: {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2: {r2:,.4f}")
    return {'mae': mae, 'rmse': rmse, 'r2': r2}

print("\nValidation results (on actual target scale):")
lgb_res = evaluate(y_val.values, lgb_pred_val, name="LightGBM (Tweedie)")
xgb_res = evaluate(y_val.values, xgb_pred_val, name="XGBoost")
nn_res = evaluate(y_val.values, nn_pred_val, name="NeuralNet")

# -------------------------
# 7. Feature importance (LightGBM)
# -------------------------
# If we built feature_names earlier, use them; otherwise use indices
try:
    importances = lgb_model.feature_importance(importance_type='gain')
    feat_names = feature_names if 'feature_names' in locals() else [f'f{i}' for i in range(X_train.shape[1])]
    imp_df = pd.DataFrame({'feature': feat_names, 'gain': importances})
    imp_df = imp_df.sort_values('gain', ascending=False).reset_index(drop=True)
    print("\nTop 20 LightGBM features by gain:")
    print(imp_df.head(20))
except Exception as e:
    print("Could not compute feature importances:", e)

# -------------------------
# 8. Save models & preprocessor
# -------------------------
joblib.dump(preprocessor, os.path.join(MODEL_DIR, 'preprocessor.joblib'))
lgb_model.save_model(os.path.join(MODEL_DIR, 'lgb_model.txt'))
xgb_model.save_model(os.path.join(MODEL_DIR, 'xgb_model.json'))
nn.save(os.path.join(MODEL_DIR, 'nn_model.keras'))

print(f"Models and preprocessor saved to {MODEL_DIR}")

# -------------------------
# 9. Quick predict function (wrap)
# -------------------------
def predict_on_new(df_new):
    """
    df_new: raw dataframe with same feature columns as training input (all_cols)
    returns: predicted target on original scale
    """
    X_new = preprocessor.transform(df_new[all_cols])
    pred_lgb = lgb_model.predict(X_new, num_iteration=lgb_model.best_iteration)
    pred_xgb = xgb_model.predict(xgb.DMatrix(X_new), ntree_limit=xgb_model.best_ntree_limit)
    pred_nn = nn.predict(X_new).reshape(-1)
    if USE_LOG_TARGET:
        pred_lgb = np.expm1(pred_lgb); pred_xgb = np.expm1(pred_xgb); pred_nn = np.expm1(pred_nn)
    return {'lgb': pred_lgb, 'xgb': pred_xgb, 'nn': pred_nn}

# Example usage:
# preds = predict_on_new(X_val_df)  # returns dictionary of numpy arrays


Detected 103 numeric cols and 5 categorical cols.
Feature matrix shapes: (41706, 152) (16920, 152)
Training LightGBM (tweedie)...
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.907035	valid_1's rmse: 0.850249
[200]	training's rmse: 0.869927	valid_1's rmse: 0.857974
Early stopping, best iteration is:
[113]	training's rmse: 0.901073	valid_1's rmse: 0.849236
Training XGBoost...
[0]	train-rmse:3.87935	val-rmse:3.90085
[100]	train-rmse:0.89534	val-rmse:3.85861
[109]	train-rmse:0.89166	val-rmse:3.84688
Training Keras NN (may take time)...
Epoch 1/200
41/41 - 2s - 42ms/step - loss: 8.5804 - mae: 2.0941 - val_loss: 1.7064 - val_mae: 0.9072
Epoch 2/200
41/41 - 1s - 15ms/step - loss: 1.6261 - mae: 0.8165 - val_loss: 0.8070 - val_mae: 0.5081
Epoch 3/200
41/41 - 1s - 14ms/step - loss: 1.2239 - mae: 0.6499 - val_loss: 0.7782 - val_mae: 0.4586
Epoch 4/200
41/41 - 1s - 15ms/step - loss: 1.1440 - mae: 0.6158 - val_loss: 0.8008 - val_mae: 0.4594
Epoch 5/200
41/41