In [7]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
df = pd.read_csv('/content/drive/MyDrive/AutoInsurance.csv')
df.head(5)

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,3/2/2011,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [8]:
df['Effective To Date'] = pd.to_datetime(df['Effective To Date'])
df['TenureDays'] = (pd.Timestamp('2012-01-01') - df['Effective To Date']).dt.days

df['Response'] = df['Response'].map({'Yes': 1, 'No': 0})

df = df.drop(columns=['Customer', 'Effective To Date'], errors='ignore')

  df['Effective To Date'] = pd.to_datetime(df['Effective To Date'])


In [12]:
cat_cols = df.select_dtypes('object').columns.tolist()          # string columns
num_cols = df.select_dtypes(exclude='object').columns.tolist()  # numerical columns

num_cols.remove('Customer Lifetime Value')
num_cols.remove('Response')

print("Categorical columns:", cat_cols[:6], "...", f"(total {len(cat_cols)})")
print("Numeric columns     :", num_cols[:6], "...", f"(total {len(num_cols)})")

Categorical columns: ['State', 'Coverage', 'Education', 'EmploymentStatus', 'Gender', 'Location Code'] ... (total 13)
Numeric columns     : ['Income', 'Monthly Premium Auto', 'Months Since Last Claim', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies'] ... (total 8)


In [14]:
from sklearn.model_selection import train_test_split

X = df[cat_cols + num_cols] #feature
y = df['Response']  #target

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

print("Train size:", X_train.shape)
print("Test size :", X_test.shape)

Train size: (6393, 21)
Test size : (2741, 21)


In [30]:
!pip install lightgbm -q

from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


ohe = OneHotEncoder(handle_unknown='ignore', drop='first')
scaler = StandardScaler()

preprocessor = ColumnTransformer([
    ('cat', ohe, cat_cols),
    ('num', scaler, num_cols)
])


classifier = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMClassifier(
        n_estimators=300,
        learning_rate=0.06,
        class_weight='balanced',
        random_state=42
    ))
])

classifier.fit(X_train, y_train)


proba = classifier.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, proba)
print("✅ ROC-AUC Score:", round(roc_auc, 3))


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



[LightGBM] [Info] Number of positive: 915, number of negative: 5478
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 956
[LightGBM] [Info] Number of data points in the train set: 6393, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
✅ ROC-AUC Score: 1.0



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [31]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

X_reg = df.drop(columns=['Customer Lifetime Value'])
y_reg = df['Customer Lifetime Value']


Xtr, Xte, ytr, yte = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

reg = Pipeline([
    ('prep', preprocessor),
    ('model', LGBMRegressor(n_estimators=300, learning_rate=0.06, random_state=42))
])

reg.fit(Xtr, ytr)

clv_pred = reg.predict(X_test)
print('✅ CLV Model – MAE (₹):', round(mean_absolute_error(yte, reg.predict(Xte)), 0))



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 962
[LightGBM] [Info] Number of data points in the train set: 6393, number of used features: 50
[LightGBM] [Info] Start training from score 7975.945268



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



✅ CLV Model – MAE (₹): 1584.0


In [32]:
df['Purchase_Probability'] = classifier.predict_proba(X)[:, 1]

df['Predicted_CLV'] = reg.predict(X)

df['Predicted_CLV'] = pd.to_numeric(df['Predicted_CLV'], errors='coerce')



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [33]:
df['CLV_Score'] = (
    (df['Predicted_CLV'] - df['Predicted_CLV'].min()) /
    (df['Predicted_CLV'].max() - df['Predicted_CLV'].min())
).fillna(0)

df['Probability_Score'] = df['Purchase_Probability']


In [34]:
coverage_map = {'Basic': 0.0, 'Extended': 0.5, 'Premium': 1.0}
df['Coverage_Numeric'] = df['Coverage'].map(coverage_map)

df['Gap_Score'] = 1 - df['Coverage_Numeric']


In [35]:
df['Recommendation_Score'] = (
    0.4 * df['Probability_Score'] +
    0.4 * df['CLV_Score'] +
    0.2 * df['Gap_Score']
)

if 'Customer' not in df.columns:
    df['Customer'] = df.index.astype(str)

recommendations = df.sort_values('Recommendation_Score', ascending=False).head(10).copy()
recommendations.reset_index(drop=True, inplace=True)


In [36]:
def generate_recommendation(row):
    cov_text = {0.0: "Basic coverage", 0.5: "Extended coverage", 1.0: "Premium coverage"}.get(
        row['Coverage_Numeric'], "Unknown coverage"
    )


    if row['Purchase_Probability'] > 0.8:
        intent_text = "is highly likely to accept an upgrade"
    elif row['Purchase_Probability'] > 0.5:
        intent_text = "has a moderate chance of accepting an upgrade"
    else:
        intent_text = "shows limited interest in new products"


    if row['CLV_Score'] > 0.7:
        value_text = "and represents high customer lifetime value."
    elif row['CLV_Score'] > 0.4:
        value_text = "and represents mid-level customer lifetime value."
    else:
        value_text = "but has lower predicted lifetime value."

    return f"{row['Customer']} currently holds {cov_text}, {intent_text} {value_text}"


recommendations['Explanation'] = recommendations.apply(generate_recommendation, axis=1)

recommendations_display = recommendations[
    ['Customer',
     'Recommendation_Score',
     'Purchase_Probability',
     'Predicted_CLV',
     'Gap_Score',
     'Explanation']
]

recommendations_display


Unnamed: 0,Customer,Recommendation_Score,Purchase_Probability,Predicted_CLV,Gap_Score,Explanation
0,6904,0.796615,0.957729,34898.037277,1.0,"6904 currently holds Basic coverage, is highly..."
1,5767,0.796615,0.957729,34898.037277,1.0,"5767 currently holds Basic coverage, is highly..."
2,373,0.791646,0.977828,32857.418794,1.0,"373 currently holds Basic coverage, is highly ..."
3,7810,0.790677,0.977194,32745.226071,1.0,"7810 currently holds Basic coverage, is highly..."
4,6008,0.789864,0.977194,32617.678424,1.0,"6008 currently holds Basic coverage, is highly..."
5,408,0.787738,0.953241,33787.240108,1.0,"408 currently holds Basic coverage, is highly ..."
6,4426,0.78589,0.951069,33633.543095,1.0,"4426 currently holds Basic coverage, is highly..."
7,6158,0.785407,0.976579,31957.142126,1.0,"6158 currently holds Basic coverage, is highly..."
8,2488,0.784614,0.976007,31868.546665,1.0,"2488 currently holds Basic coverage, is highly..."
9,5987,0.784063,0.95137,33328.098837,1.0,"5987 currently holds Basic coverage, is highly..."


In [27]:
!pip install plotly -q

import plotly.express as px

fig = px.bar(
    recommendations_display,
    x='Customer',
    y='Recommendation_Score',
    hover_data=['Explanation'],
    title='Top‑10 Recommendation Scores'
)
fig.update_layout(xaxis_title="Customer ID", yaxis_title="Recommendation Score (0‑1)")
fig.show()
