In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
63,47,71.3,1.82,41.66,True,Gaya,business_owner,High
13,28,93.4,1.84,11.95,False,Kolkata,freelancer,Low
18,52,80.9,1.8,38.14,True,Kota,business_owner,High
28,38,101.2,1.79,11.63,False,Mumbai,unemployed,Low
64,71,117.8,1.78,1.02,False,Chandigarh,retired,High


In [4]:
df_feat = df.copy()


In [5]:
#BMI Calculation
df_feat["bmi"]=df_feat["weight"]/(df_feat["height"]**2)

In [6]:
#Age Group
def age_group(age):
    if age<25:
        return "young"
    elif age<45:
        return "adult"
    elif age<60:
        return "middle_age"
    else :
        return "senior" 

df_feat["age_group"] = df_feat["age"].apply(age_group)

In [7]:
#Lifestyle Risk
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"]>30:
        return "high"
    elif row["smoker"] or row["bmi"]>27:
        return "medium"
    else:
        return "low"

df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk,axis=1)

In [8]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [19]:
# Feature 4: City Tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3
     

df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [10]:
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
45,18.39,unemployed,33.466667,middle_age,medium,2,High
51,28.95,private_job,38.827923,middle_age,high,2,High
84,0.62,retired,28.801497,senior,medium,2,High
87,25.59837,government_job,32.03125,adult,medium,3,Low
27,34.33,private_job,35.159702,middle_age,medium,2,Medium


In [20]:
# Select features and target
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]

In [12]:
X

Unnamed: 0,bmi,income_lpa,occupation,age_group,lifestyle_risk,city_tier
0,49.227482,2.92000,retired,senior,medium,2
1,30.189017,34.28000,freelancer,adult,medium,3
2,21.118382,36.64000,freelancer,adult,low,2
3,45.535900,3.34000,student,young,high,3
4,24.296875,3.94000,retired,senior,medium,2
...,...,...,...,...,...,...
95,21.420747,19.64000,business_owner,adult,low,2
96,47.984483,34.01000,private_job,adult,medium,3
97,18.765432,44.86000,freelancer,middle_age,low,3
98,30.521676,28.30000,business_owner,adult,medium,3


In [21]:
# Define categorical and numeric features
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [22]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)


In [23]:
pipeline = Pipeline(
    steps=[
        ("preprocessor",preprocessor),
        ("classifier",RandomForestClassifier(random_state=2))
    ]
)

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
pipeline.fit(X_train,y_train)

In [25]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.56

In [27]:
import pickle

pickle_model_path = "model.pkl"
with open(pickle_model_path,"wb") as f:
    pickle.dump(pipeline,f)