In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [5]:
df = pd.read_csv('insurance.csv')

In [7]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
81,41,82.6,1.61,22.19,True,Mysore,freelancer,High
71,38,54.1,1.81,20.25,False,Chandigarh,unemployed,Low
70,69,99.9,1.65,0.57,False,Chandigarh,retired,High
10,29,71.9,1.77,32.78,True,Chennai,business_owner,Medium
5,53,62.9,1.66,50.0,False,Kota,freelancer,Medium


In [9]:
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [11]:
df_feat = df.copy()

In [13]:
#Feature 1 : bmi
df_feat['bmi'] = df_feat['weight'] / (df_feat['height'] ** 2)

In [15]:
#Feature 2: age group

def age_group(age):
    if age < 25:
        return "Young"
    elif age < 45:
        return "Adult"
    elif age < 60:
        return "Middle Aged"
    else:
        return "Senior"

In [17]:
df_feat['age_group'] = df_feat['age'].apply(age_group)

In [19]:
# Feature 3: Life style Risk

def lifestyle_risk(row):
    if row['age'] and row['bmi'] > 30:
        return "High"
    elif row['age'] or row['bmi'] > 27:
        return "Medium"
    else:
        return "Low"

In [21]:
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk, axis = 1)

In [23]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [25]:
# Feature 4 : City tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        3

In [27]:
df_feat['city_tier'] = df_feat['city'].apply(city_tier)

In [29]:
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
50,25.23,private_job,18.871661,Middle Aged,Medium,2.0,Medium
85,34.66,private_job,14.857209,Adult,Medium,1.0,Low
44,50.0,private_job,30.078125,Middle Aged,High,2.0,Medium
46,25.57,unemployed,33.672766,Adult,High,1.0,High
98,28.3,business_owner,30.521676,Adult,High,1.0,Low


In [31]:
# Select Featues and target
X = df_feat[['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
y = df_feat['insurance_premium_category']

In [33]:
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,Senior,High,2.0,2.92000,retired
1,30.189017,Adult,High,1.0,34.28000,freelancer
2,21.118382,Adult,Medium,2.0,36.64000,freelancer
3,45.535900,Young,High,1.0,3.34000,student
4,24.296875,Senior,Medium,2.0,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,Adult,Medium,2.0,19.64000,business_owner
96,47.984483,Adult,High,1.0,34.01000,private_job
97,18.765432,Middle Aged,Medium,1.0,44.86000,freelancer
98,30.521676,Adult,High,1.0,28.30000,business_owner


In [35]:
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [37]:
# Define categorical and numerical featues
categorical_features = ['age_group', 'lifestyle_risk', 'occupation', 'city_tier']
numeric_features = ['bmi', 'income_lpa']

In [39]:
# Create Column Transformer for OHE

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', "passthrough", numeric_features)
    ]
)

In [41]:
# Create a pipeline with preprocessing and random forest classifier

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=2))
])

In [43]:
# Split data and train model

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)
pipeline.fit(X_train, y_train)

In [45]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.55

In [47]:
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
99,27.688778,Adult,Medium,1.0,28.16664,government_job
0,49.227482,Senior,High,2.0,2.92,retired
3,45.5359,Young,High,1.0,3.34,student
23,22.187855,Adult,Medium,2.0,23.71,unemployed
30,29.937519,Adult,Medium,1.0,32.97,business_owner


In [49]:
import pickle

# save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)