In [3]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn. preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [4]:
df=pd.read_csv('insurance.csv')

In [5]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
47,55,116.4,1.87,8.34,False,Chandigarh,private_job,Medium
68,20,80.3,1.87,0.68,False,Lucknow,student,Low
55,47,75.7,1.73,24.93,False,Delhi,unemployed,Low
30,35,89.6,1.73,32.97,False,Delhi,business_owner,Low


In [6]:
df_feat=df.copy()

In [7]:
df_feat["bmi"]=df_feat["weight"]/(df_feat["height"]/100)**2

In [8]:
def age_group(age):
    if age < 18:
        return 'child'
    elif 18 <= age < 35:
        return 'youth'
    elif 35 <= age < 60:
        return 'adult'
    else:
        return 'senior' 

In [9]:
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [10]:
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return 'high'
    elif row["smoker"] or row["bmi"] > 27:
        return 'medium'
    else:
        return 'low'
    

In [11]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [12]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
"Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
"Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
"Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
"Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
"Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
"Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri", "Jhansi", "Ajmer"]

In [13]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3


In [15]:
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [16]:
df_feat.drop(columns=['age','weight','height','smoker','city'])[['income_lpa','occupation','bmi','age_group','lifestyle_risk','city_tier']]

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,492274.819198,senior,medium,2
1,34.28000,freelancer,301890.172893,adult,medium,1
2,36.64000,freelancer,211183.819155,adult,medium,2
3,3.34000,student,455359.001041,youth,high,1
4,3.94000,retired,242968.750000,senior,high,2
...,...,...,...,...,...,...
95,19.64000,business_owner,214207.472920,adult,medium,2
96,34.01000,private_job,479844.830494,youth,medium,1
97,44.86000,freelancer,187654.320988,adult,medium,1
98,28.30000,business_owner,305216.761261,youth,medium,1


In [19]:
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]


In [21]:
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,492274.819198,senior,medium,2,2.92000,retired
1,301890.172893,adult,medium,1,34.28000,freelancer
2,211183.819155,adult,medium,2,36.64000,freelancer
3,455359.001041,youth,high,1,3.34000,student
4,242968.750000,senior,high,2,3.94000,retired
...,...,...,...,...,...,...
95,214207.472920,adult,medium,2,19.64000,business_owner
96,479844.830494,youth,medium,1,34.01000,private_job
97,187654.320988,adult,medium,1,44.86000,freelancer
98,305216.761261,youth,medium,1,28.30000,business_owner


In [22]:
categorical_features=['age_group', 'lifestyle_risk', 'occupation','city_tier']
numerical_features=['bmi','income_lpa'] 

In [None]:
##creating column transfromeer for ohe
preprocessor=ColumnTransformer(
    transformers=[("cat",OneHotEncoder(),categorical_features),("num","passthrough",numerical_features)]
)

In [24]:
# creating a popeling woth preprocessing and random forest classifier
pipeline=Pipeline(steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state=42))])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train,y_train)

In [27]:
y_pred=pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.9

In [29]:
import pickle
pickle_model_path="modell.pkl"
with open(pickle_model_path,'wb') as f:
    pickle.dump(pipeline,f)