In [1]:
import pandas as pd
# Load the insurance records; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [2]:
df.head(10)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High
5,53,62.9,1.66,50.0,False,Kota,freelancer,Medium
6,19,80.1,1.68,3.59,True,Hyderabad,student,Medium
7,31,105.7,1.78,10.865821,True,Delhi,government_job,Medium
8,73,58.0,1.58,1.78,False,Chandigarh,retired,Medium
9,58,74.4,1.73,43.07,False,Pune,business_owner,Low


In [3]:
def bmi(height, weight):
    """Return the body-mass index, rounded to two decimal places.

    Computed as ``weight / height ** 2``.

    Args:
        height: Height value (presumably metres -- confirm against the data).
        weight: Weight value (presumably kilograms -- confirm against the data).

    Returns:
        The quotient rounded with ``round(..., 2)``.
    """
    height_squared = height ** 2
    ratio = weight / height_squared
    return round(ratio, 2)

In [4]:
bmi(1.75,44)

14.37

In [5]:
def age_categorical(age):
    """Bucket a numeric age into a coarse category label.

    Args:
        age: Numeric age.

    Returns:
        "Under-Aged" for 0 <= age < 18,
        "Middle-Aged" for 18 <= age <= 45,
        "Over-Aged" for age > 45,
        None when the age fits none of these ranges (negative or NaN).
    """
    if 0 <= age < 18:
        return "Under-Aged"
    if 18 <= age <= 45:
        return "Middle-Aged"
    if age > 45:
        return "Over-Aged"
    # Negative or NaN ages match none of the ranges above. Report them as
    # missing instead of letting a catch-all branch label them "Over-Aged".
    return None

In [6]:
df.city.unique()

array(['Jaipur', 'Chennai', 'Indore', 'Mumbai', 'Kota', 'Hyderabad',
       'Delhi', 'Chandigarh', 'Pune', 'Kolkata', 'Lucknow', 'Gaya',
       'Jalandhar', 'Mysore', 'Bangalore'], dtype=object)

In [7]:
# Derive a 'bmi' column by passing each row's height and weight to bmi().
df['bmi'] = df.apply(lambda record: bmi(record['height'], record['weight']), axis='columns')

In [8]:
# Replace the numeric ages with their category labels.
# This cell overwrites its own input, so it is guarded on the column dtype:
# once 'age' holds labels, calling age_categorical on them again would raise
# TypeError (comparing str with int). Re-running the cell is therefore a no-op.
if pd.api.types.is_numeric_dtype(df['age']):
    df['age'] = df['age'].apply(age_categorical)

In [9]:
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its smoking flag and BMI.

    Args:
        row: Mapping-like record exposing the keys 'smoker' and 'bmi'.

    Returns:
        'high'   when the record is a smoker and bmi > 30,
        "medium" when it is a smoker or bmi > 27 (but not both conditions of 'high'),
        "low"    otherwise.
    """
    smokes = row['smoker']
    body_mass = row['bmi']
    if smokes:
        # Smokers are never "low": BMI only decides between high and medium.
        return 'high' if body_mass > 30 else "medium"
    return "medium" if body_mass > 27 else "low"

In [10]:
# Label every row with its lifestyle risk (derived from the row's 'smoker' and 'bmi').
df['lifestyle_risk'] = df.apply(func=lifestyle_risk, axis='columns')

In [11]:
# City names treated as tier 1. Spellings must match the values in the 'city'
# column exactly; "Kolkata" was previously misspelled "Kolkota", so it never
# matched the dataset's 'Kolkata' rows.
tier_1_cities=["Mumbai","Delhi","Bangalore","Chennai","Kolkata","Hyderabad","Pune"]
# City names treated as tier 2. Any city in neither list is handled by the caller.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]


In [12]:
def city_tier(city):
    """Return the tier number for a city name.

    Membership is an exact match against the module-level lists
    ``tier_1_cities`` and ``tier_2_cities``.

    Args:
        city: City name to look up.

    Returns:
        1 if the city is in ``tier_1_cities``, 2 if it is in
        ``tier_2_cities``, and 3 for every other value.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3


In [13]:
# Translate each city name into its tier number.
df['city_tier'] = df.loc[:, 'city'].apply(city_tier)

In [14]:
# Remove the raw inputs that the derived columns (bmi, lifestyle_risk,
# city_tier) were computed from. errors='ignore' keeps this cell re-runnable:
# on a second run the columns are already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['weight', 'height', 'smoker', 'city'], errors='ignore')

In [15]:
df.columns

Index(['age', 'income_lpa', 'occupation', 'insurance_premium_category', 'bmi',
       'lifestyle_risk', 'city_tier'],
      dtype='object')

In [16]:
# Feature matrix: the six predictor columns, in this fixed order.
X = df.loc[:, ['age', 'income_lpa', 'occupation', 'bmi', 'lifestyle_risk', 'city_tier']]
# Target vector: the premium category to be predicted.
y = df.loc[:, 'insurance_premium_category']

In [17]:
X,y

(            age  income_lpa      occupation    bmi lifestyle_risk  city_tier
 0     Over-Aged     2.92000         retired  49.23         medium          2
 1   Middle-Aged    34.28000      freelancer  30.19         medium          1
 2   Middle-Aged    36.64000      freelancer  21.12            low          2
 3   Middle-Aged     3.34000         student  45.54           high          1
 4     Over-Aged     3.94000         retired  24.30         medium          2
 ..          ...         ...             ...    ...            ...        ...
 95  Middle-Aged    19.64000  business_owner  21.42            low          2
 96  Middle-Aged    34.01000     private_job  47.98         medium          1
 97    Over-Aged    44.86000      freelancer  18.77            low          1
 98  Middle-Aged    28.30000  business_owner  30.52         medium          3
 99  Middle-Aged    28.16664  government_job  27.69         medium          1
 
 [100 rows x 6 columns],
 0       High
 1        Low
 2       

In [18]:
# Column names handled as categories. The order is significant: it fixes the
# column order produced by any transformer these names are passed to.
categorical_features = [
    "age",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]
# Column names handled as plain numeric values.
numeric_features = [
    "bmi",
    "income_lpa",
]

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [20]:
# Column-wise preprocessing: one-hot encode the categorical features and pass
# the numeric features through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was absent from the fitted
        # data (easy with a small training split, or a new value at inference
        # time) is encoded as all zeros instead of making transform/predict
        # raise. Encoding of categories seen during fit is unaffected.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [21]:
# End-to-end model: preprocessing followed by a random forest.
# random_state is fixed so that repeated fits give the same forest.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [22]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
# Fit the preprocessing and the classifier on the training portion only.
pipeline.fit(X_train, y_train)


In [23]:
# Predict the held-out rows and display the fraction classified correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.75

In [24]:
import pickle


# Serialise the fitted pipeline (preprocessing + classifier) to disk.
# Note: pickle files should only ever be loaded from a trusted source.
pickle_model_path = "model.pkl"
with open(pickle_model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)