In [49]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [50]:
df = pd.read_csv('insurance.csv')

In [51]:
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [52]:
df_feat = df.copy() # make the copy of the data set so we can add the bmi and the other things 


In [53]:
# calculating the bmi from the height and the weight
df_feat['bmi'] = df_feat['weight']/(df_feat['height']**2)

In [54]:
# now we will convert the age into the age group
def age_group(age: int):
  if age < 25:
    return "young"
  elif age < 45:
    return "adult"
  elif age < 60:
    return "middle-age"
  else:
    return "senior"

In [55]:
# now we will make this column
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [56]:
# now we will make the feature 3
def life_style(row):
  if row['smoker'] and row['bmi'] > 30:
    return "high"
  elif row["smoker"] or row['bmi'] > 27:
    return 'medium'
  else:
    return 'low'

In [57]:
df_feat['lifestyle_risk'] = df_feat.apply(life_style, axis = 1)



In [58]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]

tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [59]:
def city_tier(city):
  if city in tier_1_cities:
    return 1
  elif city in tier_2_cities:
    return 2
  else:
    return 3

In [60]:
df_feat["city_tier"] = df_feat['city'].apply(city_tier)

In [61]:
df_feat = df_feat.drop(columns= ['age', 'weight', 'height', 'smoker', 'city'])

In [62]:
x = df_feat.drop(columns=['insurance_premium_category'])
y = df_feat[['insurance_premium_category']]

In [63]:
x

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,49.227482,senior,medium,2
1,34.28000,freelancer,30.189017,adult,medium,1
2,36.64000,freelancer,21.118382,adult,low,2
3,3.34000,student,45.535900,young,high,1
4,3.94000,retired,24.296875,senior,medium,2
...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,2
96,34.01000,private_job,47.984483,adult,medium,1
97,44.86000,freelancer,18.765432,middle-age,low,1
98,28.30000,business_owner,30.521676,adult,medium,1


In [64]:
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [65]:
categorical = ['age_group', 'lifestyle_risk', 'occupation', 'city_tier']
numerical = ['bmi', 'income_lpa']

In [66]:
preprocessor = ColumnTransformer(
  transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numerical)
  ]
)


In [67]:
pipeline = Pipeline(steps=[
  ("preprocessor", preprocessor),
  ("classifier", RandomForestClassifier(random_state=42))
])

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

In [69]:
pipeline.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [70]:
y_pred = pipeline.predict(x_test)
accuracy_score(y_test, y_pred)

0.9

In [71]:
import pickle

In [72]:
path = "model.pkl"
with open(path, "wb") as f:
  pickle.dump(pipeline, f)