In [1]:
import numpy as np
import pandas as pd

In [2]:
df_original = pd.read_csv('insurance.csv')
df = df_original.copy()
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         100 non-null    int64  
 1   weight                      100 non-null    float64
 2   height                      100 non-null    float64
 3   income_lpa                  100 non-null    float64
 4   smoker                      100 non-null    bool   
 5   city                        100 non-null    object 
 6   occupation                  100 non-null    object 
 7   insurance_premium_category  100 non-null    object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 5.7+ KB


In [4]:
df.isnull().sum()

age                           0
weight                        0
height                        0
income_lpa                    0
smoker                        0
city                          0
occupation                    0
insurance_premium_category    0
dtype: int64

In [5]:
df['bmi'] = df['weight']/((df['height'])**2)
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875


In [6]:
def age_gre(age):
    if age < 25:
        return "young"
    elif age >=25 and age < 45:
        return "adult"
    elif age >= 45 and age < 60:
        return "middle_aged"
    else:
        return "senior"
    
df['age_group'] = df['age'].apply(age_gre)
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior


In [7]:
def risk(bmi , smoke):
    if smoke and bmi > 30:
        return 3
    elif (smoke and bmi > 27) or (not smoke and bmi > 27):
        return 2
    else:
        return 1
    
risk_vectorized = np.vectorize(risk)
df['risk'] = risk_vectorized(df['bmi'], df['smoker'])
df.head()  

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,risk
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior,2
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult,2
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult,1
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young,3
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior,1


In [8]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri", "Jaipur"
]

In [9]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

df["city_tier"] = df["city"].apply(city_tier)
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,risk,city_tier
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior,2,2
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult,2,1
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult,1,2
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young,3,1
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior,1,2


In [10]:
cols_to_drop = ['age' , 'weight' , 'height' , 'smoker' , 'city']
df.drop(cols_to_drop , axis = 1 , inplace = True)

In [11]:
df.head()

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,risk,city_tier
0,2.92,retired,High,49.227482,senior,2,2
1,34.28,freelancer,Low,30.189017,adult,2,1
2,36.64,freelancer,Low,21.118382,adult,1,2
3,3.34,student,Medium,45.5359,young,3,1
4,3.94,retired,High,24.296875,senior,1,2


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   income_lpa                  100 non-null    float64
 1   occupation                  100 non-null    object 
 2   insurance_premium_category  100 non-null    object 
 3   bmi                         100 non-null    float64
 4   age_group                   100 non-null    object 
 5   risk                        100 non-null    int64  
 6   city_tier                   100 non-null    int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 5.6+ KB


In [13]:
premium_mapper = {"Low" : 1 , "Medium" : 2  , "High" : 3}
df['insurance_premium_category'] = df['insurance_premium_category'].map(premium_mapper)
df.head()

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,risk,city_tier
0,2.92,retired,3,49.227482,senior,2,2
1,34.28,freelancer,1,30.189017,adult,2,1
2,36.64,freelancer,1,21.118382,adult,1,2
3,3.34,student,2,45.5359,young,3,1
4,3.94,retired,3,24.296875,senior,1,2


In [14]:
str_cols = ['occupation' , 'age_group']
numeric_cols = df.drop(str_cols , axis = 1).columns

numeric_df = df.drop(str_cols , axis = 1)
str_df = df.drop(numeric_cols , axis = 1)

str_cols_encoded = pd.get_dummies(str_df).astype('int')

df_new = pd.concat([numeric_df , str_cols_encoded] , axis = 1)
df_new.head()

Unnamed: 0,income_lpa,insurance_premium_category,bmi,risk,city_tier,occupation_business_owner,occupation_freelancer,occupation_government_job,occupation_private_job,occupation_retired,occupation_student,occupation_unemployed,age_group_adult,age_group_middle_aged,age_group_senior,age_group_young
0,2.92,3,49.227482,2,2,0,0,0,0,1,0,0,0,0,1,0
1,34.28,1,30.189017,2,1,0,1,0,0,0,0,0,1,0,0,0
2,36.64,1,21.118382,1,2,0,1,0,0,0,0,0,1,0,0,0
3,3.34,2,45.5359,3,1,0,0,0,0,0,1,0,0,0,0,1
4,3.94,3,24.296875,1,2,0,0,0,0,1,0,0,0,0,1,0


In [15]:
df = df_new.copy()

In [16]:
X = df.drop("insurance_premium_category" , axis = 1)
y = df['insurance_premium_category']

In [17]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.3 , random_state = 42)


In [18]:
from sklearn.preprocessing import StandardScaler
cols_to_scale = ['bmi', 'income_lpa'] 
sc = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[cols_to_scale] = sc.fit_transform(X_train[cols_to_scale])
X_test_scaled = X_test.copy()
X_test_scaled[cols_to_scale] = sc.transform(X_test[cols_to_scale])

In [19]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 100 , criterion = 'gini')
model.fit(X_train , y_train)
pred = model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test , pred)) 

              precision    recall  f1-score   support

           1       0.62      1.00      0.76         8
           2       0.50      0.44      0.47         9
           3       1.00      0.69      0.82        13

    accuracy                           0.70        30
   macro avg       0.71      0.71      0.68        30
weighted avg       0.75      0.70      0.70        30



In [21]:
df_original['city'].value_counts

<bound method IndexOpsMixin.value_counts of 0        Jaipur
1       Chennai
2        Indore
3        Mumbai
4        Indore
        ...    
95       Indore
96        Delhi
97    Hyderabad
98      Kolkata
99    Bangalore
Name: city, Length: 100, dtype: object>