<h1>Medical Insurance Price Predictor</h1>

***Importing the necessary libraries***

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

***Loading the dataset***

In [2]:
df1 = pd.read_csv("insurance.csv")
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


***For knowing different types of elements in "region"***

In [3]:
df1.groupby('region')['region'].agg('count')

region
northeast    324
northwest    325
southeast    364
southwest    325
Name: region, dtype: int64

***For knowing different types of elements in "children"***

In [4]:
df1.groupby('children')['children'].agg('count')

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

***For knowing different types of elements in "sex"***

In [5]:
df1.groupby('sex')['sex'].agg('count')

sex
female    662
male      676
Name: sex, dtype: int64

***Now, to know if there are any null values, we use the code given below***

In [6]:
df1.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

***Let's replace the variable datatypes to numbers in the given dataset***

In [7]:
df1['sex'] = df1['sex'].replace({'male': 0, 'female': 1})
df1['smoker'] = df1['smoker'].replace({'no': 0, 'yes': 1})
df1['region'] = df1['region'].replace({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})

df2 = df1.copy()

df2

  df1['sex'] = df1['sex'].replace({'male': 0, 'female': 1})
  df1['smoker'] = df1['smoker'].replace({'no': 0, 'yes': 1})
  df1['region'] = df1['region'].replace({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.900,0,1,3,16884.92400
1,18,0,33.770,1,0,2,1725.55230
2,28,0,33.000,3,0,2,4449.46200
3,33,0,22.705,0,0,1,21984.47061
4,32,0,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,1,10600.54830
1334,18,1,31.920,0,0,0,2205.98080
1335,18,1,36.850,0,0,2,1629.83350
1336,21,1,25.800,0,0,3,2007.94500


***Model Building***

In [8]:
X = df2.drop(['charges'],axis='columns')
y = df2.charges

In [9]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.900,0,1,3
1,18,0,33.770,1,0,2
2,28,0,33.000,3,0,2
3,33,0,22.705,0,0,1
4,32,0,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,0,30.970,3,0,1
1334,18,1,31.920,0,0,0
1335,18,1,36.850,0,0,2
1336,21,1,25.800,0,0,3


In [10]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [12]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)

lr_clf.score(X_test,y_test)

0.7445422986536502

***Using K Fold Cross Validation for measuring accuracy of our model***

In [13]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.79987471, 0.74792712, 0.71092717, 0.77433601, 0.80803446])

***Finding best model using GridSearchCV***

In [14]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import pandas as pd

def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
                'copy_X': [True, False],
                'n_jobs': [None, -1]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'adaboost': {
            'model': AdaBoostRegressor(),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1, 1]
            }
        },
        'xgboost': {
            'model': xgb.XGBRegressor(objective='reg:squarederror'),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        },
        'catboost': {
            'model': CatBoostRegressor(verbose=0),
            'params': {
                'iterations': [100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'depth': [3, 5, 7]
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

find_best_model_using_gridsearchcv(X, y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.76822,"{'copy_X': True, 'fit_intercept': True, 'n_job..."
1,lasso,0.76822,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.733945,"{'criterion': 'squared_error', 'splitter': 'ra..."
3,adaboost,0.873003,"{'learning_rate': 0.01, 'n_estimators': 50}"
4,xgboost,0.887454,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti..."
5,random_forest,0.877178,"{'max_depth': 20, 'min_samples_split': 10, 'n_..."
6,catboost,0.885135,"{'depth': 3, 'iterations': 100, 'learning_rate..."


***The best score is given by xgboost soo we shall continue using that for training the model***

In [15]:
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [16]:
model.score(X_test, y_test)

0.8572735666934079

In [17]:
model.predict(X_test)

array([ 4949.773 , 12190.754 , 12797.387 ,  5004.763 ,  8587.966 ,
        8874.829 ,  4200.1885,  2155.5305, 19162.637 ,  8185.293 ,
       12855.317 ,  3653.4578, 19155.586 ,  3277.3042, 10564.428 ,
       18459.729 ,  3504.036 ,  7310.2056, 20496.643 ,  2217.8948,
       12296.921 ,  2877.323 , 40270.086 , 20136.021 , 38024.58  ,
       10320.941 ,  6569.703 ,  7652.157 ,  6155.386 ,  4296.375 ,
        7163.8306,  6617.2437,  7122.614 ,  4746.4014,  9141.639 ,
        5596.164 , 38329.875 ,  5328.818 , 17550.896 , 15015.674 ,
        1842.9272, 34630.414 ,  7849.6914,  1406.4437,  7946.988 ,
        6424.9897, 10538.447 ,  7638.087 ,  4556.4946, 11003.752 ,
        9026.608 ,  1219.3291, 17744.363 , 43636.312 ,  8922.025 ,
       12068.327 ,  4230.266 ,  8111.4414,  7782.8984, 39923.79  ,
       16124.178 ,  4382.6504,  4620.831 , 43026.426 ,  1067.0771,
       11853.085 , 11762.32  ,  5879.2666, 13855.747 ,  8872.855 ,
       11851.696 , 16305.905 , 11731.614 ,  7798.3667, 13771.7

In [18]:
df2

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.900,0,1,3,16884.92400
1,18,0,33.770,1,0,2,1725.55230
2,28,0,33.000,3,0,2,4449.46200
3,33,0,22.705,0,0,1,21984.47061
4,32,0,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,1,10600.54830
1334,18,1,31.920,0,0,0,2205.98080
1335,18,1,36.850,0,0,2,1629.83350
1336,21,1,25.800,0,0,3,2007.94500


In [19]:
model.predict([[61,1,25.84,0,0,1]])

array([18802.395], dtype=float32)

***Saving and Exporting the model to pickle file***

In [20]:
import pickle
with open('medical_insurance.pickle','wb') as f:
    pickle.dump(model,f)

***Loading the trained model***

In [21]:
loaded_model = pickle.load(open('medical_insurance.pickle', 'rb'))

***Sample Testing of loaded model***

In [22]:
loaded_model.predict([[61,1,29.070,0,1,1]])

array([29163.031], dtype=float32)