In [21]:
url = r"C:\Users\princ\OneDrive\Documents\Chapter 1\Python\Machine Learning\ML Data\insurance.csv"

Importing Dependencies

In [47]:
import numpy as np
import pandas as pd 
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [48]:
#Converting the data to a pandas data frame
data = pd.read_csv(url)

In [49]:
#Printing first five rows of the data
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [50]:
#Checking for missing values
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [51]:
data['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [52]:
#Label Encoding 
data.replace(to_replace = ['southwest','southeast','northeast','northwest'],value = [0,1,2,3],inplace = True)
data.replace(to_replace = ['yes','no'],value = [0,1],inplace = True)
data.replace(to_replace = ['male','female'],value = [0,1],inplace = True)

  data.replace(to_replace = ['southwest','southeast','northeast','northwest'],value = [0,1,2,3],inplace = True)
  data.replace(to_replace = ['yes','no'],value = [0,1],inplace = True)
  data.replace(to_replace = ['male','female'],value = [0,1],inplace = True)


In [53]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,0,0,16884.924
1,18,0,33.77,1,1,1,1725.5523
2,28,0,33.0,3,1,1,4449.462
3,33,0,22.705,0,1,3,21984.47061
4,32,0,28.88,0,1,3,3866.8552


In [54]:
#Checking correlation between the data
data.corr()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
age,1.0,0.020856,0.109272,0.042469,0.025019,-0.003243,0.299008
sex,0.020856,1.0,-0.046371,-0.017163,0.076185,0.007974,-0.057292
bmi,0.109272,-0.046371,1.0,0.012759,-0.00375,-0.156686,0.198341
children,0.042469,-0.017163,0.012759,1.0,-0.007673,0.001907,0.067998
smoker,0.025019,0.076185,-0.00375,-0.007673,1.0,0.013246,-0.787251
region,-0.003243,0.007974,-0.156686,0.001907,0.013246,1.0,-0.011741
charges,0.299008,-0.057292,0.198341,0.067998,-0.787251,-0.011741,1.0


As we can see the charges has more correlation with the age , bmi , smoker so we can neglect the other ones

In [80]:
#Splitting the dataset into features and target
X = data.drop(columns = ['charges'],axis = 1)
y = data['charges']

In [81]:
X = np.asarray(X)
Y = np.asarray(y)

In [82]:
#Importing the models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [83]:
models = [LinearRegression(),DecisionTreeRegressor(max_depth = 5),RandomForestRegressor(n_estimators = 100,max_depth =5 ),xgb.XGBRegressor(n_estimators = 100,learning_rate = 0.01,max_depth = 5)]

In [84]:
i=0
for model in models:
    model.fit(X,Y)
    print('for ',model)
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"Average mean R2 score is: {round(scores.mean(),3)}")
    print("----------------------------------------------------------")
    

for  LinearRegression()
Average mean R2 score is: 0.747
----------------------------------------------------------
for  DecisionTreeRegressor(max_depth=5)
Average mean R2 score is: 0.842
----------------------------------------------------------
for  RandomForestRegressor(max_depth=5)
Average mean R2 score is: 0.858
----------------------------------------------------------
for  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
             max_leaves=None, min_child_weight=None, missing=nan,
      

As we can see the best model in these are Random Forest Regressor with R2 score of 0.858

In [85]:
#Now printing the prediction values of the RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 56)
Predictions = models[2].predict(X_test)
print(Predictions[:5])

[ 5068.94611517  2753.75840625  4623.04772093  7611.46911247
 39435.08434257]


In [92]:
#Making a Predictive system
input_data = (19,1,27.900,0,0,0)
#Changing the data to numpy array
input_data_as_numpy_array = np.asarray(input_data)
#Reshaping the array as we are predicting the one instance
reshaped_input_data = input_data_as_numpy_array.reshape(1,-1)
prediction = models[2].predict(reshaped_input_data)
print(prediction)

[17597.08345311]


Saving The Trained Model

In [93]:
import pickle

In [94]:
filename = 'Trained_model.sav'
pickle.dump(models[2],open(filename,'wb'))

In [95]:
#loading the saved model
loaded_model = pickle.load(open('Trained_model.sav','rb'))

In [96]:
 #Making a Predictive system
input_data = (19,1,27.900,0,0,0)
#Changing the data to numpy array
input_data_as_numpy_array = np.asarray(input_data)
#Reshaping the array as we are predicting the one instance
reshaped_input_data = input_data_as_numpy_array.reshape(1,-1)
prediction = loaded_model.predict(reshaped_input_data)
print(prediction)

[17597.08345311]
