In [2]:
# Import all the required libraries to create a Decision Tree Regressor Model

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [3]:
# Import data set
Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\DataSet\insurance_pre.csv")

In [4]:
Data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


## From above input we can see that there are two category columns and four numerical columns

In [5]:
Data.shape

(1338, 6)

## From above input we can see that there are 1338 rows and 6 columns


In [6]:
# Check if there is any NA values in the data set
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,sex,bmi,children,smoker,charges


In [7]:
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,sex,bmi,children,smoker,charges


## From above two commands we can see there is no any NA/Null values in the dataset

In [8]:
# Lets convert the Nominal categorical columns to numeric columns via one-hot encoding using get_dummies method using pandas

Data = pd.get_dummies(Data,drop_first=True)

In [9]:
Data.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


In [10]:
# Re-arrange the columns for better visuslaization 

Data = Data.loc[:,['age','bmi','children','sex_male','smoker_yes','charges']]

In [11]:
Data.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges
0,19,27.9,0,0,1,16884.924
1,18,33.77,1,1,0,1725.5523
2,28,33.0,3,1,0,4449.462
3,33,22.705,0,1,0,21984.47061
4,32,28.88,0,1,0,3866.8552


In [12]:
# Lets split data into dependent/output and independent/Input Variables

dependent = Data[['charges']]
independent = Data[['age','bmi','children','sex_male','smoker_yes']]

In [13]:
dependent.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [14]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,0,1
1,18,33.77,1,1,0
2,28,33.0,3,1,0
3,33,22.705,0,1,0
4,32,28.88,0,1,0


In [15]:
# Convert the all the columns values into same scale using standardization

std = StandardScaler()
independent = std.fit_transform(independent)

In [16]:
independent

array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [17]:
# converting independent back to data frame for visualizing the data

independent = pd.DataFrame(independent,columns=['age','bmi','children','sex_male','smoker_yes']) 

In [18]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,-1.438764,-0.45332,-0.908614,-1.010519,1.970587
1,-1.509965,0.509621,-0.078767,0.989591,-0.507463
2,-0.797954,0.383307,1.580926,0.989591,-0.507463
3,-0.441948,-1.305531,-0.908614,0.989591,-0.507463
4,-0.513149,-0.292556,-0.908614,0.989591,-0.507463


In [19]:
# Lets divide the training and test data for creating and testing the model

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [20]:
X_Train.shape

(936, 5)

In [21]:
X_Test.shape

(402, 5)

In [22]:
# Lets create the model using the training data

RandomForest_Regressor_Model = RandomForestRegressor()
RandomForest_Regressor_Model.fit(X_Train,Y_Train)

  RandomForest_Regressor_Model.fit(X_Train,Y_Train)


In [23]:
# Test the model using our test data

predicted_y = RandomForest_Regressor_Model.predict(X_Test)

In [24]:
# Lets validate the R_score value

R2_Score = r2_score(Y_Test,predicted_y)

In [25]:
R2_Score

0.8556556063962015

## Lets try to create the evaluation metric grid/table with different parameter options

In [26]:
Columns = ['S.NO','criterion','n_estimators','max_features','R2_Value']
criterion = ['squared_error', 'friedman_mse', 'absolute_error']
n_estimators = [100,500,1000]
max_features=['auto','sqrt','log2']

In [28]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
# from sklearn.exceptions import DataConversionWarning
# warnings.filterwarnings("ignore", category=DataConversionWarning)
Final_Series =[]
S_No = 1
Greater_R2_Score = 0
High_Performance_Comb = []
Final_Series =[]
for c in criterion:
    for n in n_estimators:
        for m in max_features:
            dummy_Series=[]
            dummy_Series.append(S_No)
            dummy_Series.append(c)
            dummy_Series.append(n)
            dummy_Series.append(m)
            RandomForest_Regressor_Model = RandomForestRegressor(criterion=c,n_estimators=n,max_features=m)
            RandomForest_Regressor_Model.fit(X_Train,Y_Train)
            Predicted_Y = RandomForest_Regressor_Model.predict(X_Test)
            R2_score = r2_score(Y_Test,Predicted_Y)
            dummy_Series.append(R2_score)
            if R2_score > Greater_R2_Score:
                R2_Series = []
                Greater_R2_Score=R2_score
                High_Performance_Col =['S.NO','criterion',"splitter",'max_features','R2_score']
                R2_Series.append(S_No)
                R2_Series.append(c)
                R2_Series.append(n)
                R2_Series.append(m)
                R2_Series.append(Greater_R2_Score)
                High_Performance_Comb=np.array(R2_Series)
                #print(High_Performance_Comb)
            else:
                pass
            Final_Series.append(dummy_Series)
            S_No = S_No+1
Final_Series
Model_Evaluation = pd.DataFrame(np.array(Final_Series),columns=Columns)
Model_Evaluation = Model_Evaluation.astype({'S.NO':'int'})
Model_Evaluation
High_Performance = pd.DataFrame([High_Performance_Comb],columns=High_Performance_Col)
High_Performance = High_Performance.astype({'S.NO':'int'})
Model_Evaluation

  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomForest_Regressor_Model.fit(X_Train,Y_Train)
  RandomFore

Unnamed: 0,S.NO,criterion,n_estimators,max_features,R2_Value
0,1,squared_error,100,auto,0.851720798251022
1,2,squared_error,100,sqrt,0.8710604257002563
2,3,squared_error,100,log2,0.8726118782212714
3,4,squared_error,500,auto,0.8561810191197293
4,5,squared_error,500,sqrt,0.872232416041898
5,6,squared_error,500,log2,0.872706207767189
6,7,squared_error,1000,auto,0.8582726528883846
7,8,squared_error,1000,sqrt,0.8735532881013544
8,9,squared_error,1000,log2,0.8731537298708718
9,10,friedman_mse,100,auto,0.8565822313598334


In [29]:
High_Performance

Unnamed: 0,S.NO,criterion,splitter,max_features,R2_score
0,26,absolute_error,1000,sqrt,0.8754926480059106


In [30]:
Model_Evaluation

Unnamed: 0,S.NO,criterion,n_estimators,max_features,R2_Value
0,1,squared_error,100,auto,0.851720798251022
1,2,squared_error,100,sqrt,0.8710604257002563
2,3,squared_error,100,log2,0.8726118782212714
3,4,squared_error,500,auto,0.8561810191197293
4,5,squared_error,500,sqrt,0.872232416041898
5,6,squared_error,500,log2,0.872706207767189
6,7,squared_error,1000,auto,0.8582726528883846
7,8,squared_error,1000,sqrt,0.8735532881013544
8,9,squared_error,1000,log2,0.8731537298708718
9,10,friedman_mse,100,auto,0.8565822313598334


In [31]:
RandomForest_Regressor_Model = RandomForestRegressor(criterion='absolute_error',n_estimators=1000,max_features='sqrt')
RandomForest_Regressor_Model.fit(X_Train,Y_Train)

  RandomForest_Regressor_Model.fit(X_Train,Y_Train)


In [33]:
Predicted_Y = RandomForest_Regressor_Model.predict(X_Test)
R2_score = r2_score(Y_Test,Predicted_Y)

In [34]:
R2_score

0.875186419319372

In [32]:
# Save the model using pickle
pickle.dump(RandomForest_Regressor_Model,open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models\RandomForest_Regressor_Model_Final.sav","wb"))