In [31]:
# Import all the required libraries to create a AdaBoost Regressor Model

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score

In [32]:
# Import data set
Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\DataSet\insurance_pre.csv")

In [33]:
Data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


## From above input we can see that there are two category columns and four numerical columns

In [34]:
Data.shape

(1338, 6)

## From above input we can see that there are 1338 rows and 6 columns


In [35]:
# Check if there is any NA values in the data set
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,sex,bmi,children,smoker,charges


In [36]:
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,sex,bmi,children,smoker,charges


## From above two commands we can see there is no any NA/Null values in the dataset

In [37]:
# Lets convert the Nominal categorical columns to numeric columns via one-hot encoding using get_dummies method using pandas

Data = pd.get_dummies(Data,drop_first=True)

In [38]:
Data.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


In [39]:
# Re-arrange the columns for better visuslaization 

Data = Data.loc[:,['age','bmi','children','sex_male','smoker_yes','charges']]

In [40]:
Data.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges
0,19,27.9,0,0,1,16884.924
1,18,33.77,1,1,0,1725.5523
2,28,33.0,3,1,0,4449.462
3,33,22.705,0,1,0,21984.47061
4,32,28.88,0,1,0,3866.8552


In [41]:
# Lets split data into dependent/output and independent/Input Variables

dependent = Data[['charges']]
independent = Data[['age','bmi','children','sex_male','smoker_yes']]

In [42]:
dependent.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [43]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,0,1
1,18,33.77,1,1,0
2,28,33.0,3,1,0
3,33,22.705,0,1,0
4,32,28.88,0,1,0


In [44]:
# Convert the all the columns values into same scale using standardization

std = StandardScaler()
independent = std.fit_transform(independent)

In [45]:
independent

array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [46]:
# converting independent back to data frame for visualizing the data

independent = pd.DataFrame(independent,columns=['age','bmi','children','sex_male','smoker_yes']) 

In [47]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,-1.438764,-0.45332,-0.908614,-1.010519,1.970587
1,-1.509965,0.509621,-0.078767,0.989591,-0.507463
2,-0.797954,0.383307,1.580926,0.989591,-0.507463
3,-0.441948,-1.305531,-0.908614,0.989591,-0.507463
4,-0.513149,-0.292556,-0.908614,0.989591,-0.507463


In [48]:
# Lets divide the training and test data for creating and testing the model

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [49]:
X_Train.shape

(936, 5)

In [50]:
X_Test.shape

(402, 5)

In [51]:
# Lets create the model using the training data

AdaBoost_Regressor_Model = AdaBoostRegressor()
AdaBoost_Regressor_Model.fit(X_Train,Y_Train)

  y = column_or_1d(y, warn=True)


In [52]:
# Test the model using our test data

predicted_y = AdaBoost_Regressor_Model.predict(X_Test)

In [53]:
# Lets validate the R_score value

R2_Score = r2_score(Y_Test,predicted_y)

In [54]:
R2_Score

0.8657785711547105

In [67]:
# Lets import and create instance of all the linear models and we can use those models as base estimator for boosting the accuracy
L = LinearRegression()
D = DecisionTreeRegressor()
S = SVR()

## Lets try to create the evaluation metric grid/table with different parameter options

In [75]:
Columns = ['S.NO','estimator','n_estimators','loss','R2_Value']
#estimator = ['LinearRegression', 'DecisionTreeRegressor', 'SVM']
estimator = [L,D,S]
n_estimators = [i for i in range(50,1500,50)]
loss=['linear', 'square', 'exponential']

In [76]:
import warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)
Final_Series =[]
S_No = 1
Greater_R2_Score = 0
High_Performance_Comb = []
Final_Series =[]
for e in estimator:
    for n in n_estimators:
        for l in loss:
            dummy_Series=[]
            dummy_Series.append(S_No)
            dummy_Series.append(e)
            dummy_Series.append(n)
            dummy_Series.append(l)
            Model = AdaBoostRegressor(base_estimator=e,n_estimators=n,loss=l)
            Model.fit(X_Train,Y_Train)
            Predicted_Y = Model.predict(X_Test)
            R2_score = r2_score(Y_Test,Predicted_Y)
            dummy_Series.append(R2_score)
            if R2_score > Greater_R2_Score:
                R2_Series = []
                Greater_R2_Score=R2_score
                High_Performance_Col =['S.NO','estimator',"splitter",'max_features','R2_score']
                R2_Series.append(S_No)
                R2_Series.append(e)
                R2_Series.append(n)
                R2_Series.append(l)
                R2_Series.append(Greater_R2_Score)
                High_Performance_Comb=np.array(R2_Series)
                #print(High_Performance_Comb)
            else:
                pass
            Final_Series.append(dummy_Series)
            S_No = S_No+1
Final_Series
Model_Evaluation = pd.DataFrame(np.array(Final_Series),columns=Columns)
Model_Evaluation = Model_Evaluation.astype({'S.NO':'int'})
Model_Evaluation
High_Performance = pd.DataFrame([High_Performance_Comb],columns=High_Performance_Col)
High_Performance = High_Performance.astype({'S.NO':'int'})
Model_Evaluation

Unnamed: 0,S.NO,estimator,n_estimators,loss,R2_Value
0,1,LinearRegression(),50,linear,0.769919
1,2,LinearRegression(),50,square,0.765077
2,3,LinearRegression(),50,exponential,0.704444
3,4,LinearRegression(),100,linear,0.783143
4,5,LinearRegression(),100,square,0.757786
...,...,...,...,...,...
256,257,SVR(),1400,square,-0.071405
257,258,SVR(),1400,exponential,-0.153986
258,259,SVR(),1450,linear,0.003413
259,260,SVR(),1450,square,-0.000477


In [90]:
# this combination from the above evaluation rsults has the greatest R2_Score
High_Performance

Unnamed: 0,S.NO,estimator,n_estimators,loss,R2_score
0,99,DecisionTreeRegressor(),200,exponential,0.854499


In [79]:
Model_Evaluation

Unnamed: 0,S.NO,estimator,n_estimators,loss,R2_Value
0,1,LinearRegression(),50,linear,0.769919
1,2,LinearRegression(),50,square,0.765077
2,3,LinearRegression(),50,exponential,0.704444
3,4,LinearRegression(),100,linear,0.783143
4,5,LinearRegression(),100,square,0.757786
...,...,...,...,...,...
256,257,SVR(),1400,square,-0.071405
257,258,SVR(),1400,exponential,-0.153986
258,259,SVR(),1450,linear,0.003413
259,260,SVR(),1450,square,-0.000477


In [78]:
# As the Model_Evaluation from abovve output has lot of rows so we are saving the dataframe to csv file for verifying all the combinations 
Model_Evaluation.to_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\AdaBoost_Evaluation.csv")

In [144]:
AdaBoost_Regressor_Model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=200,loss='exponential',random_state=50)
AdaBoost_Regressor_Model.fit(X_Train,Y_Train)

In [145]:
Predicted_Y = AdaBoost_Regressor_Model.predict(X_Test)
R2_score = r2_score(Y_Test,Predicted_Y)

In [146]:
R2_score

0.8545024319300922

In [147]:
# Save the model using pickle
pickle.dump(AdaBoost_Regressor_Model,open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models\AdaBoost_Regressor_Model.sav","wb"))