In [627]:
# Import all the required libraries to create a XGBoost Regressor Model

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [628]:
# Import data set
Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\DataSet\insurance_pre.csv")

In [629]:
Data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


## From above input we can see that there are two category columns and four numerical columns

In [630]:
Data.shape

(1338, 6)

## From above input we can see that there are 1338 rows and 6 columns


In [631]:
# Check if there is any NA values in the data set
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,sex,bmi,children,smoker,charges


In [632]:
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,sex,bmi,children,smoker,charges


## From above two commands we can see there is no any NA/Null values in the dataset

In [633]:
# Lets convert the Nominal categorical columns to numeric columns via one-hot encoding using get_dummies method using pandas

Data = pd.get_dummies(Data,drop_first=True)

In [634]:
Data.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


In [635]:
# Re-arrange the columns for better visuslaization 

Data = Data.loc[:,['age','bmi','children','sex_male','smoker_yes','charges']]

In [636]:
Data.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges
0,19,27.9,0,0,1,16884.924
1,18,33.77,1,1,0,1725.5523
2,28,33.0,3,1,0,4449.462
3,33,22.705,0,1,0,21984.47061
4,32,28.88,0,1,0,3866.8552


In [637]:
# Lets split data into dependent/output and independent/Input Variables

dependent = Data[['charges']]
independent = Data[['age','bmi','children','sex_male','smoker_yes']]

In [638]:
dependent.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [639]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,0,1
1,18,33.77,1,1,0
2,28,33.0,3,1,0
3,33,22.705,0,1,0
4,32,28.88,0,1,0


In [640]:
# Convert the all the columns values into same scale using standardization

std = StandardScaler()
independent = std.fit_transform(independent)

In [641]:
independent

array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [642]:
# converting independent back to data frame for visualizing the data

independent = pd.DataFrame(independent,columns=['age','bmi','children','sex_male','smoker_yes']) 

In [643]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,-1.438764,-0.45332,-0.908614,-1.010519,1.970587
1,-1.509965,0.509621,-0.078767,0.989591,-0.507463
2,-0.797954,0.383307,1.580926,0.989591,-0.507463
3,-0.441948,-1.305531,-0.908614,0.989591,-0.507463
4,-0.513149,-0.292556,-0.908614,0.989591,-0.507463


In [663]:
# Lets divide the training and test data for creating and testing the model

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [664]:
X_Train.shape

(936, 5)

In [665]:
X_Test.shape

(402, 5)

## Objective states that what kind of problem you are going to solve like as we are going to solve the rgression problem we need to specify the which loss function that algoritm needs to use as metric to calculate the residuals(errors)
                                       (or)
## The objective determines the learning task, thus the type of the target variable. The available options include regression, logistic regression, binary and multi classification or rank. This option allows to apply XGBoost models to several different types of use cases.

## The booster parameter sets the type of learner. Usually this is either a tree or a linear function. In the case of trees, the model will consist of an ensemble of trees. For the linear booster, it will be a weighted sum of linear functions.




In [647]:
# The below are the extra parameters we can use as part of regression
gamma=1
eval_metric='mape'

In [666]:
# Lets create the model using the training data

XGBoost_Regressor_Model = XGBRegressor(objective="reg:squarederror",booster='gbtree',n_estimators=120,enable_categorical=False,learning_rate=1.8,max_depth=2,max_leaves=4,num_parallel_tree=1)
XGBoost_Regressor_Model.fit(X_Train,Y_Train)

In [667]:
# Test the model using our test data

predicted_y = XGBoost_Regressor_Model.predict(X_Test)

In [668]:
# Lets validate the R_score value

R2_Score = r2_score(Y_Test,predicted_y)

In [669]:
R2_Score

0.8286389171545191

## Lets try to create the evaluation metric grid/table with different parameter options

In [652]:
Columns = ['S.NO','objective','n_estimators','booster','learning_rate','R2_Value']
#estimator = ['LinearRegression', 'DecisionTreeRegressor', 'SVM']
objective = ['reg:squarederror','reg:squaredlogerror','reg:absoluteerror']
n_estimators = [i for i in range(50,200,10)]
booster=['gblinear', 'gbtree']
learning_rate = [i for i in np.arange(0,2,0.2)]

In [673]:
import warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)
Final_Series =[]
S_No = 1
Greater_R2_Score = 0
High_Performance_Comb = []
Final_Series =[]
for o in objective:
    for n in n_estimators:
        for b in booster:
            for l in learning_rate:
                dummy_Series=[]
                dummy_Series.append(S_No)
                dummy_Series.append(o)
                dummy_Series.append(n)
                dummy_Series.append(b)
                dummy_Series.append(l)
                Model = XGBRegressor(objective=o,booster=b,n_estimators=n,learning_rate=l)
                Model.fit(X_Train,Y_Train)
                Predicted_Y = Model.predict(X_Test)
                R2_score = r2_score(Y_Test,Predicted_Y)
                dummy_Series.append(R2_score)
                if R2_score > Greater_R2_Score:
                    R2_Series = []
                    Greater_R2_Score=R2_score
                    High_Performance_Col =['S.NO','objective',"n_estimators",'booster','learning_rate','R2_score']
                    R2_Series.append(S_No)
                    R2_Series.append(o)
                    R2_Series.append(n)
                    R2_Series.append(b)
                    R2_Series.append(l)
                    R2_Series.append(Greater_R2_Score)
                    High_Performance_Comb=np.array(R2_Series)
                    #print(High_Performance_Comb)
                else:
                    pass
                Final_Series.append(dummy_Series)
                S_No = S_No+1
Final_Series
Model_Evaluation = pd.DataFrame(np.array(Final_Series),columns=Columns)
Model_Evaluation = Model_Evaluation.astype({'S.NO':'int'})
Model_Evaluation = Model_Evaluation.astype({'n_estimators':'int'})
Model_Evaluation
High_Performance = pd.DataFrame([High_Performance_Comb],columns=High_Performance_Col)
High_Performance = High_Performance.astype({'S.NO':'int'})
Model_Evaluation

Unnamed: 0,S.NO,objective,n_estimators,booster,learning_rate,R2_Value
0,1,reg:squarederror,50,gblinear,0.0,-9.771979443518042e-05
1,2,reg:squarederror,50,gblinear,0.2,0.789477419526325
2,3,reg:squarederror,50,gblinear,0.4,0.7894790251540582
3,4,reg:squarederror,50,gblinear,0.6000000000000001,0.7894790320724943
4,5,reg:squarederror,50,gblinear,0.8,0.7894790366955874
...,...,...,...,...,...,...
895,896,reg:absoluteerror,190,gbtree,1.0,0.7453462746919265
896,897,reg:absoluteerror,190,gbtree,1.2000000000000002,0.6311653271421477
897,898,reg:absoluteerror,190,gbtree,1.4000000000000001,0.2774452790488353
898,899,reg:absoluteerror,190,gbtree,1.6,0.5025661240394284


In [674]:
Model_Evaluation = Model_Evaluation.astype({'n_estimators':'int'})

In [675]:
Model_Evaluation = Model_Evaluation.astype({'objective':'str'})

In [676]:
Model_Evaluation = Model_Evaluation.astype({'booster':'str'})

In [677]:
Model_Evaluation[(Model_Evaluation['n_estimators']==110) & (Model_Evaluation['objective']=='reg:absoluteerror') & (Model_Evaluation['booster']=='gbtree')]

Unnamed: 0,S.NO,objective,n_estimators,booster,learning_rate,R2_Value
730,731,reg:absoluteerror,110,gbtree,0.0,-0.0897089911989121
731,732,reg:absoluteerror,110,gbtree,0.2,0.8793354330425139
732,733,reg:absoluteerror,110,gbtree,0.4,0.8541207524673536
733,734,reg:absoluteerror,110,gbtree,0.6000000000000001,0.8193998722333479
734,735,reg:absoluteerror,110,gbtree,0.8,0.7657050083047625
735,736,reg:absoluteerror,110,gbtree,1.0,0.7544641201870482
736,737,reg:absoluteerror,110,gbtree,1.2000000000000002,0.6683898885522588
737,738,reg:absoluteerror,110,gbtree,1.4,0.334353510527959
738,739,reg:absoluteerror,110,gbtree,1.6,0.5048463167476095
739,740,reg:absoluteerror,110,gbtree,1.8,0.5193332388121441


In [678]:
# this combination from the above evaluation rsults has the greatest R2_Score
High_Performance

Unnamed: 0,S.NO,objective,n_estimators,booster,learning_rate,R2_score
0,732,reg:absoluteerror,110,gbtree,0.2,0.8793354330425139


In [659]:
Model_Evaluation

Unnamed: 0,S.NO,objective,n_estimators,booster,learning_rate,R2_Value
0,1,reg:squarederror,50,gblinear,0.0,-0.0005974189425133059
1,2,reg:squarederror,50,gblinear,0.2,0.7612116082840978
2,3,reg:squarederror,50,gblinear,0.4,0.7612112651483904
3,4,reg:squarederror,50,gblinear,0.6000000000000001,0.7612112652291102
4,5,reg:squarederror,50,gblinear,0.8,0.761211265273674
...,...,...,...,...,...,...
895,896,reg:absoluteerror,190,gbtree,1.0,0.5612106287790937
896,897,reg:absoluteerror,190,gbtree,1.2000000000000002,0.6692956378090901
897,898,reg:absoluteerror,190,gbtree,1.4000000000000001,0.4272640442254034
898,899,reg:absoluteerror,190,gbtree,1.6,0.3885906106309477


In [679]:
# As the Model_Evaluation from abovve output has lot of rows so we are saving the dataframe to csv file for verifying all the combinations 
Model_Evaluation.to_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\XGBoost_Evaluation.csv")

In [683]:
XGBoost_Regressor_Model = XGBRegressor(objective='reg:absoluteerror',booster='gbtree',n_estimators=110,learning_rate=0.2)
XGBoost_Regressor_Model.fit(X_Train,Y_Train)

In [686]:
Predicted_Y = AdaBoost_Regressor_Model.predict(X_Test)
R2_score = r2_score(Y_Test,Predicted_Y)

In [687]:
R2_score

0.8545024319300922

In [688]:
# Save the model using pickle
pickle.dump(AdaBoost_Regressor_Model,open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models\XGBoost_Regressor_Model.sav","wb"))