In [134]:
# import all necessary libraries used for SVM regressor model creation
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import pickle

In [135]:
# Loading the data set using pandas 

Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\DataSet\insurance_pre.csv")

In [136]:
Data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [137]:
# We can see the shape of the data fro  below code

Data.shape

(1338, 6)

In [138]:
# From above output we got to know that there are 1338 rows and 6 columns on our data set
# There are four numerical columns and two category columns

In [139]:
# Lets convert the category columns into numeric columns by one-hot-encoding using pandas get_dummies method

Data = pd.get_dummies(Data,drop_first=True)

In [140]:
Data.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,1
1,18,33.77,1,1725.5523,1,0
2,28,33.0,3,4449.462,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.88,0,3866.8552,1,0


In [141]:
Data.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [142]:
# Lets re-arrange the column values for better visualization
Data = Data.loc[:,['age', 'bmi', 'children','sex_male', 'smoker_yes','charges']]

In [143]:
Data.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges
0,19,27.9,0,0,1,16884.924
1,18,33.77,1,1,0,1725.5523
2,28,33.0,3,1,0,4449.462
3,33,22.705,0,1,0,21984.47061
4,32,28.88,0,1,0,3866.8552


In [144]:
# Lets check if data has any NA values in it 

Data[Data.isna().any(axis=1)]

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges


In [145]:
# Lets check if data has any Null values in it 
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges


In [146]:
# Lets divide the data into dependent/output and independent column/input columns of the data

dependent = Data[["charges"]]
independent = Data[['age', 'bmi', 'children','sex_male', 'smoker_yes']]

## We can observe that all the columns are in different measuring scale so we can standardize the data to convert tem into same scale

In [147]:
Std = StandardScaler()

In [148]:
independent = Std.fit_transform(independent)

In [149]:
independent

array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [150]:
# convert independent arrays again into data frame for understanding
Data_independent = pd.DataFrame(independent,columns=['age', 'bmi', 'children','sex_male', 'smoker_yes'])
Data_independent['charges']=dependent

In [151]:
Data_independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,charges
0,-1.438764,-0.45332,-0.908614,-1.010519,1.970587,16884.924
1,-1.509965,0.509621,-0.078767,0.989591,-0.507463,1725.5523
2,-0.797954,0.383307,1.580926,0.989591,-0.507463,4449.462
3,-0.441948,-1.305531,-0.908614,0.989591,-0.507463,21984.47061
4,-0.513149,-0.292556,-0.908614,0.989591,-0.507463,3866.8552


In [152]:
# Lets spli the data into traing and test data set for developing and testing the models

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=0.30)


In [153]:
X_Train

array([[-1.50996545, -0.40164599, -0.90861367, -1.0105187 , -0.5074631 ],
       [-0.01474046,  0.35049795, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 0.91087502,  2.63891845,  3.24061871, -1.0105187 , -0.5074631 ],
       ...,
       [ 0.05646073, -0.91592544, -0.90861367,  0.98959079, -0.5074631 ],
       [-1.43876426,  0.79833938, -0.90861367,  0.98959079, -0.5074631 ],
       [-0.4419476 , -1.99533811, -0.07876719, -1.0105187 , -0.5074631 ]])

In [154]:
Y_Train.head()

Unnamed: 0,charges
1163,2200.83085
196,5649.715
438,12592.5345
183,7419.4779
1298,5261.46945


In [155]:
X_Train.shape

(936, 5)

In [156]:
X_Test.shape

(402, 5)

In [157]:
# Lets create and Train the model using Training X_Train and Y_Train data sets

SVM_Regression_Model = SVR()
SVM_Regression_Model.fit(X_Train,Y_Train)

In [158]:
#Lets Test the model using our X_Test data

Predicted_Y = SVM_Regression_Model.predict(X_Test)

In [159]:
# Evaluate the r2_score for Y actual and Y predicted by the model
R2_Score = r2_score(Y_Test,Predicted_Y)

In [160]:
R2_Score


-0.08343585041352841

# Lets Create the Hyper-parameter search table using the below code for above SVM regressor Model

In [161]:
Kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C_Values = [1.0,10,100,1000,5000,10000,0.1,0.01,0.001,0.0001]
Columns=['S.NO','C-Values',"Kernel:linear","Kernel:poly","Kernel:rbf","Kernel:sigmoid"]

In [162]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)
Final_Series =[]
S_No = 1
Greater_R2_Score = 0
High_Performance_Comb = []
for val in C_Values:
    dummy_Series=[]
    dummy_Series.append(S_No)
    dummy_Series.append(str(val))
    for k in Kernel:
        SVM_Regressor_Model = SVR()
        SVM_Regressor_Model = SVR(kernel=k,C=val)
        SVM_Regressor_Model.fit(X_Train,Y_Train)
        Predicted_Y = SVM_Regressor_Model.predict(X_Test)
        R2_score = r2_score(Y_Test,Predicted_Y)
        dummy_Series.append(R2_score)
        if R2_score > Greater_R2_Score:
            R2_Series = []
            Greater_R2_Score=R2_score
            High_Performance_Col =['S.NO','C-value',"Kernel:{}".format(k)]
            R2_Series.append(S_No)
            R2_Series.append(str(val))
            R2_Series.append(Greater_R2_Score)
            High_Performance_Comb=np.array(R2_Series)
            print(High_Performance_Comb)
        else:
            pass
    #print(dummy_Series)
    Final_Series.append(dummy_Series)
    #dummy_Series = pd.Series(dummy_Series)
    #Data_Frame.append(dummy_Series.set_axis(Data_Frame.columns).to_frame().T)
    S_No = S_No+1
Model_Evaluation = pd.DataFrame(np.array(Final_Series),columns=Columns)
Model_Evaluation = Model_Evaluation.astype({'S.NO':'int'})
High_Performance = pd.DataFrame([High_Performance_Comb],columns=High_Performance_Col)
High_Performance = High_Performance.astype({'S.NO':'int'})
Model_Evaluation

['2' '10' '0.45958029276441337']
['3' '100' '0.6276501359169483']
['4' '1000' '0.764931290606295']
['4' '1000' '0.854420696873579']
['5' '5000' '0.8579024939516943']
['5' '5000' '0.8748747155328088']
['6' '10000' '0.8779176067658443']


Unnamed: 0,S.NO,C-Values,Kernel:linear,Kernel:poly,Kernel:rbf,Kernel:sigmoid
0,1,1.0,-0.0105434077751973,-0.0760954953181871,-0.0834358504135284,-0.0755368223394976
1,2,10.0,0.4595802927644133,0.0351378718862509,-0.0327793744017828,0.0382329381159491
2,3,100.0,0.6276501359169483,0.6139950817219111,0.3173551636336614,0.5219161138070013
3,4,1000.0,0.764931290606295,0.854420696873579,0.8104257702399931,0.2224854343743912
4,5,5000.0,0.7414179787462394,0.8579024939516943,0.8748747155328088,-7.022207349410854
5,6,10000.0,0.7414227747333042,0.8577398907005558,0.8779176067658443,-35.2702182609943
6,7,0.1,-0.0809980247453892,-0.0883422449663384,-0.0890798963893113,-0.0882807301539887
7,8,0.01,-0.0888351685785777,-0.0895722745624476,-0.0896460758638788,-0.0895660991562652
8,9,0.001,-0.0896215936303668,-0.089695331068371,-0.0897027115601054,-0.089694713288503
9,10,0.0001,-0.0897002632422308,-0.0897076372544267,-0.0897083753072158,-0.0897075754740472


In [163]:
# This is the best parameter combination that has greatest r2_score from the above table
High_Performance

Unnamed: 0,S.NO,C-value,Kernel:rbf
0,6,10000,0.8779176067658443


In [164]:
# Lets save the model with high performance
SVM_Regressor_Model = SVR()
SVM_Regressor_Model = SVR(kernel='rbf',C=10000)
SVM_Regressor_Model.fit(X_Train,Y_Train)
Predicted_Y = SVM_Regressor_Model.predict(X_Test)
R2_score = r2_score(Y_Test,Predicted_Y)

In [165]:
R2_score

0.8779176067658443

In [92]:
# Saving the model using the pickle

pickle.dump(SVM_Regressor_Model,open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models/SVM_Regressor_Model_Final.sav",'wb'))

In [93]:
# Load the model and test for a single input randomly
SVM_Regressor_Model = pickle.load(open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\ML_Regression_Assignment_Insurance_Data\Final Models/SVM_Regressor_Model_Final.sav",'rb'))

In [96]:
SVM_Regressor_Model.predict([[-1.50996545, -0.40164599, -0.90861367, -1.0105187 , -0.5074631 ]])

array([2200.73071273])