In [156]:
#importing Libraries
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler      
from sklearn.metrics import r2_score

In [16]:
#reading the dataset file
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [18]:
# Preprocessing Step Converting the raw data to required format (converting the categorical data to number)
dataset = pd.get_dummies(dataset,dtype = int,drop_first = True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [20]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [22]:
#Splitting input and output values from the dataset
independent = dataset[["age","bmi","children","sex_male","smoker_yes"]]
dependent = dataset[["charges"]]

In [24]:
#Splitting the Dataset into Training dataset and Test Dataset
x_train,x_test,y_train,y_test = train_test_split(independent,dependent,test_size = 0.3,random_state = 0)

In [285]:
# Procedure for Different Regression Algorithms...

def MLR():
    mlr_regressor = LinearRegression()
    mlr_regressor.fit(x_train,y_train)     
    Models["MLR"] = get_output_as_dict(mlr_regressor,get_r2_score(mlr_regressor)) 

def SVR_MODEL():
    best_svr_model = ""  
    best_r2score = 0
    sc = StandardScaler()
    svr_train = sc.fit_transform(x_train)
    svr_test  = sc.transform(x_test)
    
    #Hyper Tuning Parameters
    svr_kernal = ["linear","rbf","poly","sigmoid"]
    svr_tuner_c = ["10","100","500","1000","2000","3000"]
    svr_model = {}
   
    for kernal_type in svr_kernal:
        for tuner_c in svr_tuner_c:             
            temp_svr_model = {
                "model" : "",
                "r2_score" : 0
            }
          
            regressor = SVR(kernel = kernal_type, C= float(tuner_c))
            regressor.fit(svr_train,y_train)    
            
            temp_svr_model["model"] = regressor
            temp_svr_model["r2_score"] = get_r2_score(regressor,svr_test)
            svr_model["SVR_" + kernal_type+"_"+tuner_c] = temp_svr_model.copy()   
 
    for svm_key in svr_model.keys():
        print(f"{svm_key} : {svr_model[svm_key]["r2_score"]}")
        if svr_model[svm_key]["r2_score"] > best_r2score:
            best_r2score = svr_model[svm_key]["r2_score"]
            best_svr_model = svm_key 
            
    Models[best_svr_model] = get_output_as_dict(svr_model[best_svr_model]["model"],best_r2score)   

#Decision Tree
def DT():
    best_dt_model = ""  
    best_dt_r2score = 0
    dt_criterion = ["squared_error","absolute_error","friedman_mse","poisson"] 
    dt_mx_features = [None,"sqrt","log2"]
    dt_splitter = ["best","random"]
    dt_model = {}

    for criterion_type in dt_criterion:
        for mx_feature in dt_mx_features:
            for splitter_type in dt_splitter:        
                temp_dt_model = {
                    "model" : "",
                    "r2_score" : 0
                } 
                regressor = DecisionTreeRegressor(criterion = criterion_type, splitter = splitter_type,max_features = mx_feature )
                regressor.fit(x_train,y_train)      
                
                temp_dt_model["model"] = regressor
                temp_dt_model["r2_score"] = get_r2_score(regressor)
                
                if mx_feature == None:
                    dt_model["DT_" +criterion_type+"_None"+"_"+splitter_type] = temp_dt_model.copy()
                else:
                    dt_model["DT_" +criterion_type+"_"+mx_feature+"_"+splitter_type] = temp_dt_model.copy()
     
    for dt_key in dt_model.keys():
        print(f"{dt_key} : {dt_model[dt_key]["r2_score"]}")
        if dt_model[dt_key]["r2_score"] > best_dt_r2score:
            best_dt_r2score = dt_model[dt_key]["r2_score"]
            best_dt_model = dt_key
    
    Models[best_dt_model] = get_output_as_dict(dt_model[best_dt_model]["model"],best_dt_r2score)         


#Random Forest
def RF():
    best_rf_model = ""  
    best_rf_r2score = 0
    rf_criterion = ["squared_error","absolute_error","friedman_mse","poisson"] 
    rf_mx_features = [None,"sqrt","log2"]
    rf_n_estimators = [10,100]
    rf_model = {}

    for criterion_type in rf_criterion:
        for mx_feature in rf_mx_features:                 
            for n_estimator in rf_n_estimators:                  
                temp_rf_model = {
                    "model" : "",
                    "r2_score" : 0
                }                 
                regressor = RandomForestRegressor(criterion = criterion_type, n_estimators = n_estimator,max_features = mx_feature,random_state = 0)
                regressor.fit(x_train,y_train)   
                
                temp_rf_model["model"] = regressor
                temp_rf_model["r2_score"] = get_r2_score(regressor)
                
                if mx_feature == None:
                    rf_model["RF_" +criterion_type+"_None"+"_"+str(n_estimator)] = temp_rf_model.copy()
                else:
                    rf_model["RF_" +criterion_type+"_"+mx_feature+"_"+str(n_estimator)] = temp_rf_model.copy()
                    
    for rf_key in rf_model.keys():
        print(f"{rf_key} : {rf_model[rf_key]["r2_score"]}")
        if rf_model[rf_key]["r2_score"] > best_rf_r2score:
            best_rf_r2score = rf_model[rf_key]["r2_score"]
            best_rf_model = rf_key
    
    Models[best_rf_model] = get_output_as_dict(rf_model[best_rf_model]["model"],best_rf_r2score)     
 

#Evaluating the Model using R2 Score Metric
def get_output_as_dict(model,r2score):
    return {
        "model" : model,
        "r2_score" : r2score
    }
    
def get_r2_score(model,x_test = x_test):
    y_pred = model.predict(x_test)
    return round(r2_score(y_test,y_pred),2)
    



In [287]:
# Creating the Model
Models = {}
bestModel = {
    "model" : "",
    "r2_score" : 0
}
print("\nGenerating the Model....")

MLR_Model = MLR()
SVR_Model = SVR_MODEL()
DT_Model = DT()
RF_Model = RF()


# print(Models)
print("\nFinal Models...\n") 
for key,value in Models.items():               
    print(f"{key} : {value["r2_score"]}")
    if value["r2_score"] > bestModel["r2_score"]:
        bestModel["model"] = key
        bestModel["r2_score"] = value["r2_score"]

print(f"\nBest Model => {bestModel["model"]} : {bestModel["r2_score"]}  ")


 




Generating the Model....


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVR_linear_10 : 0.46
SVR_linear_100 : 0.63
SVR_linear_500 : 0.76
SVR_linear_1000 : 0.76
SVR_linear_2000 : 0.74
SVR_linear_3000 : 0.74
SVR_rbf_10 : -0.03
SVR_rbf_100 : 0.32
SVR_rbf_500 : 0.66
SVR_rbf_1000 : 0.81
SVR_rbf_2000 : 0.85
SVR_rbf_3000 : 0.87
SVR_poly_10 : 0.04
SVR_poly_100 : 0.62
SVR_poly_500 : 0.83
SVR_poly_1000 : 0.86
SVR_poly_2000 : 0.86
SVR_poly_3000 : 0.86
SVR_sigmoid_10 : 0.04
SVR_sigmoid_100 : 0.53
SVR_sigmoid_500 : 0.44
SVR_sigmoid_1000 : 0.29
SVR_sigmoid_2000 : -0.59
SVR_sigmoid_3000 : -2.12
DT_squared_error_None_best : 0.69
DT_squared_error_None_random : 0.67
DT_squared_error_sqrt_best : 0.73
DT_squared_error_sqrt_random : 0.68
DT_squared_error_log2_best : 0.7
DT_squared_error_log2_random : 0.6
DT_absolute_error_None_best : 0.67
DT_absolute_error_None_random : 0.71
DT_absolute_error_sqrt_best : 0.73
DT_absolute_error_sqrt_random : 0.74
DT_absolute_error_log2_best : 0.64
DT_absolute_error_log2_random : 0.71
DT_friedman_mse_None_best : 0.69
DT_friedman_mse_None_random 

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

RF_squared_error_None_10 : 0.83
RF_squared_error_None_100 : 0.85
RF_squared_error_sqrt_10 : 0.85
RF_squared_error_sqrt_100 : 0.87
RF_squared_error_log2_10 : 0.85
RF_squared_error_log2_100 : 0.87
RF_absolute_error_None_10 : 0.84
RF_absolute_error_None_100 : 0.85
RF_absolute_error_sqrt_10 : 0.86
RF_absolute_error_sqrt_100 : 0.87
RF_absolute_error_log2_10 : 0.86
RF_absolute_error_log2_100 : 0.87
RF_friedman_mse_None_10 : 0.83
RF_friedman_mse_None_100 : 0.85
RF_friedman_mse_sqrt_10 : 0.85
RF_friedman_mse_sqrt_100 : 0.87
RF_friedman_mse_log2_10 : 0.85
RF_friedman_mse_log2_100 : 0.87
RF_poisson_None_10 : 0.83
RF_poisson_None_100 : 0.85
RF_poisson_sqrt_10 : 0.85
RF_poisson_sqrt_100 : 0.87
RF_poisson_log2_10 : 0.85
RF_poisson_log2_100 : 0.87

Final Models...

MLR : 0.79
SVR_rbf_3000 : 0.87
DT_poisson_sqrt_best : 0.77
RF_squared_error_sqrt_100 : 0.87

Best Model => SVR_rbf_3000 : 0.87  


In [291]:

print("\nSaving the Model....")
pickle.dump(Models[bestModel["model"]]["model"],open("InsuranceChargePredictor.sav","wb"))


Saving the Model....
