In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest


In [23]:
def select_kbest(indep_x,dep_y,n):
    test=SelectKBest(score_func=f_regression,k=n)
    fit1=test.fit(indep_x,dep_y)
    select_k=fit1.transform(indep_x)
    return select_k

def split_scaler(indep_x,dep_y):
    x_train,x_test,y_train,y_test = train_test_split(indep_x,dep_y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def r2_prediction(regressor,x_test,y_test):
    y_pred=regressor.predict(x_test)
    from sklearn.metrics import r2_score
    r2score=r2_score(y_test,y_pred)
    return r2score
    
def linear(x_train,y_train,x_test,y_test):
    from sklearn.linear_model import LinearRegression
    regressor=LinearRegression()
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def svm_l(x_train,y_train,x_test,y_test):
    from sklearn.svm import SVR
    regressor=SVR(kernel='linear')
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def svm_nl(x_train,y_train,x_test,y_test):
    from sklearn.svm import SVR
    regressor=SVR(kernel='rbf')
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def DTree(x_train,y_train,x_test,y_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor=DecisionTreeRegressor(random_state=0)
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def RForest(indep_x,dep_y):  

    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestRegressor

    param_grid={'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
             'n_estimators': [10,100,25,50],
              'max_features':['sqrt','log2']}

    grid=GridSearchCV(RandomForestRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
    grid.fit(indep_x,dep_y)
    rscore=grid.best_score_
    return rscore
    
    #from sklearn.ensemble import RandomForestRegressor
    #regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
   # regressor.fit(x_train, y_train)
    #r2score=r2_prediction(regressor,x_test,y_test)
    #return  r2score

def selectK_table(acclin,accsvm_l,accsvm_nl,accDTree,accRForest):
    dataframe=pd.DataFrame(index=['chi_square'],columns=['linear','svm_l','svm_nl','DTree','RForest'])
    for number,idex in enumerate(dataframe.index):
        dataframe['linear'][idex]=acclin[number]
        dataframe['svm_l'][idex]=accsvm_l[number]
        dataframe['svm_nl'][idex]=accsvm_nl[number]
        dataframe['DTree'][idex]=accDTree[number]
        dataframe['RForest'][idex]=accRForest[number]
        return dataframe

In [24]:
dataset=pd.read_csv('insurance_pre.csv',index_col=None)
df1=dataset

In [25]:
df1

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [26]:
df1=pd.get_dummies(df1, drop_first=True)

In [27]:
df1

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [28]:
indep_x=df1.drop('charges',axis=1)
dep_y=df1['charges']

In [39]:
select_k=select_kbest(indep_x,dep_y,5)
r2lin=[]
r2svm_l=[]
r2svm_nl=[]
r2DTree=[]
r2RForest=[]

In [40]:
x_train,x_test,y_train,y_test = split_scaler(select_k,dep_y)

r2linear=linear(x_train,y_train,x_test,y_test)
r2lin.append(r2linear)

r2svm_lin=svm_l(x_train,y_train,x_test,y_test)
r2svm_l.append(r2svm_lin)

r2svm_nonlin=svm_nl(x_train,y_train,x_test,y_test)
r2svm_nl.append(r2svm_nonlin)

r2DecisionTree=DTree(x_train,y_train,x_test,y_test)
r2DTree.append(r2DecisionTree)

r2RandomForest=RForest(indep_x,dep_y)
r2RForest.append(r2RandomForest)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [41]:
result=selectK_table(r2lin,r2svm_l,r2svm_nl,r2DTree,r2RForest)

In [32]:
result
#3

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.792433,-0.010114,-0.089312,0.77638,0.835647


In [36]:
result
#4

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.795238,-0.010146,-0.091239,0.730872,0.835038


In [42]:
result
#5

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.795217,-0.011016,-0.092978,0.712093,0.835761
