In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest


In [69]:
def select_kbest(indep_x,dep_y,n):
    test=SelectKBest(score_func=chi2,k=n)
    fit1=test.fit(indep_x,dep_y)
    select_k=fit1.transform(indep_x)
    return select_k

def split_scaler(indep_x,dep_y):
    x_train,x_test,y_train,y_test = train_test_split(indep_x,dep_y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def r2_prediction(regressor,x_test,y_test):
    y_pred=regressor.predict(x_test)
    from sklearn.metrics import r2_score
    r2score=r2_score(y_test,y_pred)
    return r2score
    
def linear(x_train,y_train,x_test,y_test):
    from sklearn.linear_model import LinearRegression
    regressor=LinearRegression()
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def svm_l(x_train,y_train,x_test,y_test):
    from sklearn.svm import SVR
    regressor=SVR(kernel='linear')
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def svm_nl(x_train,y_train,x_test,y_test):
    from sklearn.svm import SVR
    regressor=SVR(kernel='rbf')
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def DTree(x_train,y_train,x_test,y_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor=DecisionTreeRegressor(random_state=0)
    regressor.fit(x_train,y_train)
    r2score=r2_prediction(regressor,x_test,y_test)
    return r2score

def RForest(indep_x,dep_y):  

    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestRegressor

    param_grid={'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
             'n_estimators': [10,100,25,50],
              'max_features':['sqrt','log2']}

    grid=GridSearchCV(RandomForestRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
    grid.fit(indep_x,dep_y)
    rscore=grid.best_score_
    return rscore
    
    #from sklearn.ensemble import RandomForestRegressor
    #regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
   # regressor.fit(x_train, y_train)
    #r2score=r2_prediction(regressor,x_test,y_test)
    #return  r2score

def selectK_table(acclin,accsvm_l,accsvm_nl,accDTree,accRForest):
    dataframe=pd.DataFrame(index=['chi_square'],columns=['linear','svm_l','svm_nl','DTree','RForest'])
    for number,idex in enumerate(dataframe.index):
        dataframe['linear'][idex]=acclin[number]
        dataframe['svm_l'][idex]=accsvm_l[number]
        dataframe['svm_nl'][idex]=accsvm_nl[number]
        dataframe['DTree'][idex]=accDTree[number]
        dataframe['RForest'][idex]=accRForest[number]
        return dataframe

In [71]:
df1=pd.get_dummies(df1, drop_first=True)

In [72]:
df1

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [73]:
indep_x=df1.drop('classification_yes',axis=1)
dep_y=df1['classification_yes']

In [74]:
select_k=select_kbest(indep_x,dep_y,8)
r2lin=[]
r2svm_l=[]
r2svm_nl=[]
r2DTree=[]
r2RForest=[]

In [75]:
x_train,x_test,y_train,y_test = split_scaler(select_k,dep_y)

r2linear=linear(x_train,y_train,x_test,y_test)
r2lin.append(r2linear)

r2svm_lin=svm_l(x_train,y_train,x_test,y_test)
r2svm_l.append(r2svm_lin)

r2svm_nonlin=svm_nl(x_train,y_train,x_test,y_test)
r2svm_nl.append(r2svm_nonlin)

r2DecisionTree=DTree(x_train,y_train,x_test,y_test)
r2DTree.append(r2DecisionTree)

r2RandomForest=RForest(indep_x,dep_y)
r2RForest.append(r2RandomForest)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [76]:
result=selectK_table(r2lin,r2svm_l,r2svm_nl,r2DTree,r2RForest)

In [40]:
result
#3

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.287968,0.255063,0.3335,0.262153,0.528212


In [44]:
result
#4

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.304963,0.256858,0.430795,0.479167,0.599392


In [48]:
result
#6

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.599041,0.586446,0.838962,0.869792,0.897569


In [52]:
result
#7

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.657035,0.641906,0.893007,0.826389,0.916233


In [77]:
result
#8 integrated gridSearchCV in Random Forest

Unnamed: 0,linear,svm_l,svm_nl,DTree,RForest
chi_square,0.646457,0.612199,0.891274,0.869792,0.944554
