The Fisher score selects each feature independently according to its score under the Fisher criterion, which can lead to a suboptimal subset of features.

The chi-square test measures dependence between stochastic variables (확률변수), so it can be used to weed out the features that are most likely to be independent of the class and therefore irrelevant for classification.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile

In [3]:
# Load the Titanic example dataset that seaborn exposes by name; the result is a pandas DataFrame.
# NOTE(review): load_dataset pulls from seaborn's online example-data repository when the
# file is not already cached locally, so a first run needs network access — confirm for offline use.
titanic=sns.load_dataset('titanic')

In [4]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [5]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
# Drop the two columns with heavy missingness (age: 177 nulls, deck: 688 nulls)
# instead of imputing them. Reassigning the result (no inplace=True) together
# with errors='ignore' keeps this cell safe to execute again on a frame that
# has already been trimmed; the original in-place call raised KeyError on re-run.
titanic = titanic.drop(columns=['age', 'deck'], errors='ignore')

In [7]:
# Discard every row that still contains a missing value; at this point that is
# only the 2 passengers without embarked / embark_town information.
titanic = titanic.dropna(axis=0, how='any')

In [8]:
titanic.isnull().sum()

survived       0
pclass         0
sex            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [9]:
titanic

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [34]:
# Candidate predictors for the chi-square ranking, taken as an independent copy
# so later encoding does not write back into `titanic`.
data = titanic.loc[
    :, ['pclass', 'sex', 'sibsp', 'parch', 'embarked', 'who', 'alone']
].copy()

In [35]:
data.head()

Unnamed: 0,pclass,sex,sibsp,parch,embarked,who,alone
0,3,male,1,0,S,man,False
1,1,female,1,0,C,woman,False
2,3,female,0,0,S,woman,True
3,1,female,1,0,S,woman,False
4,3,male,0,0,S,man,True


In [36]:
# Encode the string / boolean categories as small non-negative integers so that
# sklearn's chi2 (which rejects negative or non-numeric input) can consume them.
#
# Every column is mapped from the untouched `titanic` frame rather than from
# `data` itself. The previous `data[col] = data[col].map(...)` form silently
# turned the whole column into NaN when the cell was executed a second time,
# because the already-encoded integers are not keys of the mapping dicts.
#
# NOTE(review): the integer codes for `embarked` and `who` impose an arbitrary
# magnitude on nominal categories and the chi2 statistic depends on that choice;
# one-hot encoding would avoid it — confirm whether the ordinal codes are intended.
sex={'male':0,'female':1}
data['sex']=titanic['sex'].map(sex)

embark={'S':0,'C':1,'Q':2}
data['embarked']=titanic['embarked'].map(embark)

who={'man':0,'woman':1,'child':2}
data['who']=titanic['who'].map(who)

alone={True:1,False:0}
data['alone']=titanic['alone'].map(alone)

# Fail loudly if any category was missing from its mapping (map() yields NaN for it).
assert data.notna().all().all(), 'unmapped category produced NaN during encoding'

In [37]:
data.head()

Unnamed: 0,pclass,sex,sibsp,parch,embarked,who,alone
0,3,0,1,0,0,0,0
1,1,1,1,0,1,1,0
2,3,1,0,0,0,1,1
3,1,1,1,0,0,1,0
4,3,0,0,0,0,0,1


### Do Chi-square Test

In [38]:
# x: independent copy of the encoded predictors.
# y: the `survived` target column from the cleaned frame.
x, y = data.copy(), titanic['survived']

In [39]:
x.shape , y.shape

((889, 7), (889,))

In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.2
)

In [43]:
# chi2 returns a (statistics, p-values) pair with one entry per feature column,
# computed on the training split only so the test rows do not influence selection.
# The name `f_score` is kept because later cells read it, but the values are
# chi-square statistics, not F-values.
f_score = chi2(X=x_train, y=y_train)

In [44]:
f_score # first array: chi-square statistics, second array: p-values (despite the variable name, these are not F-values)

(array([ 22.65169202, 152.91534343,   0.52934285,  10.35663782,
         16.13255653, 161.42431175,  13.4382363 ]),
 array([1.94189138e-06, 3.99737147e-35, 4.66883271e-01, 1.29009955e-03,
        5.90599986e-05, 5.52664700e-37, 2.46547298e-04]))

In [45]:
# Label each p-value with its feature name and order the result from most to
# least significant (smallest p-value first).
p_values = pd.Series(data=f_score[1], index=x_train.columns).sort_values()

In [48]:
p_values

who         5.526647e-37
sex         3.997371e-35
pclass      1.941891e-06
embarked    5.906000e-05
alone       2.465473e-04
parch       1.290100e-03
sibsp       4.668833e-01
dtype: float64

In [78]:
# Build the nested feature subsets from the chi-square ranking instead of from
# hard-coded column lists, so the subsets keep following the ranking if the data
# or the train/test split changes. With the p-values printed in the previous
# cell this yields exactly the former subsets:
#   top 2 = who, sex; top 3 adds pclass; top 4 adds embarked; top 5 adds alone.
# Sorting again here makes the cell independent of whether `p_values` is
# already ordered.
ranked_features = p_values.sort_values().index.tolist()

x_train_2, x_test_2 = x_train[ranked_features[:2]], x_test[ranked_features[:2]]

x_train_3, x_test_3 = x_train[ranked_features[:3]], x_test[ranked_features[:3]]

x_train_4, x_test_4 = x_train[ranked_features[:4]], x_test[ranked_features[:4]]

x_train_5, x_test_5 = x_train[ranked_features[:5]], x_test[ranked_features[:5]]

In [50]:
def run_randomforest(x_train,x_test,y_train,y_test,n_estimators=100,random_state=0):
    """Fit a random forest on the training split and report its test accuracy.

    Parameters
    ----------
    x_train, x_test : feature matrices used for fitting and for prediction.
    y_train, y_test : target labels matching ``x_train`` and ``x_test``.
    n_estimators : number of trees in the forest (default 100, as before).
    random_state : seed passed to the classifier (default 0, as before).

    Returns
    -------
    The accuracy computed by ``accuracy_score(y_test, y_pred)``. It is still
    printed as well, so existing cells keep their 'Accuracy: ...' line; when
    the call is the last expression of a cell the notebook may additionally
    echo the returned value.
    """
    clf=RandomForestClassifier(n_estimators=n_estimators,random_state=random_state,n_jobs=-1)
    clf.fit(x_train,y_train)
    y_pred=clf.predict(x_test)
    accuracy=accuracy_score(y_test,y_pred)
    print('Accuracy:',accuracy)
    # Returning the score lets callers collect and compare results across
    # feature subsets instead of reading them off stdout.
    return accuracy

In [79]:
%%time
run_randomforest(x_train_2,x_test_2,y_train,y_test)

Accuracy: 0.7191011235955056
Wall time: 251 ms


In [80]:
%%time
run_randomforest(x_train_3,x_test_3,y_train,y_test)

Accuracy: 0.7415730337078652
Wall time: 255 ms


In [81]:
%%time
run_randomforest(x_train_4,x_test_4,y_train,y_test)

Accuracy: 0.7584269662921348
Wall time: 256 ms


In [82]:
%%time
run_randomforest(x_train_5,x_test_5,y_train,y_test)

Accuracy: 0.7528089887640449
Wall time: 253 ms


In [83]:
%%time
run_randomforest(x_train,x_test,y_train,y_test)

Accuracy: 0.7359550561797753
Wall time: 260 ms
