## Fisher Score - Chisquare test for Feature Selection
* Compute chi-squared stats between each non-negative feature(input) and class(output).
* This score is used to evaluate **categorical variable** in a classification task.

In [79]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [80]:
# df = pd.read_csv('titanic.csv')
# df.head()

In [81]:
df = sns.load_dataset('titanic')

In [82]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [83]:
## ['sex','embarked','alone','pclass','Survived']  
df = df[['sex','embarked','alone','pclass','survived']] 

In [84]:
df.head()

Unnamed: 0,sex,embarked,alone,pclass,survived
0,male,S,False,3,0
1,female,C,False,1,1
2,female,S,True,3,1
3,female,S,False,1,1
4,male,S,True,3,0


* Before applying chi square test we have to perform label encoding on categorical features

In [85]:
## label encoding
def label(df, variable):
    df[variable] = np.where(df[variable]=='male',1,0)

In [86]:
label(df,'sex')

In [87]:
df.head()

Unnamed: 0,sex,embarked,alone,pclass,survived
0,1,S,False,3,0
1,0,C,False,1,1
2,0,S,True,3,1
3,0,S,False,1,1
4,1,S,True,3,0


In [88]:
ordinal_label = {k : i for i,k in enumerate(df['embarked'].unique(), 0)}

In [89]:
ordinal_label

{'S': 0, 'C': 1, 'Q': 2, nan: 3}

In [90]:
df['embarked'] = df['embarked'].map(ordinal_label)

In [91]:
df.head()

Unnamed: 0,sex,embarked,alone,pclass,survived
0,1,0,False,3,0
1,0,1,False,1,1
2,0,0,True,3,1
3,0,0,False,1,1
4,1,0,True,3,0


In [92]:
def label_enc(df, variable):
    df[variable] = np.where(df[variable]==False,1,0)

In [93]:
label_enc(df, 'alone')

In [94]:
df.head()

Unnamed: 0,sex,embarked,alone,pclass,survived
0,1,0,1,3,0
1,0,1,1,1,1
2,0,0,0,3,1
3,0,0,1,1,1
4,1,0,0,3,0


In [95]:
df.alone.unique()

array([1, 0])

train test split to avoid overfitting

In [96]:
X = df.drop(['survived'],axis = 1)
y = df['survived']

In [97]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=0)

In [98]:
X_train

Unnamed: 0,sex,embarked,alone,pclass
857,1,0,0,1
52,0,1,1,1
386,1,0,1,3
124,1,0,1,1
578,0,1,1,3
...,...,...,...,...
835,0,1,1,1
192,0,0,1,3
629,1,2,0,3
559,0,0,1,3


### Chi Square Test

In [99]:
# chi square returns 2 values - F Score and p value
from sklearn.feature_selection import chi2
f_p_val = chi2(X_train, y_train)


1. first values are f score, more high the value more important the feature is
2. and below are p value, , more less the p value more imp that feature is

In [None]:
f_p_val


(array([63.55447864, 11.83961845, 13.84369385, 21.61080949]),
 array([1.55992554e-15, 5.79837058e-04, 1.98662308e-04, 3.33964360e-06]))

In [101]:
p_val = pd.Series(f_p_val[1])
p_val

0    1.559926e-15
1    5.798371e-04
2    1.986623e-04
3    3.339644e-06
dtype: float64

In [102]:
p_val.index = X_train.columns
p_val.index

Index(['sex', 'embarked', 'alone', 'pclass'], dtype='object')

In [103]:
p_val

sex         1.559926e-15
embarked    5.798371e-04
alone       1.986623e-04
pclass      3.339644e-06
dtype: float64

In [105]:
p_val.sort_index(ascending=False)

sex         1.559926e-15
pclass      3.339644e-06
embarked    5.798371e-04
alone       1.986623e-04
dtype: float64

1. Sex column is the most important column when compared to the output feature survived