# Fisher Score - Chisquare Test For Feature Selection
# From sklearn page- sklearn.feature_selection.chi2

In [1]:
# we will be taking titanic dataset (It consists of categorical value)
import seaborn as sns
df=sns.load_dataset('titanic')

In [2]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [3]:
# we will be using the categorical features such as 'pclass', 'sex' etc.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [4]:
# Categorical features important [ 'sex','embarked','alone','pclass','Survived']
df=df[['sex','embarked','alone','pclass','survived']]

In [5]:
df.head() # evalute the categorical features based on the output features

Unnamed: 0,sex,embarked,alone,pclass,survived
0,male,S,False,3,0
1,female,C,False,1,1
2,female,S,True,3,1
3,female,S,False,1,1
4,male,S,True,3,0


In [6]:
# Before applying chisquare test , We have to do label Encoding in each and every features
# Label Encoding means assigning labels i.e. 0,1,2 ( for 3 categories)

# Label Encoding on sex column
import numpy as np
df['sex']=np.where(df['sex']=="male",1,0) # In Sex where ever there is male, there will be 1

# Label Encoding on embarked columnn ( we have more than 2 categories)
ordinal_label = { k:i for i, k in enumerate(df['embarked'].unique(),0)} # loop for checking the value imembarked column
df['embarked']=df['embarked'].map(ordinal_label) # whereever there will 'S', I will be replaced by 0 , similarly all others
# ordinal_label - how many number of unique labels are there


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex']=np.where(df['sex']=="male",1,0) # In Sex where ever there is male, there will be 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embarked']=df['embarked'].map(ordinal_label) # whereever there will 'S', I will be replaced by 0 , similarly all others


In [7]:
ordinal_label # 4 categories

{'S': 0, 'C': 1, 'Q': 2, nan: 3}

In [8]:
df

Unnamed: 0,sex,embarked,alone,pclass,survived
0,1,0,False,3,0
1,0,1,False,1,1
2,0,0,True,3,1
3,0,0,False,1,1
4,1,0,True,3,0
...,...,...,...,...,...
886,1,0,True,2,0
887,0,0,True,1,1
888,0,0,False,3,0
889,1,1,True,1,1


In [11]:
# Label Encoding on alone column (boolean value)
df['alone']=np.where(df['alone']==True,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alone']=np.where(df['alone']==True,1,0)


In [12]:
df

Unnamed: 0,sex,embarked,alone,pclass,survived
0,1,0,0,3,0
1,0,1,0,1,1
2,0,0,1,3,1
3,0,0,0,1,1
4,1,0,1,3,0
...,...,...,...,...,...
886,1,0,1,2,0
887,0,0,1,1,1
888,0,0,0,3,0
889,1,1,1,1,1


In [13]:
# Train Test Split is usually done to avoid overfitting
#train test Split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(
    df.drop('survived',axis=1),     #independet features
    df['survived'],                   #dependent features
    test_size=0.3,
    random_state=0)  
x_train.shape, x_test.shape

((623, 4), (268, 4))

In [14]:
x_train

Unnamed: 0,sex,embarked,alone,pclass
857,1,0,1,1
52,0,1,0,1
386,1,0,0,3
124,1,0,0,1
578,0,1,0,3
...,...,...,...,...
835,0,1,0,1
192,0,0,0,3
629,1,2,1,3
559,0,0,0,3


In [15]:
# check null value 
x_train.isnull().sum()

sex         0
embarked    0
alone       0
pclass      0
dtype: int64

In [16]:
x_train['sex'].unique()

array([1, 0])

In [18]:
# Perform chi2 test
from sklearn.feature_selection import chi2
chi2(x_train,y_train)
f_p_values=chi2(x_train,y_train)
# It returns 2 values (F score and  P value )
# F score - the more the higher the f score value is, the more important the feature  is
# P value - the lesser the p value ,the more important that particular features is 

In [19]:
f_p_values

(array([63.55447864, 11.83961845,  9.03328564, 21.61080949]),
 array([1.55992554e-15, 5.79837058e-04, 2.65107556e-03, 3.33964360e-06]))

In [20]:
# convert the value in series( sort the values)
import pandas as pd
p_values=pd.Series(f_p_values[1])
p_values.index=x_train.columns
p_values

sex         1.559926e-15
embarked    5.798371e-04
alone       2.651076e-03
pclass      3.339644e-06
dtype: float64

In [21]:
p_values.sort_index(ascending=False) 
# smaller to larger

sex         1.559926e-15
pclass      3.339644e-06
embarked    5.798371e-04
alone       1.986623e-04
dtype: float64

In [22]:
### observation
# in titanic movie , mostly male died 
# Sex column is the most important features when comparedto output features "Survived"