<a href="https://colab.research.google.com/github/Santanukolkata/Data_Science/blob/master/Stats/Select_KBEST_Using_Chi2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Import the necessary libraries first
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [12]:
# The dataset has 8 features and a binary outcome: 1 means the observation 
# has diabetes, and 0 means the observation does not have diabetes. 

# The dataset is known to have missing values. 

# Specifically, there are missing observations for some columns that are marked as a zero value. 

# You can deduce this from the definition of those columns: a zero value is 
# not physically possible for those measures, e.g., zero for body mass index 
# or blood pressure is invalid.

# Directly use the preprocessed version of the dataset.

In [2]:
# Load the Pima Indians diabetes data straight from its public URL.
# Column labels are supplied explicitly through `names`.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age',  # predictors
    'class',                                                        # target (1 / 0)
]
dataframe = pd.read_csv(url, names=names)

# Preview the first rows.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Split the table into the feature matrix X (first eight columns) and the
# label vector Y (ninth column, 'class').
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

In [5]:
# Score every feature against the label with a chi-squared test (which
# requires non-negative feature values) and keep the k best-scoring ones.
no_of_features_to_extract = 4

test = SelectKBest(
    score_func=chi2,
    k=no_of_features_to_extract,
)
# fit() returns the selector itself, so `fit` and `test` are the same object.
fit = test.fit(X, Y)

In [6]:
# Summarize scores
# Limit NumPy's printed float precision to three decimals. This is a global
# setting, so it also affects arrays displayed by later cells.
np.set_printoptions(precision=3)
# One chi-squared score per column of X, in column order.
print(fit.scores_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [7]:
# Reduce X to the columns chosen by the fitted selector.
features = fit.transform(X)

# Preview the first five rows of the reduced matrix.
print(features[:5])

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [8]:
# Display the column labels: the eight predictors followed by the 'class' target.
dataframe.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [9]:
# Pair each predictor name with its chi-squared score.
# - The last column ('class') is the target and has no score, so it is dropped
#   explicitly instead of relying on zip() silently truncating the longer input.
# - The pairs are materialised in a list: a bare zip object is a one-shot
#   iterator, so a cell that consumes it would print nothing when re-run.
zipped = list(zip(dataframe.columns[:-1], fit.scores_))

In [10]:
# Display the raw score array held by the fitted selector.
fit.scores_

array([ 111.52 , 1411.887,   17.605,   53.108, 2175.565,  127.669,
          5.393,  181.304])

In [11]:
# Rank the (name, score) pairs from highest to lowest score, then print the
# names of the top `no_of_features_to_extract` features.
ranked = sorted(zipped, key=lambda pair: pair[1], reverse=True)
for name, _score in ranked[:no_of_features_to_extract]:
    print(name)

test
plas
age
mass
