<p></p><p>A <b>chi-squared test</b>, also written as <b><span class="texhtml"><i>χ</i><sup>2</sup></span> test</b>, is any <a href="/wiki/Statistical_hypothesis_testing" title="Statistical hypothesis testing">statistical hypothesis test</a> where the <a href="/wiki/Sampling_distribution" title="Sampling distribution">sampling distribution</a> of the test statistic is a <a href="/wiki/Chi-squared_distribution" title="Chi-squared distribution">chi-squared distribution</a> when the <a href="/wiki/Null_hypothesis" title="Null hypothesis">null hypothesis</a> is true.  Without other qualification, 'chi-squared test' often is used as short for <a href="/wiki/Pearson%27s_chi-squared_test" title="Pearson's chi-squared test"><i>Pearson's</i> chi-squared test</a>. The chi-squared test is used to determine whether there is a significant difference between the expected frequencies and the observed frequencies in one or more categories.
</p>
    <p style="margin-left:85%"><i>Source: <b>Wikipedia</b></i></p>
    <br>
 <img src="https://upload.wikimedia.org/wikipedia/commons/8/8e/Chi-square_distributionCDF-English.png" width="800px" height="600px">
<h3><i><b>Video Tutorials</b></i></h3>
<a target="_blank" href="https://www.youtube.com/watch?v=jABsbNBPXIk">Chi-square statistic for hypothesis testing | AP Statistics | Khan Academy</a>
    <br>
<a target="_blank" href="https://www.youtube.com/watch?v=zOvUQWOzTlc">Chi-square test for association (independence) | AP Statistics | Khan Academy</a>
<br>
    <br>
<h3><i><b>Reading Material</b></i></h3>
<a target="_blank" href="https://machinelearningmastery.com/chi-squared-test-for-machine-learning/">A Gentle Introduction to the Chi-Squared Test for Machine Learning</a>

In [None]:
import pandas
import numpy
import matplotlib
from matplotlib import pyplot
from sklearn import preprocessing
%matplotlib inline
pandas.set_option('display.max_rows', 500)
pandas.set_option('display.max_columns', 500)
pandas.set_option('display.width', 1000)
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 as sklearn_chi2
try:
    from sklearn.preprocessing.imputation import Imputer
except ImportError:
    # scikit-learn 0.22 removed ``sklearn.preprocessing.imputation``; its
    # replacement is ``sklearn.impute.SimpleImputer``. Alias it so this cell
    # still imports on current releases.
    # NOTE(review): SimpleImputer has no ``axis`` argument, so any later
    # ``Imputer(...)`` call should be checked against the new API.
    from sklearn.impute import SimpleImputer as Imputer

In [None]:
# Load the comma-separated input file into a DataFrame.
data_frame = pandas.read_csv("../input/pima-data.csv")

In [None]:
# Display the frame's dimensions as (row count, column count).
data_frame.shape

In [None]:
# Preview the first rows of the frame.
data_frame.head()

In [None]:
def _clean_cell(value):
    """Normalise one cell that may carry a stray tab character.

    Non-string values and strings without a tab are returned unchanged.
    A string containing ``'\\t?'`` becomes NaN, a string whose first tab is
    at position 0 loses its first character, and any other string containing
    a tab loses its last character.
    """
    if not isinstance(value, str):
        return value
    first_tab = value.find('\t')
    if first_tab < 0:
        return value
    if '\t?' in value:
        return numpy.nan
    if first_tab == 0:
        return value[1:]
    return value[:-1]


# Apply the cell-level clean-up to every column of the frame.
for col in data_frame.columns:
    data_frame[col] = data_frame[col].map(_clean_cell)

In [None]:
# Per-column count of cells that are equal to zero.
print(data_frame.eq(0).sum())

In [None]:
# Treat every zero as a missing value so it can be imputed afterwards.
# ``numpy.nan`` is used because the ``numpy.NaN`` alias was removed in NumPy 2.0.
# NOTE(review): this is applied to all columns, including any where zero may
# be a legitimate value -- confirm that is intended.
data_frame[data_frame.columns] = data_frame[data_frame.columns].replace(0, numpy.nan)

In [None]:
# Report how many missing values each column holds.
print(data_frame.isna().sum())

In [None]:
# Impute in place: each missing cell takes the mean of its own column.
column_means = data_frame.mean()
data_frame.fillna(column_means, inplace=True)

In [None]:
# Encode the ``diabetes`` labels as integers and store them back in the frame,
# aligned on the frame's own index.
labelencoder = preprocessing.LabelEncoder()
encoded_labels = labelencoder.fit_transform(data_frame['diabetes'])
data_frame['diabetes'] = pandas.Series(encoded_labels, index=data_frame.index)

In [None]:
# Keep a handle on the target column before the frame is rearranged.
diabetes_col = data_frame['diabetes']

In [None]:
# Show the first rows again to inspect the current state of the frame.
data_frame.head()

In [None]:
# Move ``diabetes`` to the last position: drop it, then append the saved column.
features_only = data_frame.drop(labels=['diabetes'], axis=1)
data_frame = pandas.concat([features_only, diabetes_col], axis=1)

In [None]:
# Preview the first rows to check the resulting column order.
data_frame.head()

In [None]:
# Positional index of the last column of the frame.
iteration_val = data_frame.shape[1] - 1
# Accumulator for column indices chosen by the chi-squared loop.
selecte_feature_index = []

In [None]:
# Chi-squared test of independence between every feature column and the last
# column of ``data_frame``.
#
# For each feature the observed contingency table (counts of feature value x
# target value) is built with ``pandas.crosstab``: ``chi2_contingency`` expects
# a table of observed frequencies, not the raw data rows.  The verdict is
# printed twice -- once from the critical value and once from the p-value --
# as two readings of the same test statistic.
# NOTE(review): continuous features produce one table row per distinct value;
# binning them first may be advisable -- confirm with the analysis owner.
probability = 0.95           # confidence level of the test
alpha = 1.0 - probability    # matching significance level
target = data_frame.iloc[:, iteration_val]

for i in range(iteration_val):
    print('Feature Name : {0}'.format(data_frame.columns[i]))
    table = pandas.crosstab(data_frame.iloc[:, i], target).values
    chi_squared_stat, p_value, dof, expected = chi2_contingency(table)
    print(chi_squared_stat)

    # Reading 1: compare the statistic with the critical value.
    critical = chi2.ppf(probability, dof)
    if abs(chi_squared_stat) >= critical:
        print('Dependent : Reject Hypothesis 0 (null Hypothesis)')
    else:
        print('Inependent : Fail to Reject Hypothesis 0 (null Hypothesis)')

    # Reading 2: compare the p-value with the significance level.
    print('Significance {0}, {1}'.format(alpha, p_value))
    if p_value <= alpha:
        print("Dependent : Reject Hypothesis 0 (null Hypothesis)")
    else:
        print('Inependent : Fail to Reject Hypothesis 0 (null Hypothesis)')
        # Record the index once; appending under both readings stored every
        # such feature twice.
        # NOTE(review): this collects features judged *independent* of the
        # target -- confirm that is the intended selection rule.
        selecte_feature_index.append(i)
    print('================================================================')

# Chi-Square For Feature Selection

In [None]:
# Extract the frame's contents (all columns) as a NumPy array.
array=data_frame.values

In [None]:
# Display the extracted array.
array

In [None]:
# Feature matrix: every column except the last one.
X = data_frame.iloc[:, :-1]
# Target: the last column, kept as a single-column DataFrame.
y = data_frame.iloc[:, -1:]

In [None]:
# Score every feature against the target with the chi-squared statistic.
# k='all' retains all features, so this step ranks them rather than filtering.
bestfeatures = SelectKBest(sklearn_chi2, k='all')
fit = bestfeatures.fit(X, y)

In [None]:
# Display the score computed for each feature column by the fitted selector.
fit.scores_

In [None]:
# Pair each feature name with its score and print the eight highest-scoring rows.
dfcolumns = pandas.DataFrame(X.columns)
dfscores = pandas.DataFrame(fit.scores_)
# Place names and scores side by side, then give the columns readable labels.
featureScores = pandas.concat((dfcolumns, dfscores), axis=1)
featureScores.columns = ['Specs', 'Score']
print(featureScores.nlargest(n=8, columns='Score'))