In [58]:
%autosave 0

Autosave disabled


The chi^2 test will determine if membership to one group affects membership to another.

In [59]:
import numpy as np
import pandas as pd

from pydataset import data
from scipy import stats

Let's read in the mpg dataset from pydataset!

In [60]:
mpg = data('mpg')
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


Let's do some feature engineering.

Our goal is to compare above/below average mpg to automatic/manual transmission.

In [61]:
mpg['mean_mpg'] = (mpg.cty + mpg.hwy)/2
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,mean_mpg
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0


In [62]:
#i want high mpg and low mpg into halves
pd.qcut(mpg.mean_mpg, 2, labels = ['low_mpg', 'high_mpg'])
#i am splitting this into two halves based on the median

1      high_mpg
2      high_mpg
3      high_mpg
4      high_mpg
5      high_mpg
         ...   
230    high_mpg
231    high_mpg
232    high_mpg
233    high_mpg
234    high_mpg
Name: mean_mpg, Length: 234, dtype: category
Categories (2, object): ['low_mpg' < 'high_mpg']

In [63]:
#when you are given featuers in a data set, you can take this continuous to makea binary 
mpg['mpg_cat'] = pd.qcut(mpg.mean_mpg, 2 , labels = ['low_mpg', 'high_mpg'])
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,mean_mpg,mpg_cat
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5,high_mpg
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0,high_mpg
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5,high_mpg
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5,high_mpg
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0,high_mpg


In [64]:
mpg.mpg_cat.value_counts()

low_mpg     121
high_mpg    113
Name: mpg_cat, dtype: int64

In [65]:
mpg['trans_bin']=np.where(mpg.trans.str.startswith('a'), 'auto', 'manual')
mpg.head()


Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,mean_mpg,mpg_cat,trans_bin
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5,high_mpg,auto
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0,high_mpg,manual
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5,high_mpg,manual
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5,high_mpg,auto
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0,high_mpg,auto


Now that we have our categorical features, we can prepare to run a chi^2 contingency test!

First, we need to define our null and alternative hypotheses.

Null hypothesis: transmission type does NOT affect mpg.
Alternative hypothesis: transmission type DOES affect mpg.

We need to generate a crosstab of our two categorical features.

Once we have the crosstab, we can run the test!

In [66]:
pd.crosstab(mpg.mean_mpg, mpg.trans_bin)

trans_bin,auto,manual
mean_mpg,Unnamed: 1_level_1,Unnamed: 2_level_1
10.5,4,1
12.5,2,0
13.0,10,0
13.5,2,1
14.0,4,3
14.5,2,1
15.0,17,0
15.5,11,2
16.0,5,3
16.5,4,1


In [67]:
ct = pd.crosstab(mpg.mpg_cat, mpg.trans_bin)  #these are somethingyou can feed into chi square test
ct

trans_bin,auto,manual
mpg_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
low_mpg,95,26
high_mpg,62,51


The test will return four values:
- The test statistic (chi^2)
- The p-value
- The degrees of freedom (sample size minus 1)
- The table of expected values, if the two features were independent of each other

In [72]:
chi, p, degf, exp =

SyntaxError: invalid syntax (2182096460.py, line 1)

In [None]:
a = 0.05. #if p is less than a meanthere is some significant difference
if p < a:
    print('We reject the null hypothesis.')
else:
        print('We fail to reject the null hypothesis.')

In [71]:
exp

array([[81.18376068, 39.81623932],
       [75.81623932, 37.18376068]])

In [70]:
ct

trans_bin,auto,manual
mpg_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
low_mpg,95,26
high_mpg,62,51


In [74]:
degf

1

#if we were to do it poorly

In [73]:
mpg.shape


(234, 14)

In [77]:
pd.crosstab(mpg.trans_bin, mpg['class'])  #class is a reserve word in Python 

In [80]:
bad_ct = pd.crosstab(mpg.trans_bin, mpg['class'])
bad_ct

class,2seater,compact,midsize,minivan,pickup,subcompact,suv
trans_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
auto,2,24,29,11,20,16,55
manual,3,23,12,0,13,19,7


In [81]:
chitwo,ptwo,degftwo,exptwo = stats.chi2_contingency(bad_ct)

In [82]:
ptwo

7.4513729838250925e-06

In [83]:
exptwo

array([[ 3.35470085, 31.53418803, 27.50854701,  7.38034188, 22.14102564,
        23.48290598, 41.5982906 ],
       [ 1.64529915, 15.46581197, 13.49145299,  3.61965812, 10.85897436,
        11.51709402, 20.4017094 ]])

In [84]:
bad_ct

class,2seater,compact,midsize,minivan,pickup,subcompact,suv
trans_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
auto,2,24,29,11,20,16,55
manual,3,23,12,0,13,19,7
