<a href="https://colab.research.google.com/github/WriterTyper/hypotheses-steps/blob/main/Crosstabs_Chisq_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Example from Jeffrey Hanif Watson: 
https://towardsdatascience.com/contingency-tables-chi-squared-and-cramers-v-ada4f93ec3fd

In [1]:
import pandas as pd

In [2]:
# Load the student records; the path is relative, so 'studentInfo.csv' must be in the working directory.
df = pd.read_csv('studentInfo.csv')

In [9]:
import statsmodels.api as sm

  import pandas.util.testing as tm


In [17]:
from scipy import stats

In [3]:
# Plain-text preview of the loaded frame; pandas abbreviates the middle rows and columns with "...".
# NOTE(review): a bare `df.head()` as the cell's last expression would give a richer, shorter display.
print(df)

      code_module code_presentation  ...  disability final_result
0             AAA             2013J  ...           N         Pass
1             AAA             2013J  ...           N         Pass
2             AAA             2013J  ...           Y    Withdrawn
3             AAA             2013J  ...           N         Pass
4             AAA             2013J  ...           N         Pass
...           ...               ...  ...         ...          ...
32588         GGG             2014J  ...           N         Fail
32589         GGG             2014J  ...           N  Distinction
32590         GGG             2014J  ...           Y         Pass
32591         GGG             2014J  ...           N    Withdrawn
32592         GGG             2014J  ...           N  Distinction

[32593 rows x 12 columns]


In [4]:
# Contingency table of counts: one row per highest_education level, one column per final_result.
tabs = pd.crosstab(index=df['highest_education'], columns=df['final_result'])

In [6]:
tabs

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,1496,2707,5812,4030
HE Qualification,697,790,1960,1283
Lower Than A Level,727,3426,4385,4620
No Formal quals,16,95,87,149
Post Graduate Qualification,88,34,117,74


In [10]:
# Wrap the counts in a statsmodels contingency Table; the cells below read its fitted values and residuals.
table = sm.stats.Table(tabs)

In [12]:
table.table_orig

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,1496,2707,5812,4030
HE Qualification,697,790,1960,1283
Lower Than A Level,727,3426,4385,4620
No Formal quals,16,95,87,149
Post Graduate Qualification,88,34,117,74


In [13]:
# Fitted (expected) cell counts under the model where rows and columns are independent.
table.fittedvalues

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,1303.104348,3038.853128,5326.611389,4376.431136
HE Qualification,438.852514,1023.408707,1793.867702,1473.871077
Lower Than A Level,1220.807904,2846.93695,4990.213788,4100.041359
No Formal quals,32.194888,75.078821,131.600865,108.125426
Post Graduate Qualification,29.040346,67.722394,118.706256,97.531004


In [14]:
# Pearson residuals, (observed - fitted) / sqrt(fitted); positive means more observations than independence predicts.
table.resid_pearson

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,5.343586,-6.019925,6.65065,-5.23669
HE Qualification,12.322777,-7.296129,3.922463,-4.971763
Lower Than A Level,-14.133,10.852678,-8.567404,8.120349
No Formal quals,-2.854201,2.299092,-3.887889,3.930876
Post Graduate Qualification,10.940925,-4.097814,-0.156606,-2.382699


H0: highest_education and final_result are independent.

H1: highest_education and final_result are not independent.

The significance level is alpha = 0.05.

In [20]:
def chi_sq_test(cross_tabs):
  """
  Runs a Chi-squared test of independence on a contingency table and prints the result.

  The Chi-squared statistic, the p-value, and the degrees of freedom returned by
  scipy's chi2_contingency are printed on three lines. Nothing is returned.

  Args:
    cross_tabs: A crosstab dataframe.
  """
  # The fourth item returned by chi2_contingency (the expected frequencies) is not reported.
  statistic, p_value, deg_freedom, _expected = stats.chi2_contingency(cross_tabs)
  summary = f'chi-squared = {statistic}\np value = {p_value}\ndegrees of freedom = {deg_freedom}'
  print(summary)

In [21]:
chi_sq_test(tabs)

chi-squared = 1024.6961991440007
p value = 9.18211300726649e-212
degrees of freedom = 12


Since the p-value (about 9.18e-212) is less than alpha (0.05), we reject the null hypothesis and conclude that highest_education and final_result are not independent.

In [24]:
import numpy as np

In [22]:
def cramers_v(cross_tabs):
    """
    Prints Cramer's V, its degrees of freedom, and a table of effect size thresholds.

    Cramer's V is computed as sqrt(chi2 / (n * dof)), where chi2 is the Pearson
    chi-squared statistic of the table (without continuity correction), n is the
    total number of observations, and dof is min(rows, columns) - 1.
    Nothing is returned.

    Args:
        cross_tabs: A crosstab dataframe.
    """

    # effect size data frame for cramer's v function
    # (one row per degrees-of-freedom value, 1 through 5)
    data = np.array([[1, .1, .3, .5],
       [2, .07, .21, .35],
       [3, .06, .17, .29],
       [4, .05,.15,.25],
       [5, .04, .13, .22]])
    sizes = pd.DataFrame(data, columns=['Degrees of Freedom', 'Small Effect', 'Medium Effect', 'Large Effect'])

    # getting the chi sq. stat
    # correction=False: by default scipy applies Yates' continuity correction to 2x2
    # tables, which shrinks chi2 and would make V fall short of 1 even for a perfectly
    # associated table. Larger tables are unaffected by this flag.
    chi2 = stats.chi2_contingency(cross_tabs, correction=False)[0]
    # calculating the total number of observations
    n = cross_tabs.sum().sum()
    # getting the degrees of freedom
    dof = min(cross_tabs.shape)-1
    # calculating cramer's v
    v = np.sqrt(chi2/(n*dof))
    # printing results
    print(f'V = {v}')
    print(f'Cramer\'s V Degrees of Freedom = {dof}')
    print(f'\nEffect Size Thresholds\n{sizes}\n')

In [25]:
cramers_v(tabs)

V = 0.10237048644403951
Cramer's V Degrees of Freedom = 3

Effect Size Thresholds
   Degrees of Freedom  Small Effect  Medium Effect  Large Effect
0                 1.0          0.10           0.30          0.50
1                 2.0          0.07           0.21          0.35
2                 3.0          0.06           0.17          0.29
3                 4.0          0.05           0.15          0.25
4                 5.0          0.04           0.13          0.22



In [30]:
tabs.shape

(5, 4)

In [31]:
# Contingency table of counts: disability (rows) against final_result (columns).
tabs2 = pd.crosstab(index=df['disability'], columns=df['final_result'])

In [32]:
tabs2

final_result,Distinction,Fail,Pass,Withdrawn
disability,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,2801,6340,11377,8911
Y,223,712,984,1245


In [33]:
chi_sq_test(tabs2)

chi-squared = 138.45398732595984
p value = 8.143196121271435e-30
degrees of freedom = 3


In [34]:
cramers_v(tabs2)

V = 0.06517642952859402
Cramer's V Degrees of Freedom = 1

Effect Size Thresholds
   Degrees of Freedom  Small Effect  Medium Effect  Large Effect
0                 1.0          0.10           0.30          0.50
1                 2.0          0.07           0.21          0.35
2                 3.0          0.06           0.17          0.29
3                 4.0          0.05           0.15          0.25
4                 5.0          0.04           0.13          0.22



In [35]:
tabs2.shape

(2, 4)

In [36]:
# Same two variables with the roles swapped: final_result as rows, disability as columns.
tabs3 = pd.crosstab(index=df['final_result'], columns=df['disability'])

In [40]:
tabs3

disability,N,Y
final_result,Unnamed: 1_level_1,Unnamed: 2_level_1
Distinction,2801,223
Fail,6340,712
Pass,11377,984
Withdrawn,8911,1245


In [38]:
chi_sq_test(tabs3)

chi-squared = 138.45398732595984
p value = 8.143196121271435e-30
degrees of freedom = 3


In [39]:
cramers_v(tabs3)

V = 0.06517642952859402
Cramer's V Degrees of Freedom = 1

Effect Size Thresholds
   Degrees of Freedom  Small Effect  Medium Effect  Large Effect
0                 1.0          0.10           0.30          0.50
1                 2.0          0.07           0.21          0.35
2                 3.0          0.06           0.17          0.29
3                 4.0          0.05           0.15          0.25
4                 5.0          0.04           0.13          0.22



In [41]:
# Row-normalized table: within each final_result row, the N and Y shares sum to 1.
tabs4 = pd.crosstab(index=df['final_result'], columns=df['disability'], normalize='index')

In [42]:
tabs4

disability,N,Y
final_result,Unnamed: 1_level_1,Unnamed: 2_level_1
Distinction,0.926257,0.073743
Fail,0.899036,0.100964
Pass,0.920395,0.079605
Withdrawn,0.877412,0.122588


In [45]:
# Column-normalized table: within each disability column, the final_result shares sum to 1.
tabs5 = pd.crosstab(index=df['final_result'], columns=df['disability'], normalize='columns')

In [46]:
tabs5

disability,N,Y
final_result,Unnamed: 1_level_1,Unnamed: 2_level_1
Distinction,0.095178,0.07048
Fail,0.215434,0.225032
Pass,0.386591,0.310999
Withdrawn,0.302797,0.393489


In [47]:
# Table normalized over the grand total: all cells together sum to 1.
tabs6 = pd.crosstab(index=df['final_result'], columns=df['disability'], normalize='all')

In [48]:
tabs6

disability,N,Y
final_result,Unnamed: 1_level_1,Unnamed: 2_level_1
Distinction,0.085939,0.006842
Fail,0.19452,0.021845
Pass,0.349063,0.030191
Withdrawn,0.273402,0.038198
