<img src='img/Screen Shot 2021-10-28 at 1.12.39 AM.png'>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Observed counts of the contingency table: one row per group (A, B, C) and
# one column per bracket ('21-40', '41-60', '61-80').
O = np.array([
    [25, 30, 25],
    [30, 71, 19],
    [35, 49, 16],
])
df_observed = pd.DataFrame(
    O,
    index=['A', 'B', 'C'],
    columns=['21-40', '41-60', '61-80'],
)
df_observed

Unnamed: 0,21-40,41-60,61-80
A,25,30,25
B,30,71,19
C,35,49,16


In [3]:
# Reference result: hand the raw table to SciPy and unpack the four values it
# returns (test statistic, p-value, degrees of freedom, expected frequencies).
chi2_result = stats.chi2_contingency(O)
statistic, p_value, degree_of_freedom, expected = chi2_result

print('Statistic : ', statistic)
print('p Value : ', p_value)

Statistic :  13.315833333333334
p Value :  0.009831372848336144


$$
\displaystyle
\chi^2_{df}=\sum_{i=1}^{n_i}\sum_{j=1}^{n_j}\frac{(O_{ij}-E_{ij})^2}{E_{ij}},\quad df=(n_i-1)(n_j-1)
$$

In [3]:
# Append the marginal totals: a 'ColumnSum' row with each column's total and a
# 'RowSum' column with each row's total (their intersection is the grand total).
#
# Margins left over from an earlier run of this cell are dropped first.
# Without that, re-running the cell would add the old 'ColumnSum' row and
# 'RowSum' column into the new sums and silently double every total.
# errors='ignore' makes the drop a no-op on the first run, when they don't exist.
df_observed = df_observed.drop(index='ColumnSum', columns='RowSum', errors='ignore')
df_observed.loc['ColumnSum', :] = df_observed.sum(axis=0)
df_observed['RowSum'] = df_observed.sum(axis=1)
df_observed

Unnamed: 0,21-40,41-60,61-80,RowSum
A,25.0,30.0,25.0,80.0
B,30.0,71.0,19.0,120.0
C,35.0,49.0,16.0,100.0
ColumnSum,90.0,150.0,60.0,300.0


$$\begin{array}{lllll}
p_{i\cdot}&=&\frac{O_{i\cdot}}{O_{\cdot\cdot}}&=&\frac{\sum_{j'=1}^{n_j}O_{ij'}}{\sum_{i'=1}^{n_i}\sum_{j'=1}^{n_j}O_{i'j'}}\\
\\
p_{\cdot j}&=&\frac{O_{\cdot j}}{O_{\cdot\cdot}}&=&\frac{\sum_{i'=1}^{n_i}O_{i'j}}{\sum_{i'=1}^{n_i}\sum_{j'=1}^{n_j}O_{i'j'}}\\
\\
E_{ij}&=&p_{i\cdot}\,p_{\cdot j}\,O_{\cdot\cdot}
\end{array}$$

In [4]:
# Expected counts under independence: E_ij = p_i * p_j * N, where p_i and p_j
# are the marginal row / column proportions and N is the grand total.
#
# The last row ('ColumnSum') and last column ('RowSum') of df_observed hold the
# margins, so only the remaining cells are overwritten; the margins are kept
# as copied.
total = df_observed.loc['ColumnSum', 'RowSum']
category_rows = df_observed.index[:-1]
category_columns = df_observed.columns[:-1]

p_rows = df_observed.loc[category_rows, 'RowSum'].to_numpy() / total
p_columns = df_observed.loc['ColumnSum', category_columns].to_numpy() / total

df_expected = df_observed.copy()
# Broadcasting a column vector against a row vector fills the whole table in
# one assignment. The product order (p_row * p_column) * total matches the
# scalar formula, so each cell gets the same floating-point value.
df_expected.loc[category_rows, category_columns] = (
    p_rows[:, None] * p_columns[None, :] * total
)
df_expected

Unnamed: 0,21-40,41-60,61-80,RowSum
A,24.0,40.0,16.0,80.0
B,36.0,60.0,24.0,120.0
C,30.0,50.0,20.0,100.0
ColumnSum,90.0,150.0,60.0,300.0


In [5]:
# Strip the marginal row/column so only the table cells remain, then sum the
# per-cell contributions (O - E)^2 / E to get the chi-square statistic.
df_O = df_observed.iloc[:-1, :-1]
df_E = df_expected.iloc[:-1, :-1]
chi2_terms = (df_O - df_E) ** 2 / df_E
statistic = np.sum(chi2_terms.values)
statistic

13.315833333333337

In [6]:
# df = (number of rows - 1) * (number of columns - 1) of the observed table.
n_rows, n_columns = O.shape
degree_of_freedom = (n_rows - 1) * (n_columns - 1)
degree_of_freedom

4

In [7]:
# p-value = right-tail probability of the chi-square distribution at the
# observed statistic. The survival function is used rather than `1 - cdf`:
# the subtraction loses precision when the CDF is close to 1 (small p-values),
# whereas sf evaluates the tail directly.
p_value = stats.chi2.sf(statistic, degree_of_freedom)
p_value

0.009831372848336128

# Reference

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

[scipy.stats.chi2_contingency](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)