## chi squared

In [1]:
import pandas as pd
import numpy as np
from math import sqrt

Let pk = proportion of patients in category k <br>
H0 : The amount of flu vaccine does not influence the incidence of flu <br>
i.e. p1 = p2 = p3 <br>

HA : The amount of flu vaccine does influence the incidence of flu <br>
i.e. p1 != p2 or p1 != p3 or p2 != p3 <br>
(at least one of the proportions are not equal).

Assumptions:
* The n observed counts are a random sample from the population of interest.
* The sample size n is large enough so that, for every cell, the expected count will be >= 5.

### df observed

In [2]:
df = pd.DataFrame({'observed': ['FF', 'FG', 'lab'],
                   'votes': [61,53,36]})
df.set_index('observed', inplace=True)
print("df_observed")
df

df_observed


Unnamed: 0_level_0,votes
observed,Unnamed: 1_level_1
FF,61
FG,53
lab,36


### df expected

In [3]:
## DATAFRME
df_expected = df.copy()

In [4]:
col_tot = df_expected.sum().round(4).to_dict()
print("col total",col_tot)
row_tot = df_expected.sum(axis=1).round(4).to_dict()
print("row total",row_tot)
g_tot = sum(row_tot.values())
print("grand total",g_tot)

col total {'votes': 150}
row total {'FF': 61, 'FG': 53, 'lab': 36}
grand total 150


In [5]:
for c in col_tot:
    for r in row_tot:
        df_expected.loc[r,c] = row_tot[r] * col_tot[c] / g_tot
print("df_expected")
df_expected

df_expected


Unnamed: 0_level_0,votes
observed,Unnamed: 1_level_1
FF,61
FG,53
lab,36


### df (O - E)

In [6]:
## DATAFRME
df_oe = df.copy()
df_oe = round((df_oe - df_expected),4)

In [7]:
print("df_O-E")
df_oe

df_O-E


Unnamed: 0_level_0,votes
observed,Unnamed: 1_level_1
FF,0
FG,0
lab,0


### df (O - E)^2

In [8]:
## DATAFRME
df_oe2 = df_oe.copy()
df_oe2 = round(df_oe.pow(2),4)

In [9]:
print("df_(O-E)2")
df_oe2

df_(O-E)2


Unnamed: 0_level_0,votes
observed,Unnamed: 1_level_1
FF,0
FG,0
lab,0


### df (O - E)^2 / E

In [10]:
## DATAFRME
df_oe2_E = df_oe2.copy()
df_oe2_E = round(df_oe2_E.div(df_expected),4)

In [11]:
print("df (O-E)2 / E")
df_oe2_E

df (O-E)2 / E


Unnamed: 0_level_0,votes
observed,Unnamed: 1_level_1
FF,0.0
FG,0.0
lab,0.0


## $X^2$

In [12]:
(df_oe2_E.sum()).sum()

0.0

## df = (number of rows - 1)*(number of columns - 1)
percent point in X2 table <br>
percent = $\alpha$ * 100

In [13]:
O = {'1':20,
     '2':27,
     '3':23,
     '4':30,
    '5':32,
    '6':21,
    '7':19,
    '8':28}

E = {'1':25,
     '2':25,
     '3':25,
     '4':25,
    '5':25,
    '6':25,
    '7':25,
    '8':25}

X = {}
for c in O.keys():
    X[c] = round(pow((O[c] - E[c]),2) / E[c],4)
    
print("X",X)
print("sum",round(sum(X.values()),4))

X {'1': 1.0, '2': 0.16, '3': 0.16, '4': 1.0, '5': 1.96, '6': 0.64, '7': 1.44, '8': 0.36}
sum 6.72
