# Implementation of Pearson's chi squared test

In [1]:
import pandas as pd
import numpy as np
from scipy import stats as ss

In [2]:
# Load the HR dataset from a CSV file in the working directory and preview
# the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column whose values mirror
# the row index for the displayed rows — presumably a saved index column;
# confirm before switching to read_csv(..., index_col=0).
hr_data = pd.read_csv('HRDataset_v9.csv')
hr_data.head()

Unnamed: 0,Employee Name,Employee Number,MarriedID,MaritalStatusID,GenderID,EmpStatus_ID,DeptID,Perf_ScoreID,Age,Pay Rate,...,Date of Hire,Days Employed,Date of Termination,Reason For Term,Employment Status,Department,Position,Manager Name,Employee Source,Performance Score
0,"Brown, Mia",1103024456,1,1,0,1,1,3,30,28.5,...,10/27/2008,3317,,N/A - still employed,Active,Admin Offices,Accountant I,Brandon R. LeBlanc,Diversity Job Fair,Fully Meets
1,"LaRotonda, William",1106026572,0,2,1,1,1,3,34,23.0,...,1/6/2014,1420,,N/A - still employed,Active,Admin Offices,Accountant I,Brandon R. LeBlanc,Website Banner Ads,Fully Meets
2,"Steans, Tyrone",1302053333,0,0,1,1,1,3,31,29.0,...,9/29/2014,1154,,N/A - still employed,Active,Admin Offices,Accountant I,Brandon R. LeBlanc,Internet Search,Fully Meets
3,"Howard, Estelle",1211050782,1,1,0,1,1,9,32,21.5,...,2/16/2015,58,4/15/2015,N/A - still employed,Active,Admin Offices,Administrative Assistant,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review
4,"Singh, Nan",1307059817,0,0,0,1,1,9,30,16.56,...,5/1/2015,940,,N/A - still employed,Active,Admin Offices,Administrative Assistant,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review


### Contingency table

In [3]:
# Build the contingency table: one row per manager, one column per
# performance score, each cell holding the number of matching employees.
cross = pd.crosstab(
    index=hr_data['Manager Name'],
    columns=hr_data['Performance Score'],
)
cross

Performance Score,90-day meets,Exceeds,Exceptional,Fully Meets,N/A- too early to review,Needs Improvement,PIP
Manager Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alex Sweetwater,2,1,1,4,0,1,0
Amy Dunn,1,2,1,11,5,0,1
Board of Directors,0,0,0,2,0,0,0
Brandon R. LeBlanc,1,0,0,4,2,0,0
Brannon Miller,1,5,2,8,0,1,4
Brian Champaigne,0,0,0,8,0,0,0
David Stanley,1,1,0,15,4,0,0
Debra Houlihan,0,0,0,2,0,1,0
Elijiah Gray,3,2,0,13,3,1,0
Eric Dougall,0,1,0,3,0,0,0


In [4]:
# Show the column labels of the contingency table (the performance-score
# categories).
cross.columns

Index(['90-day meets', 'Exceeds', 'Exceptional', 'Fully Meets',
       'N/A- too early to review', 'Needs Improvement', 'PIP'],
      dtype='object', name='Performance Score')

In [6]:
# Remove the 'N/A- too early to review' category from the contingency table.
# The label is passed through `columns=` because DataFrame.drop accepts the
# axis only by keyword in pandas >= 2.0 (a positional `1` raises TypeError).
# errors='ignore' keeps the cell re-runnable: `cross` is reassigned here, so
# on a second execution the column has already been removed.
cross = cross.drop(columns='N/A- too early to review', errors='ignore')
cross

Performance Score,90-day meets,Exceeds,Exceptional,Fully Meets,Needs Improvement,PIP
Manager Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alex Sweetwater,2,1,1,4,1,0
Amy Dunn,1,2,1,11,0,1
Board of Directors,0,0,0,2,0,0
Brandon R. LeBlanc,1,0,0,4,0,0
Brannon Miller,1,5,2,8,1,4
Brian Champaigne,0,0,0,8,0,0
David Stanley,1,1,0,15,0,0
Debra Houlihan,0,0,0,2,1,0
Elijiah Gray,3,2,0,13,1,0
Eric Dougall,0,1,0,3,0,0


In [10]:
# Dimensions of the contingency table as (number of rows, number of columns).
cross.shape

(21, 6)

In [7]:
# Pearson's chi-squared test of independence on the contingency table.
# The result holds, in order: the test statistic, the p-value, the degrees of
# freedom, and the array of expected frequencies.
# NOTE(review): many expected frequencies in the displayed output are below 5,
# where the chi-squared approximation is commonly considered unreliable —
# consider merging sparse categories or using an exact/permutation test.
ss.chi2_contingency(cross)

(104.19464939881554,
 0.3670945067181692,
 100,
 array([[ 1.02197802,  0.92307692,  0.2967033 ,  5.96703297,  0.49450549,
          0.2967033 ],
        [ 1.81684982,  1.64102564,  0.52747253, 10.60805861,  0.87912088,
          0.52747253],
        [ 0.22710623,  0.20512821,  0.06593407,  1.32600733,  0.10989011,
          0.06593407],
        [ 0.56776557,  0.51282051,  0.16483516,  3.31501832,  0.27472527,
          0.16483516],
        [ 2.38461538,  2.15384615,  0.69230769, 13.92307692,  1.15384615,
          0.69230769],
        [ 0.90842491,  0.82051282,  0.26373626,  5.3040293 ,  0.43956044,
          0.26373626],
        [ 1.93040293,  1.74358974,  0.56043956, 11.27106227,  0.93406593,
          0.56043956],
        [ 0.34065934,  0.30769231,  0.0989011 ,  1.98901099,  0.16483516,
          0.0989011 ],
        [ 2.15750916,  1.94871795,  0.62637363, 12.5970696 ,  1.04395604,
          0.62637363],
        [ 0.45421245,  0.41025641,  0.13186813,  2.65201465,  0.21978022,
     

 $$\chi^2\ \text{test statistic} = 104.19464939881554$$<br>
 $$p\text{-value} = 0.3670945067181692$$<br>
 $$\text{degrees of freedom (dof)} = 100$$
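> The array contains the expected frequency of each cell under the null hypothesis that the two variables are independent.<br>
> Since the `p-value` is greater than $0.05$, we fail to reject the null hypothesis: the data provide no significant evidence of a relationship between the Performance Score and the Manager Name (who an employee works for). Note that many of the expected frequencies are below 5, so the chi-squared approximation should be interpreted with caution.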

In [8]:
def cramers_corrected_stat(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Uses the correction from Wicher Bergsma, "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : array-like of shape (r, k)
        Two-way contingency table of observed counts (for example the
        result of ``pd.crosstab``).

    Returns
    -------
    float
        The corrected statistic, from 0 (no association) up to 1 (perfect
        association). ``nan`` is returned when the statistic is undefined:
        fewer than two observations, or a table whose bias-corrected number
        of rows or columns does not exceed one.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.chi2_contingency`` (for example when
        an expected frequency is zero).
    """
    observed = np.asarray(confusion_matrix)
    # Cramér's V is defined on the plain Pearson statistic. scipy applies
    # Yates' continuity correction to 2x2 tables by default, which would
    # shrink the statistic, so it is switched off explicitly.
    chi2 = ss.chi2_contingency(observed, correction=False)[0]
    n = observed.sum()
    r, k = observed.shape
    if n <= 1:
        # The bias terms divide by (n - 1).
        return np.nan
    phi2 = chi2 / n
    # Subtract the expected value of phi^2 under independence, floored at 0.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min(kcorr - 1, rcorr - 1)
    if denom <= 0:
        # Single row/column, or too few observations for the table size:
        # the ratio is undefined rather than a ZeroDivisionError.
        return np.nan
    return np.sqrt(phi2corr / denom)

In [11]:
# Effect size of the association in the contingency table, on a scale from
# 0 (no association) to 1 (perfect association).
cramers_corrected_stat(cross)

0.053443126803626846

> Cramér's V is close to zero ($\approx 0.053$), meaning there is little to no association between the two variables.