# In [1]:
import numpy as np
import pandas as pd

from scipy.stats import chi2_contingency


# In [2]:
def chi_squared_test_cramers_features_output(data, alpha):
    """Run a chi-squared independence test of every feature against 'target'.

    For each column of ``data`` other than ``'target'``, rows with a missing
    value in either the feature or the target are dropped, a contingency
    table is built with ``pd.crosstab`` and passed to
    ``scipy.stats.chi2_contingency`` (called with its default arguments).
    The bias-corrected Cramér's V of Bergsma, Journal of the Korean
    Statistical Society 42 (2013): 323-328, is computed from the resulting
    chi-squared statistic as a measure of association strength.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical features plus the output variable in a column named
        ``'target'``.
    alpha : float
        Significance level of the hypothesis test. The feature is reported
        as dependent on the target when ``p < alpha``.

    Returns
    -------
    pandas.DataFrame
        One row per feature (indexed by feature name), sorted by
        ``cramers_v`` in descending order, with the columns:

        chi2       - chi-squared test statistic
        p          - p-value of the test
        dof        - degrees of freedom of the test
        conclusion - 'Dependent' or 'Independent'
        cramers_v  - bias-corrected Cramér's V (NaN when it is undefined,
                     e.g. the feature or the target has a single level)
        cramers_df - Cramér's V degrees of freedom, min(rows, cols) - 1
    """
    features = data.columns[data.columns != 'target']
    results = {'chi2': [], 'p': [], 'dof': [], 'conclusion': [],
               'cramers_v': [], 'cramers_df': []}

    for feature in features:
        # Only rows where both the feature and the target are observed.
        pair = data[[feature, 'target']].dropna()
        table = pd.crosstab(pair[feature], pair['target'])
        chi2, p, dof, _expected = chi2_contingency(table)

        n = int(table.to_numpy().sum())
        r, k = table.shape

        # Bias-corrected Cramér's V. The corrected denominator vanishes for
        # degenerate tables (one row/column, or as many rows as
        # observations); V is undefined there, so report NaN instead of
        # dividing by zero.
        cramers_v = np.nan
        if n > 1:
            phi2 = chi2 / n
            phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
            rcorr = r - ((r - 1) ** 2) / (n - 1)
            kcorr = k - ((k - 1) ** 2) / (n - 1)
            denominator = min(kcorr - 1, rcorr - 1)
            if denominator > 0:
                cramers_v = np.sqrt(phi2corr / denominator)

        # The chi-squared independence test is right-tailed and p is already
        # the upper-tail probability, so it is compared with alpha itself
        # (not alpha / 2).
        conclusion = 'Dependent' if p < alpha else 'Independent'

        results['chi2'].append(chi2)
        results['p'].append(p)
        results['dof'].append(dof)
        results['conclusion'].append(conclusion)
        results['cramers_v'].append(cramers_v)
        results['cramers_df'].append(min(r, k) - 1)

    results = pd.DataFrame(results, index=features)
    return results.sort_values(by=['cramers_v'], ascending=False)
        
