In [2]:
#dependencies
import re
import pandas as pd
import numpy as np
import scipy.stats as stats

import json
# import xlsxwriter

### Non Parametric Tests

In [None]:
# Run the full battery of non-parametric tests on every sample workbook and
# write one results workbook per input workbook.
# NOTE: ONStatisticTests and its helpers are defined in the cells below - run those cells first.
# Remember: one IV (Employment status) is nominal and the rest are ordinal; all DVs are ordinal.

workbooks = ['SampleDataOrdinal.xlsx', 'SampleDataNominal.xlsx']

for data_workbook in workbooks:

    df_result = ONStatisticTests(data_workbook)
    # change independent_data_type , dependent_data_type in the META BLOCK or add code to decipher the variable value

    # drop the extension whatever its length (.xls, .xlsx, .xlsm ...) instead of assuming 5 characters
    workbook_stem = data_workbook.rsplit('.', 1)[0]

    df_result.to_excel('results' + workbook_stem + '.xlsx', index=False)

### Helper Generic Functions

In [None]:
def getMetadata(df_temp, independent_data_type = 'ordinal', dependent_data_type = 'ordinal'):

    '''
    Collect the descriptive values the statistical tests rely on.

    df_temp - data frame whose first column is treated as the IV and second column as the DV
    independent_data_type - measurement level of the IV, ordinal or nominal. Default = ordinal
    dependent_data_type - measurement level of the DV, ordinal or nominal. Default = ordinal

    Returns - tuple (distinct IV values, distinct DV values, IV data type, DV data type).
    The two data types are passed through unchanged; they are not inferred from the data.
    '''

    iv_column, dv_column = df_temp.columns[0], df_temp.columns[1]

    # number of distinct categories per variable (used for dof = (r-1)(c-1));
    # a missing value counts as a category of its own
    category_counts = tuple(len(df_temp[name].unique()) for name in (iv_column, dv_column))

    return (*category_counts, independent_data_type, dependent_data_type)



def cramerV_interpretation(cramer_dof, number):

    '''Interpret cramer V
    cramer_dof - dof for cramer V, min(rows - 1, columns - 1) of the contingency table
    number - cramer V stat

    Returns - 'no assosiation' for a stat <= 0, otherwise 'weak', 'moderate', 'strong'
    or 'very strong'. The 'value could not be found ...' message is returned when the
    stat is NaN, lies above 1, or cramer_dof is below 1.'''

    # upper limits of the 'weak', 'moderate' and 'strong' bands for dof 1 to 5
    cramer_table = {1: (0.10, 0.30, 0.50),
                    2: (0.07, 0.21, 0.35),
                    3: (0.06, 0.17, 0.29),
                    4: (0.05, 0.15, 0.25),
                    5: (0.04, 0.13, 0.22)}

    not_found = 'value could not be found - check the cramerV table'

    if number <= 0 :
        return 'no assosiation'

    # NaN never equals itself; a table with a single row/column has dof 0 and no defined V
    if number != number or cramer_dof < 1:
        return not_found

    if cramer_dof in cramer_table:
        cutoffs = cramer_table[cramer_dof]
    else:
        # the tabulated limits coincide with 0.10 / 0.30 / 0.50 divided by sqrt(dof),
        # rounded to two decimals - apply the same rule beyond dof 5 instead of raising KeyError
        cutoffs = tuple(round(base / cramer_dof ** 0.5, 2) for base in (0.10, 0.30, 0.50))

    for effect, upper_bound in zip(('weak', 'moderate', 'strong'), cutoffs):
        if number < upper_bound:
            return effect

    # the top band includes V == 1 (perfect association); small tolerance for float rounding
    if number <= 1 + 1e-9:
        return 'very strong'

    return not_found


def pearsonsChi2Test(df_temp, IV_dataType, DV_dataType, significant_val):

    ''' PEARSON'S CHI2 TEST - for nominal DV and nominal IV
    df_temp - your dataframe; the first column is used as the IV and the second as the DV
    IV_dataType - is it ordinal or nominal
    DV_dataType - is it ordinal or nominal
    significant_val - alpha under the Null Hypothesis as a number

    Returns - tuple (validity text, expected-count condition text, chi2 stat, p value,
    significance text, dof, Cramer V, Cramer V interpretation, contingency table).
    The test is always computed; the validity text only records whether both variables
    were declared nominal.'''

    # prepare data
    both_nominal = DV_dataType == "nominal" and IV_dataType == "nominal"
    chi_validity = "valid as DV is nominal and IV is nominal" if both_nominal else "not valid as either DV or IV is ordinal"        # Chi2 validity

    # do not use margins = True as it corrupts the result
    chi_contingency_table = pd.crosstab(df_temp[df_temp.columns[0]], df_temp[df_temp.columns[1]])
    total_count = chi_contingency_table.sum().sum()

    # it accepts both Rated values and original string values - the table only holds counts.
    # chi2_contingency also returns the expected counts (row total * column total / n).
    chi2, chi_pvalue, chi_dof, expected = stats.contingency.chi2_contingency(chi_contingency_table, correction = False)[:4]    # Chi stat, Chi P value, Chi dof
    chi_significance = 'significant' if chi_pvalue < significant_val else 'not significant'

    # every expected cell count has to reach 5
    any_small_cell = (np.asarray(expected) < 5).any()
    chi_meet_condition = "no as expected values less than 5" if any_small_cell else "yes as expected values more than 5"       # Meets condition

    # Calculate Cramer V coefficient - calculate V when you're working with any table larger than a 2 x 2 contingency table
    cramer_dof = min(chi_contingency_table.shape) - 1

    if cramer_dof < 1:
        # a variable with a single category gives a one-row/one-column table: V would be a
        # division by zero, so report it as undefined instead of crashing the whole run
        effect_size_cramer_V = np.nan
        cramerV_interpret = 'value could not be found - check the cramerV table'
    else:
        effect_size_cramer_V = np.sqrt(chi2 / (cramer_dof * total_count))                                                      # Cramer V coeff
        cramerV_interpret = cramerV_interpretation(cramer_dof, effect_size_cramer_V)                                           # stat interpretation

    return chi_validity, chi_meet_condition, chi2, chi_pvalue, chi_significance, chi_dof, effect_size_cramer_V, cramerV_interpret, chi_contingency_table



def ordinalChi2Test(df_temp, contingency_table, DV_dataType, IV_dataType, IV_unique, DV_unique, chi2, chi_dof, significant_val):

    '''LINEAR TO LINEAR TEST (ORDINAL CHI SQUARE TEST) + KENDALL TAU
    df_temp - your dataframe; the numeric ranks are read from its 'Rating IV' and 'Rating DV' columns
    contingency_table - contingency table computed for IV and DV without margins (only its grand total is used)
    DV_dataType - is it ordinal or nominal
    IV_dataType - is it ordinal or nominal
    IV_unique, DV_unique - number of unique categories in IV and DV
    chi2 - as computed in pearsonsChi2Test function
    chi_dof - as computed in pearsonsChi2Test function
    significant_val - alpha under the Null Hypothesis as a number

    Returns - 16 values in this order: validity text; M2 (pearson), its p value and significance;
    M2 (kendall), its p value and significance; deviation from linear, its dof, p value and
    significance; kendall tau validity text, tau, tau effect size, tau p value and significance.
    When either variable is not ordinal a list of 16 NaN is returned instead.'''

    both_ordinal = DV_dataType == "ordinal" and IV_dataType == "ordinal"

    if not both_ordinal:

        print(' returning nans')

        return [np.nan] * 16

    o_chi_validity = "valid as DV and IV are ordinal"                                                     # Linear 2 Linear validity
    kendalltau_validity = "valid as DV and IV are ordinal"                                                # Kendall tau validity

    sample_size = contingency_table.sum().sum()
    df = 1                                                                                                # M2 is a chi-square statistic on 1 degree of freedom

    # M2 based on pearson's r - use ranks and not the categorical values
    o_chi_pearsonsr_stat, _ = stats.pearsonr(df_temp['Rating IV'], df_temp['Rating DV'])
    m2_p = (sample_size - 1) * o_chi_pearsonsr_stat**2                                                    # Ochi stat (pearsons) dof = 1
    # sf is the upper tail (1 - cdf) without the cancellation error for tiny p values
    o_chi_p_pvalue = stats.chi2.sf(m2_p, df = df)                                                         # Ochi P value (pearson's)
    o_chi_p_significance = 'significant' if o_chi_p_pvalue < significant_val else 'not significant'       # Ochi P val interpretation (pearsons)

    # Kendall tau: variant b when both variables have the same number of categories, variant c otherwise
    variant = 'b' if IV_unique == DV_unique else 'c'
    o_chi_kendalltau_stat, kt_pvalue = stats.kendalltau(df_temp['Rating IV'], df_temp['Rating DV'], variant = variant)   # Kendall tau stat, Kendall tau P value
    kendalltau_significance = 'significant' if kt_pvalue < significant_val else 'not significant'         # Kendall tau significance
    kendalltau_effectsize = kendalltau_interpretation(o_chi_kendalltau_stat)                              # Kendall tau Effect size

    # M2 based on kendall tau
    m2_k = (sample_size - 1) * o_chi_kendalltau_stat**2                                                   # Ochi stat (kendall) dof = 1
    o_chi_k_pvalue = stats.chi2.sf(m2_k, df = df)                                                         # Ochi P value (kendall)
    o_chi_k_significance = 'significant' if o_chi_k_pvalue < significant_val else 'not significant'       # Ochi P val interpretation (kendall)

    # Deviation from linear: the part of the pearson chi2 that the linear trend does not explain
    chi_deviation_from_linear = abs(m2_p - chi2)                                                          # Deviation from linear
    chi_deviation_dof = chi_dof - 1                                                                       # Deviation dof
    # the p value is the UPPER tail, like the two p values above (the lower-tail cdf reports
    # large deviations as non-significant); it is undefined when no dof is left (2 x 2 table)
    if chi_deviation_dof > 0:
        deviation_pvalue = stats.chi2.sf(chi_deviation_from_linear, df = chi_deviation_dof)               # Deviation P value
    else:
        deviation_pvalue = np.nan
    deviation_significance = 'significant' if deviation_pvalue < significant_val else 'not significant'   # Deviation P val interpretation

    return o_chi_validity, m2_p, o_chi_p_pvalue, o_chi_p_significance, m2_k, o_chi_k_pvalue, o_chi_k_significance, \
        chi_deviation_from_linear, chi_deviation_dof, deviation_pvalue, deviation_significance, \
            kendalltau_validity, o_chi_kendalltau_stat, kendalltau_effectsize, kt_pvalue, kendalltau_significance


def kendalltau_interpretation(number):

    '''Interpret Kendall tau
    number - Kendall tau stat; only its magnitude is used, the sign is ignored

    Returns - 'no assosiation' for 0, otherwise 'very weak', 'weak', 'moderate' or
    'very strong'. The 'value could not be found ...' message is returned when the
    stat is NaN or its magnitude lies above 1.'''

    number = abs(number)

    if number == 0 :
        return 'no assosiation'

    # upper limit (exclusive) of each band below the top one
    kendall_table = (('very weak', 0.10), ('weak', 0.20), ('moderate', 0.30))

    for effect, upper_bound in kendall_table:
        if number < upper_bound:
            return effect

    # the top band includes |tau| == 1 (perfect agreement / disagreement);
    # the tolerance absorbs float rounding. NaN fails this comparison and falls through.
    if number <= 1 + 1e-9:
        return 'very strong'

    return 'value could not be found - check the kendall table'



def kruskalTest(df_temp, IV_dataType, DV_dataType, contingency_table, significant_val): 

    '''KRUSKAL WALLIS - independent variable is treated as nominal
    df_temp - your dataframe; groups are formed from its 'Rating IV' column and the
              compared values are read from its 'Rating DV' column
    IV_dataType - is it ordinal or nominal (not used by the test itself)
    DV_dataType - is it ordinal or nominal
    contingency_table - contingency table computed for IV and DV without margins
                        (its grand total is used as the sample size of the effect size)
    significant_val - alpha under the Null Hypothesis as a number

    Returns - (validity text, H stat, p value, significance text, effect size interpretation).
    A list of 5 NaN is returned when the DV is not ordinal; when fewer than two IV groups
    exist the validity text is followed by four NaN because the test cannot be computed.'''

    if DV_dataType != "ordinal":

        print(' returning nans')

        return [np.nan] * 5

    kruskal_validity = "valid as DV is ordinal"  # validity

    # one sample of DV ranks per IV level, in order of first appearance. groupby leaves out
    # rows whose IV rank is missing; comparing against NaN used to add an empty sample
    # that turned the whole test into NaN
    groups = [ratings for _, ratings in df_temp.groupby('Rating IV', sort = False)['Rating DV']]

    if len(groups) < 2:
        # stats.kruskal raises with fewer than two samples - report "not computable" instead
        return kruskal_validity, np.nan, np.nan, np.nan, np.nan

    kruskal_stat, kruskal_pvalue = stats.kruskal(*groups)                                                # Kruskal stat, Kruskal P value
    kruskal_significance = 'significant' if kruskal_pvalue < significant_val else 'not significant'      # Kruskal p value interpretation

    count = contingency_table.sum().sum()
    kruskal_effectsize = epsilon2_interpretation(kruskal_stat, count)                                    # Kruskal effect size interpretation

    return kruskal_validity, kruskal_stat, kruskal_pvalue, kruskal_significance, kruskal_effectsize



def epsilon2_interpretation(num, count):

    '''Interpret the Kruskal Wallis effect size (epsilon squared)
    num - Kruskal Wallis H stat
    count - number of observations the stat was computed from

    Returns - 'no assosiation' for an effect size of 0, otherwise 'very weak', 'weak',
    'moderate', 'relatively strong', 'strong' or 'very strong'. The 'value could not be
    found ...' message is returned when count is 1 or less, or the effect size is NaN
    or lies above 1.'''

    not_found = 'value could not be found - check the epsilon table'

    # the effect size divides by (count**2 - 1): undefined for a single observation
    if count <= 1:
        return not_found

    num = abs(num)

    # epsilon squared = H * (n + 1) / (n**2 - 1)
    number = (num * (count +1))/ (count**2 - 1)

    if number == 0 :
        return 'no assosiation'

    # upper limit (exclusive) of each band below the top one
    epsilon_table = (('very weak', 0.01), ('weak', 0.04), ('moderate', 0.16), ('relatively strong', 0.36), ('strong', 0.64))

    for effect, upper_bound in epsilon_table:
        if number < upper_bound:
            return effect

    # the top band includes an effect size of exactly 1 (the maximum, reached when H = n - 1);
    # the tolerance absorbs float rounding. NaN fails this comparison and falls through.
    if number <= 1 + 1e-9:
        return 'very strong'

    return not_found


def ONStatisticTests(data_workbook, independent_data_type = 'ordinal', dependent_data_type = 'ordinal', significant_val = 0.05):

    '''Run the chi2, linear-to-linear / Kendall tau and Kruskal Wallis blocks on every sheet of a workbook
    data_workbook - excel file; each sheet holds one combination of IV and DV and their scores.
                    The first two columns are used as IV and DV, the ordinal tests read the
                    'Rating IV' and 'Rating DV' columns
    independent_data_type - is the independent data ordinal or nominal. Default = ordinal
    dependent_data_type - is the dependent data ordinal or nominal. Default = ordinal
    significant_val - alpha under the Null Hypothesis as a number. Default = 0.05

    Returns - data frame with one row of test results per sheet, in sheet order'''

    result_columns = ["Unique IV", "Unique DV", "IV dataType", "DV dataType",
                      "Chi2 validity", "Meets condition", 'Chi stat', 'Chi P value', "Chi significance", "Chi dof", "Effect size Cramer V coeff", "Chi2 Effect size interpretation",
                      "Linear 2 Linear validity", "Ochi stat (pearsons) dof = 1",  "Ochi P value (pearson's)", "Ochi P value (pearson's) Significance",
                      "Ochi stat (kendall) dof = 1", "Ochi P value (kendall)", "Ochi P value (kendall's) Significance",
                      "Deviation from linear", "Deviation dof", "Deviation P value", "Deviation P value Significance",
                      "Kendall tau validity", "Kendall tau stat", "Kendall Effect size interpretation", "Kendall tau P value", "Kendall tau P value Significance",
                      "Kruskal validity", "Kruskal stat", "Kruskal P value", "Kruskal significance", "Kruskal Effectsize interpretation"]

    result_rows = []

    # open the workbook once and close it when done, instead of re-opening it for every sheet
    with pd.ExcelFile(data_workbook) as excel:

        # run test on each sheet
        for sheet_name in excel.sheet_names:

            df_temp = pd.read_excel(excel, sheet_name=sheet_name)

            # META BLOCK
            IV_unique ,DV_unique, IV_dataType, DV_dataType = getMetadata(df_temp, independent_data_type = independent_data_type, dependent_data_type = dependent_data_type)

            # CHI2 BLOCK - NN
            chi_validity, chi_meet_condition, chi2, chi_pvalue, chi_significance, chi_dof, effect_size_cramer_V, cramerV_interpret, \
                    chi_contingency_table = pearsonsChi2Test(df_temp, IV_dataType, DV_dataType, significant_val)

            # L2L and KENDALL TAU BLOCK - OO (16 values, already in result column order)
            ordinal_results = ordinalChi2Test(df_temp, chi_contingency_table, DV_dataType, IV_dataType, IV_unique, DV_unique, chi2, chi_dof, significant_val)

            # KRUSKAL WALLIS BLOCK - NO (5 values, already in result column order)
            kruskal_results = kruskalTest(df_temp, IV_dataType, DV_dataType, chi_contingency_table, significant_val)

            result_rows.append([IV_unique ,DV_unique, IV_dataType, DV_dataType,
                                chi_validity, chi_meet_condition, chi2, chi_pvalue, chi_significance, chi_dof, effect_size_cramer_V, cramerV_interpret,
                                *ordinal_results, *kruskal_results])

    # build the frame once rather than growing it one row at a time
    df_result = pd.DataFrame(result_rows, columns=result_columns)

    return df_result
