# Chi-Square Test

In [1]:
#========================================================================================
# CODE NAME     : Chi-Square Test.py
# PURPOSE       : Demonstrate application of chi-square test using Python
# APPLICATION   : Analyzing Titanic Data
#=========================================================================================

In [1]:
# Load the input data
import numpy as np
import pandas as pd

titanic = pd.read_excel("C:\\Users\\Training\\Data Science using SAS and Python\\Data\\titanic.xlsx")
titanic.head()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survived
0,"Allen, Miss. Elisabeth Walton",29.0,female,1,211.34,1
1,"Allison, Master. Hudson Trevor",1.0,male,1,151.55,1
2,"Allison, Miss. Helen Loraine",2.0,female,1,151.55,0
3,"Allison, Mr. Hudson Joshua Creighton",30.0,male,1,151.55,0
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,female,1,151.55,0


In [2]:
# Format the Survived variable
condition = [titanic["Survived"] == 1, titanic["Survived"] == 0]
choice = ["Survived","Died"]
titanic["Survived"] = np.select(condition,choice,default = "NaN")
titanic.head()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survived
0,"Allen, Miss. Elisabeth Walton",29.0,female,1,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,male,1,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,female,1,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,male,1,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,female,1,151.55,Died


Chi-Square test for association between Gender and Survived, Class and Survived

In [3]:
def chi_sq(var1,var2):
    
    import scipy.stats as stats
    var1_by_var2 = pd.crosstab(var1,var2)
    chi_sq = stats.chi2_contingency(observed = var1_by_var2)
    chi_sq = pd.DataFrame(chi_sq, index = ["Chi-Sq","P-Value","DF","Expected Frequency"], columns = ["Value"])
    return chi_sq

# Chi-Sq test for Gender and Survived
chi_sq(titanic.Gender,titanic.Survived)

# Chi-Sq test for Class and Survived
# chi_sq(titanic.Class,titanic.Survived)

Unnamed: 0,Value
Chi-Sq,363.618
P-Value,4.58992e-81
DF,1
Expected Frequency,"[[288.00152788388084, 177.9984721161192], [520..."


# Exact p-values

In [7]:
# Load the Input Data
exact = pd.DataFrame({"A":[1,1,1,2,2,2,2],
                      "B":[2,2,2,1,1,2,2]})
exact

Unnamed: 0,A,B
0,1,2
1,1,2
2,1,2
3,2,1
4,2,1
5,2,2
6,2,2


In [8]:
a_by_b = pd.crosstab(exact.A,exact.B)
a_by_b

B,1,2
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,3
2,2,2


In [11]:
chi_sq(exact.A,exact.B)

Unnamed: 0,Value
Chi-Sq,0.364583
P-Value,0.545972
DF,1
Expected Frequency,"[[0.8571428571428571, 2.142857142857143], [1.1..."


Get the Fisher's Exact p-values

In [8]:
import scipy.stats as stats
oddsratio,p_value = stats.fisher_exact(a_by_b)
print("Fisher's Exact p-value: ", p_value.round(4))

Fisher's Exact p-value:  0.4286


# Ordinal Association between CLASS and SURVIVED Variables

In [6]:
import statsmodels.api as sm
result = sm.stats.Table(titanic[["Class","Survived"]])
print(result.test_ordinal_association())

null_mean   669448.8335039017
null_sd     9330.518536057307
pvalue      7.1250095705999446e-34
statistic   556248.5
zscore      -12.132266075721825


In [10]:
# Get the strength of ordinal association using Spearman Correlation
titanic[["Class","Survived"]].corr(method = 'spearman').round(4)

Unnamed: 0,Class,Survived
Class,1.0,-0.3097
Survived,-0.3097,1.0
