In [2]:
from sklearn import datasets
from scipy import stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the passenger sample CSV; the path is relative to the notebook's working directory.
# NOTE(review): the displayed output has an "Unnamed: 0" column — presumably a saved
# row index; confirm and consider reading with index_col=0.
df = pd.read_csv("toy_data/test.csv")
# Last expression of the cell: preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Chi-Square Tests

This section summarizes the purpose of each type of Chi-Square test:

### 1. **Chi-Square Test of Independence** ([reference: JMP Statistics Knowledge Portal](https://www.jmp.com/en_in/statistics-knowledge-portal/chi-square-test/chi-square-test-of-independence.html))
- **Purpose:** To determine if there is a significant association or relationship between two categorical variables.
- **Scenario:** You have two categorical variables and want to assess whether the distribution of one variable depends on the categories of the other variable. Essentially, it tests if the variables are independent of each other.
- **Example:** Suppose you want to study if there is a relationship between gender (male, female) and preference for a particular type of product (e.g., electronics, clothing). You would use this test to see if the distribution of product preference is independent of gender or if it varies by gender.

### 2. **Chi-Square Test of Homogeneity**
- **Purpose:** To compare the distribution of a categorical variable across different populations or groups to see if they have the same distribution.
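- **Scenario:** You have multiple groups and want to test if the categorical variable is distributed the same way in each group. This is similar to the test of independence but focuses on comparing distributions across different populations rather than within a single population. The test statistic is computed in exactly the same way as for the test of independence (from a groups × categories contingency table); the difference lies in the sampling design and in how the hypotheses are worded.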
- **Example:** You might want to examine if the distribution of a certain preference (e.g., types of transportation: car, bike, bus) is the same among different age groups (e.g., teens, adults, seniors). This test helps determine if the preference patterns are homogeneous across these age groups.

### 3. **Chi-Square Goodness-of-Fit Test**
- **Purpose:** To test whether the observed frequencies of a single categorical variable match an expected or theoretical distribution.
- **Scenario:** You have one categorical variable and want to check if the observed distribution fits a specific expected distribution. This test compares the observed data against a hypothesized distribution to see if they align.
- **Example:** You might want to see if the observed number of customers preferring different flavors of ice cream (chocolate, vanilla, strawberry) fits a theoretical distribution where each flavor is equally likely. For this, you compare the observed counts with the expected counts under the assumption of equal preference.

**Summary:**
- **Chi-Square Test of Independence:** Tests if two categorical variables are related or independent within a single population.
- **Chi-Square Test of Homogeneity:** Tests if different populations or groups have the same distribution for a categorical variable.
- **Chi-Square Goodness-of-Fit Test:** Tests if the distribution of a single categorical variable matches a specified expected distribution.

In [8]:
# Chi-square test of independence
def chisquare_independence_test(df, x_col, y_col, alpha=0.05):
    """
    Run a Chi-square test of independence between two categorical columns.

    A contingency table of ``df[x_col]`` against ``df[y_col]`` is built with
    ``pd.crosstab`` and handed to ``scipy.stats.chi2_contingency`` (called with
    its default arguments).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    x_col, y_col : str
        Names of the two categorical (nominal) columns to compare.
    alpha : float, default 0.05
        Significance level. H0 is rejected when the p-value is strictly
        below ``alpha``.

    Returns
    -------
    dict
        ``"h0"`` / ``"h1"`` : text of the null / alternative hypothesis,
        ``"p_val"``         : the unrounded p-value of the test,
        ``"cc"``            : ``"Reject null hypothesis"`` or
                              ``"Fail to reject null hypothesis"``.
    """
    h_0 = f"H0: no relation btween {x_col} & {y_col}"  # both are independent
    h_1 = f"H1: relation btween {x_col} & {y_col}"  # there is a dependency
    contingency_table = pd.crosstab(df[x_col], df[y_col])
    # Only the p-value is used; the statistic, dof and expected counts are discarded.
    _, p_value, _, _ = stats.chi2_contingency(contingency_table)
    # Compare the raw p-value. Rounding to two decimals first would turn e.g.
    # 0.046 into 0.05 and wrongly fail to reject H0.
    if p_value < alpha:
        result = "Reject null hypothesis"
    else:
        result = "Fail to reject null hypothesis"
    return {"h0": h_0, "h1": h_1, "p_val": p_value, "cc": result}

In [None]:
# Chi-square test of homogeneity
def chisquare_homoginity_test(df, x_col, y_col, alpha=0.05):
    """
    Run a Chi-square test of homogeneity on two categorical columns.

    A contingency table of ``df[x_col]`` against ``df[y_col]`` is built with
    ``pd.crosstab`` and handed to ``scipy.stats.chi2_contingency`` (called with
    its default arguments). The computation is the same as for the test of
    independence; here the table is read as "groups x categories" and the
    question is whether ``y_col`` has the same distribution in every group
    of ``x_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    x_col : str
        Column identifying the groups / populations being compared.
    y_col : str
        Categorical column whose distribution is compared across groups.
    alpha : float, default 0.05
        Significance level. H0 is rejected when the p-value is strictly
        below ``alpha``.

    Returns
    -------
    dict
        ``"h0"`` / ``"h1"`` : text of the null / alternative hypothesis,
        ``"p_val"``         : the unrounded p-value of the test,
        ``"cc"``            : ``"Reject null hypothesis"`` or
                              ``"Fail to reject null hypothesis"``.
    """
    # NOTE(review): the hypothesis texts keep the independence-style wording of
    # the original output; "no relation" here corresponds to "same distribution
    # of y_col in every group of x_col".
    h_0 = f"H0: no relation btween {x_col} & {y_col}"  # distributions are homogeneous
    h_1 = f"H1: relation btween {x_col} & {y_col}"  # distributions differ between groups
    contingency_table = pd.crosstab(df[x_col], df[y_col])
    # Only the p-value is used; the statistic, dof and expected counts are discarded.
    _, p_value, _, _ = stats.chi2_contingency(contingency_table)
    # Compare the raw p-value. Rounding to two decimals first would turn e.g.
    # 0.046 into 0.05 and wrongly fail to reject H0.
    if p_value < alpha:
        result = "Reject null hypothesis"
    else:
        result = "Fail to reject null hypothesis"
    return {"h0": h_0, "h1": h_1, "p_val": p_value, "cc": result}