FUNCTION FOR CHI-SQUARE TEST

In [35]:
import pandas as pd
import scipy.stats as stats
df= pd.read_csv("C:/Users/al6436/Desktop/Vrinda Store Data/VrindaStore.csv")
def chi_square_test(df, column1, column2, alpha=0.05):
    contingency_table = pd.crosstab(df[column1], df[column2])
    chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    
    critical_value = stats.chi2.ppf(1 - alpha, dof)

    print(f"Chi-Square Statistic: {chi2_stat}")
    print(f"p-value: {p_value}")
    print(f"Degrees of Freedom: {dof}")
    print(f"Critical Value: {critical_value}")
    
    if chi2_stat > critical_value:
        return f"Reject the null hypothesis: The variables {column1} and {column2} are dependent (related)."
    else:
        return f"Fail to reject the null hypothesis: The variables {column1} and {column2} are independent."


In [36]:
result = chi_square_test(df, 'Gender', 'Category')
print(result)

Chi-Square Statistic: 12417.426988774592
p-value: 0.0
Degrees of Freedom: 7
Critical Value: 14.067140449340169
Reject the null hypothesis: The variables Gender and Category are dependent (related).


In [37]:
result = chi_square_test(df,'Size','Category')
print(result)

Chi-Square Statistic: 31586.462861369724
p-value: 0.0
Degrees of Freedom: 70
Critical Value: 90.53122543488065
Reject the null hypothesis: The variables Size and Category are dependent (related).



FUNCTION FOR Z-TEST


In [38]:
import numpy as np
def z_test_two_sample(sample1, sample2, std1, std2, alpha=0.05):
    mean1 = np.mean(sample1)
    mean2 = np.mean(sample2)
    n1 = len(sample1)
    n2 = len(sample2)
    
    z_score = (mean1 - mean2) / np.sqrt((std1 ** 2 / n1) + (std2 ** 2 / n2))
    
    p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))
    critical_value = stats.norm.ppf(1 - alpha / 2)
    
    print(f"Z-score: {z_score:.2f}")
    print(f"p-value: {p_value:.2f}")
    print(f"Critical Value: {critical_value:.2f}")
    
    if abs(z_score) > critical_value:
        return f"Reject the null hypothesis: The means of the two samples are significantly different."
    else:
        return f"Fail to reject the null hypothesis: The means of the two samples are not significantly different."


In [39]:
men_data = df[df['Gender'] == 'Men']['Amount']
women_data = df[df['Gender'] == 'Women']['Amount']
std_men = 232.0668908
std_women = 266.5992774
result = z_test_two_sample(men_data, women_data, std_men, std_women)
print(result)

Z-score: 57.65
p-value: 0.00
Critical Value: 1.96
Reject the null hypothesis: The means of the two samples are significantly different.


In [69]:
B2B = df[df['B2B'] == True]['Amount']
B2C = df[df['B2B'] == False]['Amount']
std_B2B = 271.23
std_B2C = 268.57
result = z_test_two_sample(B2B, B2C, std_B2C, std_B2B)
print(result)

Z-score: -0.15
p-value: 0.88
Critical Value: 1.96
Fail to reject the null hypothesis: The means of the two samples are not significantly different.



FUNCTION FOR ONE WAY ANOVA


In [51]:
import pandas as pd
import scipy.stats as stats

def one_way_anova(df, group_column, value_column, alpha=0.05):
    
        groups = [df[df[group_column] == group][value_column].values for group in df[group_column].unique()]
        f_statistic, p_value = stats.f_oneway(*groups)
        print(f"F-statistic: {f_statistic:.2f}")
        print(f"p-value: {p_value:.2f}")
        k = len(df[group_column].unique()) 
        N = len(df) 
        df_between = k - 1 
        df_within = N - k   
        f_critical = stats.f.ppf(1 - alpha, df_between, df_within)
        print(f"Critical F-value: {f_critical:.2f}")

        if p_value < alpha:
            return f"Reject the null hypothesis: The average {value_column} differs across the {group_column} groups."
        else:
            return f"Fail to reject the null hypothesis: The average {value_column} does not differ significantly across the {group_column} groups."

In [52]:
result = one_way_anova(df, 'Status', 'Amount')
print(result)

F-statistic: 123.67
p-value: 0.00
Critical F-value: 2.61
Reject the null hypothesis: The average Amount differs across the Status groups.


In [53]:
result = one_way_anova(df, 'Group', 'Amount')
print(result)

F-statistic: 0.24
p-value: 0.78
Critical F-value: 3.00
Fail to reject the null hypothesis: The average Amount does not differ significantly across the Group groups.



FUNCTION FOR T TEST 


In [71]:
import numpy as np
from scipy import stats

def t_test_two_sample(sample1, sample2, alpha=0.05):
    mean1 = np.mean(sample1)
    mean2 = np.mean(sample2)
    n1 = len(sample1)
    n2 = len(sample2)
    var1 = np.var(sample1, ddof=1)
    var2 = np.var(sample2, ddof=1)
    
    t_score = (mean1 - mean2) / np.sqrt((var1 / n1) + (var2 / n2))
    df = ((var1 / n1) + (var2 / n2))**2 / (((var1 / n1)**2 / (n1 - 1)) + ((var2 / n2)**2 / (n2 - 1)))
    
    p_value = 2 * (1 - stats.t.cdf(abs(t_score), df))
    critical_value = stats.t.ppf(1 - alpha / 2, df)
    
    print(f"T-score: {t_score:.2f}")
    print(f"p-value: {p_value:.2f}")
    print(f"Critical Value: {critical_value:.2f}")
    
    if abs(t_score) > critical_value:
        return f"Reject the null hypothesis: The means of the two samples are significantly different."
    else:
        return f"Fail to reject the null hypothesis: The means of the two samples are not significantly different."

men_data = df[df['Gender'] == 'Men']['Amount']
women_data = df[df['Gender'] == 'Women']['Amount']
result = t_test_two_sample(men_data, women_data)
print(result)


T-score: 57.65
p-value: 0.00
Critical Value: 1.96
Reject the null hypothesis: The means of the two samples are significantly different.


In [76]:
stats.ttest_ind(men_data, women_data) 

TtestResult(statistic=54.641624219644086, pvalue=0.0, df=31045.0)