In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.stats.proportion as smprop

In [2]:
# Load the demographics CSV; later cells read its 'income', 'gender' and 'educ' columns.
data = pd.read_csv(filepath_or_buffer="demographics.csv")

## One-Sample T-test

In [3]:
# One-sample t-test of H0: mean income == 70 (two-sided, scipy's default).
# Missing incomes are dropped explicitly: ttest_1samp propagates NaN by default,
# which would turn both the statistic and the p-value into NaN.
t_stat, p_value = stats.ttest_1samp(data['income'].dropna(), popmean=70)

In [4]:
# Summarise income on the non-missing observations only, so that the mean, the
# degrees of freedom and the standard error all describe the same sample.
# (Series.mean() skips NaN, but len() counts it and stats.sem() propagates it.)
income_observed = data['income'].dropna()
mean_income = income_observed.mean()

# 95% t-based confidence interval for the mean, centred on the sample mean and
# scaled by the standard error, with n - 1 degrees of freedom.
ci = stats.t.interval(
    confidence=0.95,
    df=len(income_observed) - 1,
    loc=mean_income,
    scale=stats.sem(income_observed),
)

In [5]:
# Report the one-sample t-test summary, one item per output line.
print(
    "One-Sample T-test Results:",
    f"Mean income: {mean_income:.2f}",
    f"t-statistic: {t_stat:.4f}",
    f"p-value: {p_value:.4f}",
    f"95% Confidence Interval: ({ci[0]:.2f}, {ci[1]:.2f})",
    sep="\n",
)

One-Sample T-test Results:
Mean income: 78.59
t-statistic: 1.7282
p-value: 0.0846
95% Confidence Interval: (68.82, 88.36)


## "Binomial" Test

In [6]:
# Goal: check whether the male/female split is 50/50.

# Frequency of each gender label.
gender_counts = data['gender'].value_counts()
# Number of rows in the gender column (also reused by later cells as the sample size).
total_count = data['gender'].shape[0]
# Count of 'Male' rows; falls back to 0 when that label never occurs.
male_count = gender_counts.get('Male', 0)
male_proportion = male_count / total_count

In [7]:
# Show the gender frequency table followed by the share of males.
print(
    "\nGender Distribution:",
    gender_counts,
    f"Proportion of males: {male_proportion:.3f}",
    sep="\n",
)


Gender Distribution:
gender
Male      260
Female    250
Name: count, dtype: int64
Proportion of males: 0.510


In [8]:
# Binomial test of H0: proportion of males == 0.5.
binom_result = smprop.binom_test(count=male_count, nobs=total_count, prop=0.5)
# 95% Wilson score interval (alpha=0.05) for the observed male proportion.
ci_binom = smprop.proportion_confint(
    count=male_count,
    nobs=total_count,
    alpha=0.05,
    method='wilson',
)

In [9]:
# Report the p=0.5 binomial test and the interval for the male proportion.
print(
    "\nBinomial Test (p=0.5):",
    f"p-value: {binom_result:.4f}",
    f"95% Confidence Interval for male proportion: ({ci_binom[0]:.3f}, {ci_binom[1]:.3f})",
    sep="\n",
)


Binomial Test (p=0.5):
p-value: 0.6903
95% Confidence Interval for male proportion: (0.467, 0.553)


In [10]:
# Check whether the proportion of male subjects is 40%.

binom_result_40 = smprop.binom_test(count=male_count, nobs=total_count, prop=0.4)
# The interval call takes only the observed counts -- the null proportion (0.4) is
# not an argument -- so it is computed from the same inputs as the p=0.5 case.
ci_binom_40 = smprop.proportion_confint(
    count=male_count,
    nobs=total_count,
    alpha=0.05,
    method='wilson',
)

In [11]:
# Report the p=0.4 binomial test; six decimals keep a very small p-value visible.
print(
    "\nBinomial Test (p=0.4):",
    f"p-value: {binom_result_40:.6f}",
    f"95% Confidence Interval for male proportion: ({ci_binom_40[0]:.3f}, {ci_binom_40[1]:.3f})",
    sep="\n",
)


Binomial Test (p=0.4):
p-value: 0.000001
95% Confidence Interval for male proportion: (0.467, 0.553)


## "Chi-Square" Test

In [12]:
# Goal: test whether education level (educ) is spread evenly across its categories.

# Observed frequency of each education level.
educ_counts = data['educ'].value_counts()
n_categories = len(educ_counts)
# Under the equal-probability hypothesis every level has the same share.
expected_prop = 1 / n_categories

# Scale by the number of observed educ values rather than total_count (the row
# count taken from the gender column): value_counts() leaves out missing values,
# and stats.chisquare rejects observed/expected totals that do not agree.
educ_total = educ_counts.sum()
expected_counts = np.full(n_categories, educ_total * expected_prop)

In [13]:
# Show the observed frequency table for education level.
print("\nEducation Level Distribution:", educ_counts, sep="\n")


Education Level Distribution:
educ
High school degree              132
Did not complete high school    125
Some college                    113
College degree                  113
Post-undergraduate degree        27
Name: count, dtype: int64


In [14]:
# Chi-square goodness-of-fit test: observed educ counts vs. the equal-share expectation.
chi2_stat, chi2_p = stats.chisquare(f_obs=educ_counts, f_exp=expected_counts)

In [15]:
# Report the equal-probability chi-square statistic and its p-value.
print(
    "\nChi-Square Test (Equal Probabilities):",
    f"Chi-square statistic: {chi2_stat:.3f}",
    f"p-value: {chi2_p:.4f}",
    sep="\n",
)


Chi-Square Test (Equal Probabilities):
Chi-square statistic: 71.529
p-value: 0.0000


In [16]:
# Expected values under the theoretical (equal-probability) hypothesis: with
# 510 people and 5 education levels, 510 / 5 = 102 people are expected per level.
expected_values = expected_counts

# Pearson residuals: (observed - expected) / sqrt(expected).
#   positive residual -> more observations than expected
#   negative residual -> fewer observations than expected
residuals = educ_counts.sub(expected_values).div(np.sqrt(expected_values))

# Standardized residuals (z-score scale): Pearson residual / sqrt(1 - expected proportion).
#   |z| > 2 -> statistically significant difference
#   |z| > 3 -> highly significant difference
std_residuals = residuals.div(np.sqrt(1 - expected_prop))

In [17]:
# Show the expected counts, then the Pearson and the standardized residuals.
print(
    "\nExpected Values:",
    expected_values,
    "\nResiduals:",
    residuals,
    "\nStandardized Residuals:",
    std_residuals,
    sep="\n",
)


Expected Values:
[102. 102. 102. 102. 102.]

Residuals:
educ
High school degree              2.970443
Did not complete high school    2.277339
Some college                    1.089162
College degree                  1.089162
Post-undergraduate degree      -7.426107
Name: count, dtype: float64

Standardized Residuals:
educ
High school degree              3.321056
Did not complete high school    2.546143
Some college                    1.217720
College degree                  1.217720
Post-undergraduate degree      -8.302640
Name: count, dtype: float64


In [18]:
# Chi-square test against unequal (custom) theoretical probabilities.

# Hypothesised probability per education level, keyed by label so each value stays
# attached to its category. value_counts() orders rows by frequency (and two levels
# are tied at 113), so a bare positional list could silently end up paired with
# different categories if the data changed. The pairing below is the one the
# positional list [0.30, 0.30, 0.20, 0.10, 0.10] had against the current row order.
custom_prob_by_level = pd.Series({
    'High school degree': 0.30,
    'Did not complete high school': 0.30,
    'Some college': 0.20,
    'College degree': 0.10,
    'Post-undergraduate degree': 0.10,
})

# Fail loudly if the observed labels and the hypothesis do not cover the same levels.
unmatched_levels = set(educ_counts.index) ^ set(custom_prob_by_level.index)
if unmatched_levels:
    raise ValueError(
        f"educ levels and custom probabilities do not match: {sorted(unmatched_levels)}"
    )

# The same probabilities, arranged in educ_counts' row order.
custom_probs = custom_prob_by_level.reindex(educ_counts.index).tolist()
# Scale by the number of observed educ values so expected and observed totals agree.
expected_custom = np.array(custom_probs) * educ_counts.sum()
expected_custom

array([153., 153., 102.,  51.,  51.])

In [19]:
# Chi-square goodness-of-fit test: observed educ counts vs. the custom expected counts.
chi2_stat_custom, chi2_p_custom = stats.chisquare(f_obs=educ_counts, f_exp=expected_custom)

In [20]:
# Report the custom-probability chi-square result, then the expected counts it used.
print(
    "\nChi-Square Test (Custom Probabilities [0.3, 0.3, 0.2, 0.1, 0.1]):",
    f"Chi-square statistic: {chi2_stat_custom:.3f}",
    f"p-value: {chi2_p_custom:.4f}",
    "\nExpected Values (Custom Probabilities):",
    expected_custom,
    sep="\n",
)


Chi-Square Test (Custom Probabilities [0.3, 0.3, 0.2, 0.1, 0.1]):
Chi-square statistic: 95.859
p-value: 0.0000

Expected Values (Custom Probabilities):
[153. 153. 102.  51.  51.]
