In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 9]
plt.rcParams['figure.dpi'] = 100
plt.rcParams.update({'font.size': 18})

In [3]:
df = pd.read_csv('StudentsPerformance.csv')
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


Phát biểu giả thiết:

- __$H_0:$__ Điểm môn đọc không phụ thuộc vào giới tính.
- __$H_1:$__ Giới tính có ảnh hưởng đến điểm môn đọc.

In [8]:
bins = [0, 60, 75, 85, 100] # => Phân chia thang điểm

gen_read_df = df[['gender', 'reading score']].copy()
gen_read_df['ThangDiem'] = pd.cut(gen_read_df['reading score'], bins, labels=['D', 'C', 'B', 'A'])
gen_read_df

Unnamed: 0,gender,reading score,ThangDiem
0,female,72,C
1,female,90,A
2,female,95,A
3,male,57,D
4,male,78,B
...,...,...,...
995,female,99,A
996,male,55,D
997,female,71,C
998,female,78,B


In [6]:
def results(p, alpha=0.05):
    cols=['score', 'p_value', 'dof', 'KetLuan']
    if p['p_value'] < alpha:
        p['KetLuan'] = f"Chấp nhận H1 với mức ý nghĩa {alpha}"
    if p['p_value'] >= alpha:
        p['KetLuan'] = f"Chấp nhận H0 với mức ý nghĩa {alpha}"
    df = pd.DataFrame(p, index=[''])
    return df[cols]

In [10]:
count_df = pd.crosstab(index=gen_read_df['gender'], columns=gen_read_df['ThangDiem'])
count_df


ThangDiem,D,C,B,A
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,98,195,123,102
male,177,186,89,30


In [11]:
score, p_value, dof, expected = stats.chi2_contingency(count_df)

In [12]:
p ={}
p['score'] = score
p['p_value'] = p_value
p['dof'] = dof
results(p)

Unnamed: 0,score,p_value,dof,KetLuan
,66.422785,2.48869e-14,3,Chấp nhận H1 với mức ý nghĩa 0.05


### Nghiên cứu lunch có ảnh hưởng đến điểm môn Toán

Phát biểu giả thiết:

- $H_0$: Chế độ ăn không ảnh hưởng đến điểm môn Toán
- $H_1$: Chế độ ăn ảnh hưởng đến điểm môn Toán

In [14]:
bins = [0, 60, 75, 85, 100] # => Phân chia thang điểm

lun_math_df = df[['lunch', 'math score']].copy()
lun_math_df['ThangDiem'] = pd.cut(lun_math_df['math score'], bins, labels=['D', 'C', 'B', 'A'])
lun_math_df

Unnamed: 0,lunch,math score,ThangDiem
0,standard,72,C
1,standard,69,C
2,standard,90,A
3,free/reduced,47,D
4,standard,76,B
...,...,...,...
995,standard,88,A
996,free/reduced,62,C
997,free/reduced,59,D
998,standard,68,C


In [15]:
count_df = pd.crosstab(index=lun_math_df['lunch'], columns=lun_math_df['ThangDiem'])
count_df


ThangDiem,D,C,B,A
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
free/reduced,181,124,37,12
standard,157,263,134,91


In [16]:
score, p_value, dof, expected = stats.chi2_contingency(count_df)
p ={}
p['score'] = score
p['p_value'] = p_value
p['dof'] = dof
results(p)

Unnamed: 0,score,p_value,dof,KetLuan
,90.126341,2.0578819999999999e-19,3,Chấp nhận H1 với mức ý nghĩa 0.05
