In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu

np.set_printoptions(suppress=True)

# 分知识点看相关性

In [62]:
kp_data = pd.read_csv('/home/tiger/archived-data/aeolus-data/20210224/11/38586485-课程实验-标尺题-查询19.csv', encoding = 'gb18030')
kp_data.loc[(kp_data['mastery_degree'] >= 80), 'mastery_degree_gt80'] = 1
kp_data.loc[(kp_data['mastery_degree'] < 80), 'mastery_degree_gt80'] = 0

kp_data = kp_data.rename(columns = {"point2_id": "kp_id"})

kp_data.head()

Unnamed: 0,user_id,kp_id,point_name,correct_num,mastery_degree,mastery_degree_gt80
0,70016326174621,6811362936011882759,感官动词的用法,2,92.0,1.0
1,4001861800831800,6811363085840826632,询问是谁,1,60.0,0.0
2,3378112817023000,6811316311600136455,start,1,81.0,1.0
3,880021667334055,6811316269623542024,wash up,0,96.0,1.0
4,70016326174621,6838445552577839374,assistant,1,93.0,1.0


In [63]:
kp_data.groupby(['mastery_degree_gt80']).size()

mastery_degree_gt80
0.0     4748
1.0    24110
dtype: int64

In [64]:
kp_list = kp_data.kp_id.unique()
kp_summary = pd.DataFrame(columns = ["kp_id", "kp_name","md_correct2", "md_correct1", "md_wrong", "min_sample", "p_value", "mastery_degree_gt80_perc"])

In [65]:
len(kp_list)

116

## 答对2题 vs 答对1题答错1题 vs 答错2题

### Kruskal-Wallis H Test (ANOVA: for more than two independent samples)

The Kruskal-Wallis test is a nonparametric version of the one-way analysis of variance test or ANOVA for short. It is named for the developers of the method, William Kruskal and Wilson Wallis. This test can be used to determine whether more than two independent samples have a different distribution. It can be thought of as the generalization of the Mann-Whitney U test.

When the Kruskal-Wallis H-test leads to significant results, then at least one of the samples is different from the other samples. However, the test does not identify where the difference(s) occur. Moreover, it does not identify how many differences occur. To identify the particular differences between sample pairs, a researcher might use sample contrasts, or post hoc tests, to analyze the specific sample pairs for significant difference(s). The Mann-Whitney U-test is a useful method for performing sample contrasts between individual sample sets.

##### Fail to Reject H0: All sample distributions are equal.
##### Reject H0: One or more sample distributions are not equal.

In [67]:
for i in range(0,len(kp_list)):
    kp_id = kp_list[i]
    kp_data_s = kp_data[kp_data.kp_id == kp_id]
    kp_correct2 = kp_data_s[kp_data_s.correct_num == 2].mastery_degree
    kp_correct1 = kp_data_s[kp_data_s.correct_num == 1].mastery_degree
    kp_wrong = kp_data_s[kp_data_s.correct_num == 0].mastery_degree
    
    kp_name = kp_data_s.point_name.unique()
    correct2_user_cnt = len(kp_correct2)
    correct1_user_cnt = len(kp_correct1)
    wrong_user_cnt = len(kp_wrong)
 
    min_sample = min(len(kp_correct2), len(kp_correct1), len(kp_wrong))
    
    p_value = kruskal(kp_correct2, kp_correct1, kp_wrong).pvalue
    
    md_gt_80 = sum(kp_data_s.mastery_degree_gt80)/len(kp_data_s)
    kp_summary = kp_summary.append({'kp_id': kp_id, 'kp_name': kp_name, 'md_correct2': kp_correct2.mean(), 'md_correct1': kp_correct1.mean(), 'md_wrong': kp_wrong.mean(), 'min_sample':min_sample, 'p_value': p_value, 'mastery_degree_gt80_perc' : md_gt_80}, ignore_index = True)

kp_summary

Unnamed: 0,kp_id,kp_name,md_correct2,md_correct1,md_wrong,min_sample,p_value,mastery_degree_gt80_perc
0,6811362936011882759,[感官动词的用法],90.826667,89.311688,88.285714,21,0.046270,0.963710
1,6811363085840826632,[询问是谁],91.989583,91.116071,84.933333,45,0.000016,0.889328
2,6811316311600136455,[start],89.274611,87.040404,86.937500,16,0.045352,0.863636
3,6811316269623542024,[wash up],91.632911,92.328467,90.788732,71,0.195147,0.972125
4,6838445552577839374,[assistant],88.104167,88.240000,84.884615,26,0.070510,0.824121
...,...,...,...,...,...,...,...,...
111,6815459061853602061,[what引导的特殊疑问句-A1高],90.487179,88.447619,87.727273,11,0.444803,0.860825
112,6811316276800045320,[daughter],89.473684,85.138462,77.416667,12,0.000352,0.771242
113,6811316272375005454,[family],89.000000,86.645833,84.333333,6,0.405955,0.823899
114,6926826050169078029,[without表示没有],,88.142857,82.000000,0,,0.800000


In [72]:
kp_summary.loc[(kp_summary['min_sample'] >= 30), 'enough_sample_yn'] = 'Y'
kp_summary.loc[(kp_summary['min_sample'] < 30), 'enough_sample_yn'] = 'N'

kp_summary.loc[(kp_summary['p_value'] <= 0.05), 'sig_diff_yn'] = 'Y'
kp_summary.loc[(kp_summary['p_value'] > 0.05), 'sig_diff_yn'] = 'N'

kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] >= 0.9), 'mastery_degree_gt80_perc_flag'] = 'High_gt_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] < 0.9) & (kp_summary['mastery_degree_gt80_perc'] > 0.7), 'mastery_degree_gt80_perc_flag'] = 'Mid_70p_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] <= 0.7), 'mastery_degree_gt80_perc_flag'] = 'Low_lt_70p'

In [73]:
kp_summary.groupby(['sig_diff_yn']).size()

sig_diff_yn
N    64
Y    51
dtype: int64

In [22]:
51/(51+64)

0.4434782608695652

In [39]:
19/(19+22)

0.4634146341463415

In [74]:
kp_summary.groupby(['mastery_degree_gt80_perc_flag', 'sig_diff_yn']).size()

mastery_degree_gt80_perc_flag  sig_diff_yn
High_gt_90p                    N              28
                               Y               9
Low_lt_70p                     N               7
                               Y               8
Mid_70p_90p                    N              29
                               Y              34
dtype: int64

In [27]:
kp_summary['kp_id2'] = kp_summary['kp_id'].apply(str) + '\t'

In [28]:
kp_summary.sort_values(by=['mastery_degree_gt80_perc'])

Unnamed: 0,kp_id,kp_name,md_correct2,md_correct1,md_wrong,min_sample,p_value,mastery_degree_gt80_perc,enough_sample_yn,sig_diff_yn,mastery_degree_gt80_perc_flag,kp_id2
36,6811363090156781832,[姓氏名字],84.567797,75.252381,67.992424,66,3.441933e-19,0.475309,Y,Y,,6811363090156781832\t
34,6838445552577855758,[instruction],82.550000,81.824561,77.571429,20,8.172501e-02,0.500000,N,N,,6838445552577855758\t
88,6838448264082161933,[weird],83.875000,82.407407,80.335878,36,1.160316e-01,0.574468,Y,N,,6838448264082161933\t
69,6838444416491897096,[... is a big hit.],84.332278,78.508065,82.250000,8,6.387055e-03,0.578947,N,Y,,6838444416491897096\t
93,6811363085840843016,[称谓],83.007353,80.325000,76.751244,67,2.272149e-02,0.601190,Y,Y,,6811363085840843016\t
...,...,...,...,...,...,...,...,...,...,...,...,...
3,6811316269623542024,[wash up],91.632911,92.328467,90.788732,71,1.951472e-01,0.972125,Y,N,High_gt_90p,6811316269623542024\t
61,6838448369749246221,[感官动词表示好的感受],90.511111,89.225806,86.727273,11,8.114489e-03,0.975962,N,Y,High_gt_90p,6838448369749246221\t
23,6811316278272213262,[garden],90.533333,89.819048,90.142857,7,3.956875e-03,0.982072,N,Y,High_gt_90p,6811316278272213262\t
94,6815459061853585677,[who引导的特殊疑问句-A1高],93.100000,93.178571,88.888889,18,6.081217e-01,0.983516,N,N,High_gt_90p,6815459061853585677\t


In [29]:
kp_summary.to_csv(r'./mastery_dist.csv', encoding='utf_8_sig')

## 至少答对1题 vs 答错2题

### The Mann-Whitney U test (for two independent samples)

The Mann-Whitney U test is a nonparametric statistical significance test for determining whether two independent samples were drawn from a population with the same distribution.
More specifically, the test determines whether it is equally likely that any randomly selected observation from one sample will be greater or less than a sample in the other distribution. If violated, it suggests differing distributions.

##### Fail to Reject H0: Sample distributions are equal.
##### Reject H0: Sample distributions are not equal.

In [75]:
kp_summary = pd.DataFrame(columns = ["kp_id", "kp_name","md_correct", "md_wrong", "min_sample", "p_value", "mastery_degree_gt80_perc"])

In [76]:
for i in range(0,len(kp_list)):
    kp_id = kp_list[i]
    kp_data_s = kp_data[kp_data.kp_id == kp_id]
    kp_correct = kp_data_s[kp_data_s.correct_num == 2].mastery_degree
    kp_wrong = kp_data_s[kp_data_s.correct_num <= 1].mastery_degree
    
    kp_name = kp_data_s.point_name.unique()
    correct_user_cnt = len(kp_correct)
    wrong_user_cnt = len(kp_wrong)
 
    min_sample = min(len(kp_correct), len(kp_wrong))
    
    p_value = mannwhitneyu(kp_correct, kp_wrong).pvalue
    
    md_gt_80 = sum(kp_data_s.mastery_degree_gt80)/len(kp_data_s)
    kp_summary = kp_summary.append({'kp_id': kp_id, 'kp_name': kp_name, 'md_correct': kp_correct.mean(), 'md_wrong': kp_wrong.mean(), 'min_sample':min_sample, 'p_value': p_value, 'mastery_degree_gt80_perc' : md_gt_80}, ignore_index = True)

kp_summary

  z = (bigu - meanrank) / sd


Unnamed: 0,kp_id,kp_name,md_correct,md_wrong,min_sample,p_value,mastery_degree_gt80_perc
0,6811362936011882759,[感官动词的用法],90.826667,89.091837,98,0.006700,0.963710
1,6811363085840826632,[询问是谁],91.989583,89.343949,96,0.000721,0.889328
2,6811316311600136455,[start],89.274611,87.026087,115,0.009957,0.863636
3,6811316269623542024,[wash up],91.632911,91.802885,79,0.354177,0.972125
4,6838445552577839374,[assistant],88.104167,87.662252,48,0.223077,0.824121
...,...,...,...,...,...,...,...
111,6815459061853602061,[what引导的特殊疑问句-A1高],90.487179,88.379310,78,0.136342,0.860825
112,6811316276800045320,[daughter],89.473684,83.935065,76,0.001259,0.771242
113,6811316272375005454,[family],89.000000,86.388889,54,0.137554,0.823899
114,6926826050169078029,[without表示没有],,86.300000,0,0.000000,0.800000


In [77]:
kp_summary.loc[(kp_summary['min_sample'] >= 30), 'enough_sample_yn'] = 'Y'
kp_summary.loc[(kp_summary['min_sample'] < 30), 'enough_sample_yn'] = 'N'

kp_summary.loc[(kp_summary['p_value'] <= 0.05), 'sig_diff_yn'] = 'Y'
kp_summary.loc[(kp_summary['p_value'] > 0.05), 'sig_diff_yn'] = 'N'

kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] >= 0.9), 'mastery_degree_gt80_perc_flag'] = 'High_gt_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] < 0.9) & (kp_summary['mastery_degree_gt80_perc'] > 0.7), 'mastery_degree_gt80_perc_flag'] = 'Mid_70p_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] <= 0.7), 'mastery_degree_gt80_perc_flag'] = 'Low_lt_70p'

kp_summary.groupby(['sig_diff_yn']).size()

sig_diff_yn
N    52
Y    64
dtype: int64

In [45]:
kp_summary.groupby(['enough_sample_yn', 'sig_diff_yn']).size()

enough_sample_yn  sig_diff_yn
N                 N               3
                  Y               1
Y                 N              51
                  Y              60
dtype: int64

In [78]:
kp_summary.groupby(['mastery_degree_gt80_perc_flag', 'sig_diff_yn']).size()

mastery_degree_gt80_perc_flag  sig_diff_yn
High_gt_90p                    N              20
                               Y              17
Low_lt_70p                     N               7
                               Y               8
Mid_70p_90p                    N              25
                               Y              39
dtype: int64

In [60]:
64/116

0.5517241379310345

In [54]:
39/(39+24)

0.6190476190476191

In [49]:
kp_summary['kp_id2'] = kp_summary['kp_id'].apply(str) + '\t'
kp_summary.sort_values(by=['mastery_degree_gt80_perc'])
kp_summary.to_csv(r'./mastery_dist.csv', encoding='utf_8_sig')

## 至少答对1题 vs 答错2题

In [80]:
kp_summary = pd.DataFrame(columns = ["kp_id", "kp_name","md_correct", "md_wrong", "min_sample", "p_value", "mastery_degree_gt80_perc"])

for i in range(0,len(kp_list)):
    kp_id = kp_list[i]
    kp_data_s = kp_data[kp_data.kp_id == kp_id]
    kp_correct = kp_data_s[kp_data_s.correct_num >= 1].mastery_degree
    kp_wrong = kp_data_s[kp_data_s.correct_num == 0].mastery_degree
    
    kp_name = kp_data_s.point_name.unique()
    correct_user_cnt = len(kp_correct)
    wrong_user_cnt = len(kp_wrong)
 
    min_sample = min(len(kp_correct), len(kp_wrong))
    
    p_value = mannwhitneyu(kp_correct, kp_wrong).pvalue
    
    md_gt_80 = sum(kp_data_s.mastery_degree_gt80)/len(kp_data_s)
    kp_summary = kp_summary.append({'kp_id': kp_id, 'kp_name': kp_name, 'md_correct': kp_correct.mean(), 'md_wrong': kp_wrong.mean(), 'min_sample':min_sample, 'p_value': p_value, 'mastery_degree_gt80_perc' : md_gt_80}, ignore_index = True)

kp_summary

Unnamed: 0,kp_id,kp_name,md_correct,md_wrong,min_sample,p_value,mastery_degree_gt80_perc
0,6811362936011882759,[感官动词的用法],90.312775,88.285714,21,0.139972,0.963710
1,6811363085840826632,[询问是谁],91.519231,84.933333,45,0.000006,0.889328
2,6811316311600136455,[start],88.517123,86.937500,16,0.062065,0.863636
3,6811316269623542024,[wash up],92.074074,90.788732,71,0.064118,0.972125
4,6838445552577839374,[assistant],88.202312,84.884615,26,0.011147,0.824121
...,...,...,...,...,...,...,...
111,6815459061853602061,[what引导的特殊疑问句-A1高],89.316940,87.727273,11,0.198099,0.860825
112,6811316276800045320,[daughter],87.475177,77.416667,12,0.000386,0.771242
113,6811316272375005454,[family],88.261438,84.333333,6,0.147617,0.823899
114,6926826050169078029,[without表示没有],88.142857,82.000000,3,0.244404,0.800000


In [81]:
kp_summary.loc[(kp_summary['min_sample'] >= 30), 'enough_sample_yn'] = 'Y'
kp_summary.loc[(kp_summary['min_sample'] < 30), 'enough_sample_yn'] = 'N'

kp_summary.loc[(kp_summary['p_value'] <= 0.05), 'sig_diff_yn'] = 'Y'
kp_summary.loc[(kp_summary['p_value'] > 0.05), 'sig_diff_yn'] = 'N'

kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] >= 0.9), 'mastery_degree_gt80_perc_flag'] = 'High_gt_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] < 0.9) & (kp_summary['mastery_degree_gt80_perc'] > 0.7), 'mastery_degree_gt80_perc_flag'] = 'Mid_70p_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] <= 0.7), 'mastery_degree_gt80_perc_flag'] = 'Low_lt_70p'

kp_summary.groupby(['sig_diff_yn']).size()

sig_diff_yn
N    64
Y    52
dtype: int64

In [82]:
kp_summary.groupby(['mastery_degree_gt80_perc_flag', 'sig_diff_yn']).size()

mastery_degree_gt80_perc_flag  sig_diff_yn
High_gt_90p                    N              29
                               Y               8
Low_lt_70p                     N               6
                               Y               9
Mid_70p_90p                    N              29
                               Y              35
dtype: int64

## 答对2题 vs 仅答对1听力 vs 仅答对1口语 vs 答错2题

In [89]:
kp_data = pd.read_csv('/home/tiger/archived-data/aeolus-data/20210225/11/38699050-课程实验-标尺题-加题目类型.csv', encoding = 'gb18030')
kp_data.loc[(kp_data['mastery_degree'] >= 80), 'mastery_degree_gt80'] = 1
kp_data.loc[(kp_data['mastery_degree'] < 80), 'mastery_degree_gt80'] = 0

kp_data = kp_data.rename(columns = {"point2_id": "kp_id"})

kp_data = kp_data.fillna(0)

kp_data.head()

Unnamed: 0,user_id,kp_id,point_name,listing_correct_num,oral_correct_num,mastery_degree,mastery_degree_gt80
0,4204188787545512,6811316258944893192,sink,1.0,1.0,90.0,1.0
1,264298928275959,6811316298425860365,close,0.0,1.0,96.0,1.0
2,4283335322378600,6811362996351156493,形容词性物主代词,0.0,0.0,94.0,1.0
3,1468536862543371,6811316268851790094,put away,1.0,1.0,88.0,1.0
4,4152174745490791,6811316255199346952,garage,1.0,1.0,86.0,1.0


In [90]:
kp_list = kp_data.kp_id.unique()
kp_summary = pd.DataFrame(columns = ["kp_id", "kp_name","md_correct2", "md_correct_l", "md_correct_o", "md_wrong", "min_sample", "p_value", "mastery_degree_gt80_perc"])

In [98]:
for i in range(0,len(kp_list)):
    kp_id = kp_list[i]
    kp_data_s = kp_data[kp_data.kp_id == kp_id]
    kp_correct2 = kp_data_s[(kp_data_s.listing_correct_num == 1) & (kp_data_s.oral_correct_num == 1)].mastery_degree
    kp_correct_l = kp_data_s[(kp_data_s.listing_correct_num == 1) & (kp_data_s.oral_correct_num == 0)].mastery_degree
    kp_correct_o = kp_data_s[(kp_data_s.listing_correct_num == 0) & (kp_data_s.oral_correct_num == 1)].mastery_degree
    kp_wrong = kp_data_s[(kp_data_s.listing_correct_num == 0) & (kp_data_s.oral_correct_num == 0)].mastery_degree
    
    kp_name = kp_data_s.point_name.unique()
    correct2_user_cnt = len(kp_correct2)
    correct_l_user_cnt = len(kp_correct_l)
    correct_o_user_cnt = len(kp_correct_o)
    wrong_user_cnt = len(kp_wrong)
 
    min_sample = min(len(kp_correct2), len(kp_correct_l), len(kp_correct_o), len(kp_wrong))
    
    p_value = kruskal(kp_correct2, kp_correct_l, kp_correct_o, kp_wrong).pvalue
    
    md_gt_80 = sum(kp_data_s.mastery_degree_gt80)/len(kp_data_s)
    kp_summary = kp_summary.append({'kp_id': kp_id, 'kp_name': kp_name, 'md_correct2': kp_correct2.mean(), 'md_correct_l': kp_correct_l.mean(), 'md_correct_o': kp_correct_o.mean(), 'md_wrong': kp_wrong.mean(), 'min_sample':min_sample, 'p_value': p_value, 'mastery_degree_gt80_perc' : md_gt_80}, ignore_index = True)

kp_summary

Unnamed: 0,kp_id,kp_name,md_correct2,md_correct_l,md_correct_o,md_wrong,min_sample,p_value,mastery_degree_gt80_perc
0,6811316258944893192,[sink],92.201794,90.688889,92.269231,89.750000,32,1.927241e-01,0.943069
1,6811316298425860365,[close],92.786765,90.105263,91.720238,87.862069,19,1.997586e-01,0.900568
2,6811362996351156493,[形容词性物主代词],83.000000,73.214286,84.643275,79.438596,28,2.176173e-08,0.650970
3,6811316268851790094,[put away],86.980952,83.434783,86.848101,85.097561,46,2.107008e-02,0.769231
4,6811316255199346952,[garage],84.986014,83.500000,84.000000,83.625000,40,5.712101e-01,0.672289
...,...,...,...,...,...,...,...,...,...
111,6867340182308307208,[这是/那是……],87.383333,83.578947,86.444444,78.916667,12,8.717499e-03,0.771186
112,6926826050169078029,[without表示没有],,82.000000,90.600000,82.000000,0,,0.800000
113,6811316329224634638,[interested],83.444444,84.300000,85.205882,78.666667,9,4.837786e-01,0.672222
114,6867341843286212878,[是/不是我的类型],87.096386,83.000000,83.037037,89.600000,5,1.293156e-01,0.755396


In [99]:
kp_summary.loc[(kp_summary['min_sample'] >= 30), 'enough_sample_yn'] = 'Y'
kp_summary.loc[(kp_summary['min_sample'] < 30), 'enough_sample_yn'] = 'N'

kp_summary.loc[(kp_summary['p_value'] <= 0.05), 'sig_diff_yn'] = 'Y'
kp_summary.loc[(kp_summary['p_value'] > 0.05), 'sig_diff_yn'] = 'N'

kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] >= 0.9), 'mastery_degree_gt80_perc_flag'] = 'High_gt_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] < 0.9) & (kp_summary['mastery_degree_gt80_perc'] > 0.7), 'mastery_degree_gt80_perc_flag'] = 'Mid_70p_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] <= 0.7), 'mastery_degree_gt80_perc_flag'] = 'Low_lt_70p'

kp_summary.groupby(['sig_diff_yn']).size()

sig_diff_yn
N    61
Y    53
dtype: int64

In [100]:
kp_summary.groupby(['mastery_degree_gt80_perc_flag', 'sig_diff_yn']).size()

mastery_degree_gt80_perc_flag  sig_diff_yn
High_gt_90p                    N              26
                               Y              11
Low_lt_70p                     N               7
                               Y               8
Mid_70p_90p                    N              28
                               Y              34
dtype: int64

## 仅答对1听力 vs 仅答对1口语

In [102]:
kp_summary = pd.DataFrame(columns = ["kp_id", "kp_name","md_correct_l", "md_correct_o", "min_sample", "p_value", "mastery_degree_gt80_perc"])

for i in range(0,len(kp_list)):
    kp_id = kp_list[i]
    kp_data_s = kp_data[kp_data.kp_id == kp_id]
    kp_correct_l = kp_data_s[(kp_data_s.listing_correct_num == 1) & (kp_data_s.oral_correct_num == 0)].mastery_degree
    kp_correct_o = kp_data_s[(kp_data_s.listing_correct_num == 0) & (kp_data_s.oral_correct_num == 1)].mastery_degree
    
    kp_name = kp_data_s.point_name.unique()
    correct_l_user_cnt = len(kp_correct_l)
    correct_o_user_cnt = len(kp_correct_o)
 
    min_sample = min(len(kp_correct_l), len(kp_correct_o))
    
    p_value = mannwhitneyu(kp_correct_l, kp_correct_o).pvalue
    
    md_gt_80 = sum(kp_data_s.mastery_degree_gt80)/len(kp_data_s)
    kp_summary = kp_summary.append({'kp_id': kp_id, 'kp_name': kp_name, 'md_correct_l': kp_correct_l.mean(), 'md_correct_o': kp_correct_o.mean(), 'min_sample':min_sample, 'p_value': p_value, 'mastery_degree_gt80_perc' : md_gt_80}, ignore_index = True)

kp_summary

  z = (bigu - meanrank) / sd


Unnamed: 0,kp_id,kp_name,md_correct_l,md_correct_o,min_sample,p_value,mastery_degree_gt80_perc
0,6811316258944893192,[sink],90.688889,92.269231,45,5.209460e-02,0.943069
1,6811316298425860365,[close],90.105263,91.720238,19,2.047753e-01,0.900568
2,6811362996351156493,[形容词性物主代词],73.214286,84.643275,28,8.523397e-08,0.650970
3,6811316268851790094,[put away],83.434783,86.848101,46,3.418999e-03,0.769231
4,6811316255199346952,[garage],83.500000,84.000000,40,2.064850e-01,0.672289
...,...,...,...,...,...,...,...
111,6867340182308307208,[这是/那是……],83.578947,86.444444,19,1.597558e-01,0.771186
112,6926826050169078029,[without表示没有],82.000000,90.600000,2,8.762212e-02,0.800000
113,6811316329224634638,[interested],84.300000,85.205882,20,3.563373e-01,0.672222
114,6867341843286212878,[是/不是我的类型],83.000000,83.037037,24,3.114261e-01,0.755396


In [103]:
kp_summary.loc[(kp_summary['min_sample'] >= 30), 'enough_sample_yn'] = 'Y'
kp_summary.loc[(kp_summary['min_sample'] < 30), 'enough_sample_yn'] = 'N'

kp_summary.loc[(kp_summary['p_value'] <= 0.05), 'sig_diff_yn'] = 'Y'
kp_summary.loc[(kp_summary['p_value'] > 0.05), 'sig_diff_yn'] = 'N'

kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] >= 0.9), 'mastery_degree_gt80_perc_flag'] = 'High_gt_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] < 0.9) & (kp_summary['mastery_degree_gt80_perc'] > 0.7), 'mastery_degree_gt80_perc_flag'] = 'Mid_70p_90p'
kp_summary.loc[(kp_summary['mastery_degree_gt80_perc'] <= 0.7), 'mastery_degree_gt80_perc_flag'] = 'Low_lt_70p'

kp_summary.groupby(['sig_diff_yn']).size()

sig_diff_yn
N    95
Y    21
dtype: int64

In [104]:
kp_summary.groupby(['mastery_degree_gt80_perc_flag', 'sig_diff_yn']).size()

mastery_degree_gt80_perc_flag  sig_diff_yn
High_gt_90p                    N              32
                               Y               5
Low_lt_70p                     N              10
                               Y               5
Mid_70p_90p                    N              53
                               Y              11
dtype: int64

In [105]:
3682/28858

0.1275902695959526