In [1]:
import re

import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, ttest_ind
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Show floats with exactly three decimals wherever pandas renders them.
pd.set_option("display.float_format", "{:.3f}".format)

In [2]:
def std_classification(df):
    """Binarise scores around their mean using the sign of the z-score.

    Parameters
    ----------
    df : array-like
        One-dimensional numeric scores (pandas Series, NumPy array or list).
        In this notebook it receives one column at a time via
        ``DataFrame.apply``.

    Returns
    -------
    list of int
        One label per input value, in input order: ``0`` when the z-score is
        ``<= 0`` (value at or below the mean), ``1`` otherwise.

    Notes
    -----
    A NaN z-score fails the ``<= 0`` comparison and is therefore labelled
    ``1``.
    """
    # np.asarray instead of `.values`: zscore is not guaranteed to hand back a
    # pandas object (it cannot for ndarray/list input, and newer SciPy releases
    # may return a plain ndarray even for a Series), so normalise the result
    # before iterating.
    z_scores = np.asarray(zscore(df))
    return [0 if z <= 0.0 else 1 for z in z_scores]

In [3]:
# Participants to analyse: every uid listed in the overall feature table.
valid_uids = pd.read_csv("../out/our_overall_features.csv")[["uid"]]

# Multiple-choice scores, restricted to the valid participants, plus one binary
# label column per score column ("kg", "pre", "post").
mcq_scores = pd.read_csv("../data/mcq_scores.tsv", sep="\t").merge(valid_uids, on="uid", how="inner")
for score_col in ("kg", "pre", "post"):
    mcq_scores["mcq_class_" + score_col] = mcq_scores[[score_col]].apply(std_classification)

# Essay scores, processed the same way.
essay_scores = pd.read_csv("../data/essay_scores.csv").merge(valid_uids, on="uid", how="inner")
for score_col in ("kg", "pre", "post"):
    essay_scores["essay_class_" + score_col] = essay_scores[[score_col]].apply(std_classification)

In [4]:
# Class balance: number of uid rows per binary label, printed for the MCQ labels
# first and the essay labels second, each in kg / pre / post order.
for scores, prefix in ((mcq_scores, "mcq_class_"), (essay_scores, "essay_class_")):
    for phase in ("kg", "pre", "post"):
        label = prefix + phase
        print(scores[[label, "uid"]].groupby(label).count().reset_index())

   mcq_class_kg  uid
0             0   66
1             1   41
   mcq_class_pre  uid
0              0   63
1              1   44
   mcq_class_post  uid
0               0   54
1               1   53
   essay_class_kg  uid
0               0   46
1               1   61
   essay_class_pre  uid
0                0   57
1                1   50
   essay_class_post  uid
0                 0   54
1                 1   53


In [5]:
# Pairwise correlations (DataFrame.corr defaults) between the MCQ and essay
# scores of the participants present in both tables. The merge suffixes name the
# overlapping score columns directly (pre_mcq, pre_essay, ...) instead of
# regex-renaming pandas' default "_x"/"_y" afterwards.
tests_corr = (
    pd.merge(
        mcq_scores[["uid", "pre", "post", "kg"]],
        essay_scores[["uid", "pre", "post", "kg"]],
        on="uid",
        how="inner",
        suffixes=("_mcq", "_essay"),
    )
    .drop(columns=["uid"])
    .corr()
    .round(2)
)
# Keep the index when exporting: for a correlation matrix the index holds the
# row labels, and without it the rows of the table are unidentified.
tests_corr.to_markdown("../results/tests_correlation.md")

In [6]:
# Mann-Whitney U p-values comparing every feature between the label-0 and
# label-1 group of each binary score class.
features_lightning = pd.read_csv("../out/our_overall_features.csv")
features = pd.merge(features_lightning, mcq_scores, on="uid", how="right")
features = pd.merge(features, essay_scores, on="uid", how="right")
print(features.shape)

# Order here is the column order of the exported table.
class_cols = ["mcq_class_pre", "essay_class_pre", "mcq_class_post", "essay_class_post", "mcq_class_kg", "essay_class_kg"]

# Both score tables carry "pre", "post" and "kg", so the second merge renames
# them with pandas' default "_x"/"_y" suffixes. The suffixed variants must be
# listed too; otherwise errors="ignore" silently keeps the raw scores in the
# feature list and they get tested against classes derived from themselves.
score_cols = [name + suffix for name in ("pre", "post", "kg") for suffix in ("", "_x", "_y")]
cols = features.drop(columns=["source", "task", "acode", "uid"] + score_cols + class_cols, errors="ignore").columns

res = []
for col in cols:
    p_values = []
    for class_col in class_cols:
        # Missing feature values are dropped per group before testing.
        low = features.loc[features[class_col] == 0, col].dropna()
        high = features.loc[features[class_col] == 1, col].dropna()
        p_values.append(mannwhitneyu(low, high)[1])  # element [1] is the p-value
    res.append([col] + p_values)

df = pd.DataFrame(res, columns=["Feature Name"] + class_cols)
df.to_markdown("../results/our_serp_MWU_p-values.md", index=False)
df

(107, 54)


Unnamed: 0,Feature Name,mcq_class_pre,essay_class_pre,mcq_class_post,essay_class_post,mcq_class_kg,essay_class_kg
0,n_CP_visited,0.514,0.898,0.269,0.115,0.936,0.258
1,sum_fix_dur,0.114,0.444,0.326,0.029,0.448,0.292
2,mean_fix_dur,0.077,0.859,0.024,0.164,0.006,0.048
3,n_fixs,0.374,0.541,0.076,0.007,0.901,0.106
4,max_sum_reading_dur_per_content-page,0.55,0.893,0.744,0.065,0.479,0.347
5,mean_sum_reading_dur_per_content-page,0.947,0.829,0.938,0.158,0.957,0.181
6,mean_dur_per_RS,0.193,0.849,0.291,0.361,0.047,0.07
7,n_RS,0.412,0.99,0.238,0.001,0.684,0.015
8,sum_RF_dur,0.207,0.958,0.503,0.009,0.656,0.094
9,mean_RF_dur_per_CP,0.947,0.829,0.938,0.158,0.957,0.181


In [7]:
# Mann-Whitney U p-values for the cole2011 feature set: every feature is
# compared between the label-0 and label-1 group of each binary score class.
features_lightning = pd.read_csv("../out/cole2011_overall_features.csv")
features = pd.merge(features_lightning, mcq_scores, on="uid", how="inner")
features = pd.merge(features, essay_scores, on="uid", how="inner")

# Order here is the column order of the exported table.
class_cols = ["mcq_class_pre", "essay_class_pre", "mcq_class_post", "essay_class_post", "mcq_class_kg", "essay_class_kg"]

# Both score tables carry "pre", "post" and "kg", so the second merge renames
# them with pandas' default "_x"/"_y" suffixes. The suffixed variants must be
# listed too; otherwise errors="ignore" silently keeps the raw scores in the
# feature list and they get tested against classes derived from themselves.
score_cols = [name + suffix for name in ("pre", "post", "kg") for suffix in ("", "_x", "_y")]
cols = features.drop(columns=["source", "task", "acode", "uid"] + score_cols + class_cols, errors="ignore").columns

res = []
for col in cols:
    p_values = []
    for class_col in class_cols:
        # Missing feature values are dropped per group before testing.
        low = features.loc[features[class_col] == 0, col].dropna()
        high = features.loc[features[class_col] == 1, col].dropna()
        p_values.append(mannwhitneyu(low, high)[1])  # element [1] is the p-value
    res.append([col] + p_values)

df = pd.DataFrame(res, columns=["Feature Name"] + class_cols)
df.to_markdown("../results/cole2011_serp_MWU_p-values.md", index=False)
df

Unnamed: 0,Feature Name,mcq_class_pre,essay_class_pre,mcq_class_post,essay_class_post,mcq_class_kg,essay_class_kg
0,n_CP_visited,0.468,0.856,0.246,0.197,0.795,0.202
1,sum_fix_dur,0.141,0.844,0.345,0.531,0.541,0.292
2,mean_fix_dur,0.24,0.81,0.107,0.013,0.091,0.068
3,n_fixs,0.317,0.626,0.105,0.064,0.825,0.03
4,max_sum_reading_dur_per_content-page,0.298,0.642,0.177,0.179,0.483,0.527
5,mean_sum_reading_dur_per_content-page,0.832,0.559,0.556,0.266,0.845,0.982
6,mean_dur_per_RS,0.402,0.978,0.048,0.126,0.015,0.156
7,n_RS,0.664,0.76,0.08,0.002,0.513,0.042
8,sum_RF_dur,0.416,0.786,0.263,0.016,0.83,0.124
9,mean_RF_dur_per_CP,0.832,0.559,0.556,0.266,0.845,0.982
