In [21]:
import re

import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, ttest_ind
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

def _format_float_3dp(value):
    """Render a float with three decimals for pandas' text/HTML display."""
    return '%.3f' % value


# Show every float in displayed frames with three decimal places.
pd.set_option('display.float_format', _format_float_3dp)

In [22]:
def std_classification(df):
    """Binarise scores into a low (0) / high (1) class around their mean.

    The values are z-standardised and every value whose z-score is <= 0
    (i.e. at or below the mean) is labelled 0, everything else 1.

    Parameters
    ----------
    df : array-like
        The scores to classify. Despite the name, this is a single column
        (a ``Series``) when the function is used via ``DataFrame.apply``.

    Returns
    -------
    list of int
        One 0/1 label per input value, in input order.

    Notes
    -----
    A NaN z-score is labelled 1 because ``NaN <= 0.0`` is False. With
    ``zscore``'s default NaN handling a single missing score makes every
    z-score NaN, so inputs are expected to be complete.
    """
    # zscore may hand back a pandas object or a plain ndarray depending on the
    # input type / SciPy version; np.asarray gives one iterable form for both
    # instead of relying on a `.values` attribute that ndarrays do not have.
    z_scores = np.asarray(zscore(df))
    return [0 if z <= 0.0 else 1 for z in z_scores]

In [23]:
# uids listed in the overall-features file; both score tables are restricted to them.
valid_uids = pd.read_csv("../out/our_overall_features.csv")[["uid"]]

# Load both score tables and keep only rows whose uid is in valid_uids.
mcq_scores = pd.read_csv("../data/mcq_scores.tsv", sep="\t").merge(valid_uids, on="uid", how="inner")
essay_scores = pd.read_csv("../data/essay_scores.csv").merge(valid_uids, on="uid", how="inner")

# Add a binary low/high class column for each of the three score columns.
for score_col in ("kg", "pre", "post"):
    mcq_scores[f"mcq_class_{score_col}"] = mcq_scores[[score_col]].apply(std_classification)
for score_col in ("kg", "pre", "post"):
    essay_scores[f"essay_class_{score_col}"] = essay_scores[[score_col]].apply(std_classification)

In [24]:
# Class balance: number of uids per binary class, for each test type and score column.
for test_name, score_table in (("mcq", mcq_scores), ("essay", essay_scores)):
    for score_col in ("kg", "pre", "post"):
        class_col = f"{test_name}_class_{score_col}"
        counts = score_table[[class_col, "uid"]].groupby(class_col).count()
        print(counts.reset_index())

   mcq_class_kg  uid
0             0   66
1             1   41
   mcq_class_pre  uid
0              0   63
1              1   44
   mcq_class_post  uid
0               0   54
1               1   53
   essay_class_kg  uid
0               0   46
1               1   61
   essay_class_pre  uid
0                0   57
1                1   50
   essay_class_post  uid
0                 0   54
1                 1   53


In [25]:
# Pairwise correlations between the pre / post / kg scores of the two tests.
# Naming the overlapping columns through `suffixes` replaces the regex rename
# of pandas' default "_x"/"_y" suffixes and yields the same column names.
tests_corr = (
    pd.merge(
        mcq_scores[["uid", "pre", "post", "kg"]],
        essay_scores[["uid", "pre", "post", "kg"]],
        on="uid",
        how="inner",
        suffixes=("_mcq", "_essay"),
    )
    .drop(columns=["uid"])
    .corr()
    .round(2)
)
# Keep the index when exporting: for a correlation matrix it holds the row
# labels, and without it the rows of the saved table are unnamed.
tests_corr.to_markdown("../results/tests_correlation.md")

In [26]:
def mwu_pvalue_table(features, feature_cols, class_cols):
    """Build a table of Mann-Whitney U p-values, one row per feature.

    Parameters
    ----------
    features : pandas.DataFrame
        Holds the feature columns and the binary (0/1) class columns.
    feature_cols : iterable of str
        Names of the columns to test.
    class_cols : list of str
        Names of the 0/1 class columns; each becomes one p-value column.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature Name"`` followed by ``class_cols``; each cell is the
        two-sided p-value comparing the feature between class 0 and class 1,
        with missing feature values dropped per group.
    """
    rows = []
    for feature in feature_cols:
        row = {"Feature Name": feature}
        for class_col in class_cols:
            low = features.loc[features[class_col] == 0, feature].dropna()
            high = features.loc[features[class_col] == 1, feature].dropna()
            # State the alternative explicitly so the result does not depend on
            # the default of the installed SciPy version.
            row[class_col] = mannwhitneyu(low, high, alternative="two-sided").pvalue
        rows.append(row)
    return pd.DataFrame(rows, columns=["Feature Name", *class_cols])


features_lightning = pd.read_csv("../out/our_content_page_features.csv")
# Right joins keep every uid of the score tables; rows without features get NaN
# feature values, which are dropped per feature before testing.
features = pd.merge(features_lightning, mcq_scores, on="uid", how="right")
features = pd.merge(features, essay_scores, on="uid", how="right")

# Take the candidate features from the feature file itself. Both score tables
# carry "pre"/"post"/"kg", so the second merge renames them to "*_x"/"*_y";
# dropping "pre"/"post"/"kg" from the merged frame therefore removed nothing
# and the raw test scores were tested as if they were features.
cols = features_lightning.drop(columns=["source", "task", "acode", "uid", "pre", "post", "kg", "mcq_class_kg", "mcq_class_pre", "mcq_class_post", "essay_class_kg", "essay_class_pre", "essay_class_post"], errors="ignore").columns

class_cols = ["mcq_class_pre", "essay_class_pre", "mcq_class_post", "essay_class_post", "mcq_class_kg", "essay_class_kg"]

df = mwu_pvalue_table(features, cols, class_cols)
# LaTeX alternative: df.to_latex(index=False, float_format="{:.3f}".format)
df.to_markdown("../results/our_MWU_p-values.md", index=False)
df

Unnamed: 0,Feature Name,mcq_class_pre,essay_class_pre,mcq_class_post,essay_class_post,mcq_class_kg,essay_class_kg
0,n_CP_visited,0.278,0.816,0.345,0.177,0.768,0.17
1,sum_fix_dur,0.039,0.311,0.32,0.003,0.508,0.069
2,mean_fix_dur,0.094,0.829,0.034,0.179,0.015,0.059
3,n_fixs,0.139,0.345,0.134,0.0,0.92,0.017
4,max_sum_reading_dur_per_content-page,0.3,0.564,0.439,0.02,0.452,0.192
5,mean_sum_reading_dur_per_content-page,0.699,0.975,0.55,0.061,0.722,0.13
6,mean_dur_per_RS,0.131,0.53,0.371,0.338,0.079,0.046
7,n_RS,0.224,0.692,0.207,0.0,0.611,0.006
8,sum_RF_dur,0.088,0.732,0.402,0.003,0.747,0.04
9,mean_RF_dur_per_CP,0.699,0.975,0.55,0.061,0.722,0.13


In [27]:
# NOTE(review): the saved run of this cell stopped with FileNotFoundError on
# this path. The other feature file read in this notebook is named
# "our_content_page_features.csv" (singular "page") -- confirm the intended
# file name and that the cole2011 features have been exported to ../out.
features_lightning = pd.read_csv("../out/cole2011_content_pages_features.csv")
features = pd.merge(features_lightning, mcq_scores, on="uid", how="inner")
features = pd.merge(features, essay_scores, on="uid", how="inner")

# Take the candidate features from the feature file itself. Both score tables
# carry "pre"/"post"/"kg", so the second merge renames them to "*_x"/"*_y";
# dropping "pre"/"post"/"kg" from the merged frame therefore removed nothing
# and the raw test scores were tested as if they were features.
cols = features_lightning.drop(columns=["source", "task", "acode", "uid", "pre", "post", "kg", "mcq_class_kg", "mcq_class_pre", "mcq_class_post", "essay_class_kg", "essay_class_pre", "essay_class_post"], errors="ignore").columns

# Order of the p-value columns in the result table.
class_cols = ["mcq_class_pre", "essay_class_pre", "mcq_class_post", "essay_class_post", "mcq_class_kg", "essay_class_kg"]

res = []

for col in cols:
    p_values = []
    for class_col in class_cols:
        # Compare the feature between class 0 and class 1, ignoring missing values.
        low = features.loc[features[class_col] == 0, col].dropna()
        high = features.loc[features[class_col] == 1, col].dropna()
        # State the alternative explicitly so the result does not depend on
        # the default of the installed SciPy version.
        p_values.append(mannwhitneyu(low, high, alternative="two-sided").pvalue)
    res.append([col, *p_values])

df = pd.DataFrame(res, columns=["Feature Name", *class_cols])
# LaTeX alternative: df.to_latex(index=False, float_format="{:.3f}".format)
df.to_markdown("../results/cole2011_MWU_p-values.md", index=False)
df

FileNotFoundError: [Errno 2] No such file or directory: '../out/cole2011_content_pages_features.csv'