In [35]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, ttest_ind
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Render every float in displayed/printed frames with exactly three decimals.
pd.options.display.float_format = lambda value: '%.3f' % value

In [36]:
def std_classification(df):
    """Split a column of scores into two classes around its mean.

    The scores are standardised with ``scipy.stats.zscore``; values whose
    z-score is at or below 0 (i.e. at or below the mean) become class 0, all
    others become class 1.

    Parameters
    ----------
    df : array-like
        One column of numeric scores: a pandas Series (this is what
        ``DataFrame.apply`` passes in), a 1-D array/list, or a single-column
        DataFrame.

    Returns
    -------
    list of int
        One 0/1 label per input score, in input order.

    Raises
    ------
    ValueError
        If the input has more than one column.

    Notes
    -----
    A NaN z-score fails the ``<= 0`` comparison and is therefore labelled 1.
    """
    # zscore hands back an ndarray or a pandas object depending on the scipy
    # version and the input type; normalise to ndarray instead of relying on
    # a `.values` attribute that plain arrays do not have.
    standardized = np.asarray(zscore(df))
    if standardized.ndim > 1:
        if standardized.size != standardized.shape[0]:
            raise ValueError("std_classification expects a single column of scores")
        # Single-column 2-D input: flatten to one label per row.
        standardized = standardized.reshape(-1)
    return np.where(standardized <= 0.0, 0, 1).tolist()

In [37]:
# Only participants listed in the overall feature export are analysed.
valid_uids = pd.read_csv("../out/our_overall_features.csv")[["uid"]]

# Multiple-choice scores: restrict to valid participants, then derive one
# binary class column (0 = at/below the mean, 1 = above) per score column.
mcq_scores = pd.read_csv("../data/mcq_scores.tsv", sep="\t").merge(valid_uids, on="uid", how="inner")
for class_name, score_name in {"mcq_class_kg": "kg", "mcq_class_pre": "pre", "mcq_class_post": "post"}.items():
    mcq_scores[class_name] = mcq_scores[[score_name]].apply(std_classification)

# Essay scores: same treatment as the multiple-choice scores.
essay_scores = pd.read_csv("../data/essay_scores.csv").merge(valid_uids, on="uid", how="inner")
for class_name, score_name in {"essay_class_kg": "kg", "essay_class_pre": "pre", "essay_class_post": "post"}.items():
    essay_scores[class_name] = essay_scores[[score_name]].apply(std_classification)

In [38]:
# Print the number of participants in each class (0 / 1) for every target,
# MCQ targets first, then essay targets.
for score_table, class_names in (
    (mcq_scores, ("mcq_class_kg", "mcq_class_pre", "mcq_class_post")),
    (essay_scores, ("essay_class_kg", "essay_class_pre", "essay_class_post")),
):
    for class_name in class_names:
        class_sizes = score_table[[class_name, "uid"]].groupby(class_name).count()
        print(class_sizes.reset_index())

   mcq_class_kg  uid
0             0   66
1             1   41
   mcq_class_pre  uid
0              0   63
1              1   44
   mcq_class_post  uid
0               0   54
1               1   53
   essay_class_kg  uid
0               0   46
1               1   61
   essay_class_pre  uid
0                0   57
1                1   50
   essay_class_post  uid
0                 0   54
1                 1   53


In [39]:
import re
# Pairwise correlations between the MCQ and essay scores (pre, post, kg) of
# participants present in both tables. The merge suffixes tag each score with
# its source directly, producing the same "<score>_mcq" / "<score>_essay"
# names the regex renames of the default "_x" / "_y" suffixes used to create.
tests_corr = (
    pd.merge(
        mcq_scores[["uid", "pre", "post", "kg"]],
        essay_scores[["uid", "pre", "post", "kg"]],
        on="uid",
        how="inner",
        suffixes=("_mcq", "_essay"),
    )
    .drop(columns=["uid"])
    .corr()
    .round(2)
)
# Keep the index: in a correlation matrix it holds the row labels, and without
# it the saved table no longer says which score each row belongs to.
tests_corr.to_markdown("../results/tests_correlation.md")

In [40]:
# Join the SERP features with the MCQ and essay class labels (one row per uid).
features_lightning = pd.read_csv("../out/our_serp_features.csv")
features = pd.merge(features_lightning, mcq_scores, on="uid", how="right")
features = pd.merge(features, essay_scores, on="uid", how="right")
print(features.shape)

# Binary targets, in the column order of the resulting p-value table.
class_cols = ["mcq_class_pre", "essay_class_pre", "mcq_class_post", "essay_class_post", "mcq_class_kg", "essay_class_kg"]

# Both score tables carry "pre"/"post"/"kg", so the second merge renames them
# with pandas' default "_x"/"_y" suffixes. Exclude the suffixed names as well;
# otherwise the raw scores are silently tested as if they were features.
score_cols = [name + suffix for name in ("pre", "post", "kg") for suffix in ("", "_x", "_y")]
cols = features.drop(columns=["source", "task", "acode", "uid", *score_cols, *class_cols], errors="ignore").columns

res = []

for col in cols:
    row = [col]
    for class_col in class_cols:
        # Compare the feature's distribution between class 0 and class 1.
        low_group = features.loc[features[class_col] == 0, col].dropna()
        high_group = features.loc[features[class_col] == 1, col].dropna()
        # The default `alternative` changed between scipy releases; state the
        # two-sided test explicitly so the p-values do not depend on the version.
        row.append(mannwhitneyu(low_group, high_group, alternative="two-sided").pvalue)
    res.append(row)

df = pd.DataFrame(res, columns=["Feature Name", *class_cols])
#print(df.to_latex(index=False, float_format="{:.3f}".format))
df.to_markdown("../results/our_serp_MWU_p-values.md", index=False)
df

(107, 54)


Unnamed: 0,Feature Name,mcq_class_pre,essay_class_pre,mcq_class_post,essay_class_post,mcq_class_kg,essay_class_kg
0,n_CP_visited,0.848,0.356,0.104,0.041,0.238,0.078
1,sum_fix_dur,0.37,0.122,0.31,0.052,0.273,0.15
2,mean_fix_dur,0.084,0.756,0.465,0.153,0.5,0.065
3,n_fixs,0.268,0.168,0.31,0.047,0.307,0.101
4,max_sum_reading_dur_per_content-page,0.759,0.204,0.87,0.911,0.656,0.805
5,mean_sum_reading_dur_per_content-page,0.969,0.295,0.887,0.45,0.517,0.696
6,mean_dur_per_RS,0.652,0.128,0.163,0.419,0.621,0.435
7,n_RS,0.923,0.216,0.337,0.054,0.825,0.082
8,sum_RF_dur,0.913,0.146,0.465,0.069,0.78,0.109
9,mean_RF_dur_per_CP,0.969,0.295,0.887,0.45,0.517,0.696


In [41]:
# Join the Cole (2011) SERP features with the MCQ and essay class labels,
# keeping only participants present in all three tables.
features_lightning = pd.read_csv("../out/cole2011_serp_features.csv")
features = pd.merge(features_lightning, mcq_scores, on="uid", how="inner")
features = pd.merge(features, essay_scores, on="uid", how="inner")

# Binary targets, in the column order of the resulting p-value table.
class_cols = ["mcq_class_pre", "essay_class_pre", "mcq_class_post", "essay_class_post", "mcq_class_kg", "essay_class_kg"]

# Both score tables carry "pre"/"post"/"kg", so the second merge renames them
# with pandas' default "_x"/"_y" suffixes. Exclude the suffixed names as well;
# otherwise the raw scores are silently tested as if they were features.
score_cols = [name + suffix for name in ("pre", "post", "kg") for suffix in ("", "_x", "_y")]
cols = features.drop(columns=["source", "task", "acode", "uid", *score_cols, *class_cols], errors="ignore").columns

res = []

for col in cols:
    row = [col]
    for class_col in class_cols:
        # Compare the feature's distribution between class 0 and class 1.
        low_group = features.loc[features[class_col] == 0, col].dropna()
        high_group = features.loc[features[class_col] == 1, col].dropna()
        # The default `alternative` changed between scipy releases; state the
        # two-sided test explicitly so the p-values do not depend on the version.
        row.append(mannwhitneyu(low_group, high_group, alternative="two-sided").pvalue)
    res.append(row)

df = pd.DataFrame(res, columns=["Feature Name", *class_cols])
# print(df.to_latex(index=False, float_format="{:.3f}".format))
df.to_markdown("../results/cole2011_serp_MWU_p-values.md", index=False)
df

Unnamed: 0,Feature Name,mcq_class_pre,essay_class_pre,mcq_class_post,essay_class_post,mcq_class_kg,essay_class_kg
0,n_CP_visited,0.858,0.572,0.73,0.155,0.503,0.362
1,sum_fix_dur,0.542,0.082,0.601,0.339,0.752,0.433
2,mean_fix_dur,0.141,0.524,0.165,0.324,0.153,0.086
3,n_fixs,0.331,0.096,0.733,0.309,0.871,0.303
4,max_sum_reading_dur_per_content-page,0.375,0.266,0.514,0.592,0.143,0.466
5,mean_sum_reading_dur_per_content-page,0.512,0.235,0.43,0.499,0.184,0.326
6,mean_dur_per_RS,0.351,0.127,0.231,0.957,0.076,0.812
7,n_RS,0.271,0.774,0.945,0.751,0.35,0.496
8,sum_RF_dur,0.25,0.395,0.413,0.87,0.109,0.601
9,mean_RF_dur_per_CP,0.512,0.235,0.43,0.499,0.184,0.326
