In [31]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, ttest_ind
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [32]:
def std_classification(df):
    """Binarize scores around their mean via z-scores.

    Parameters
    ----------
    df : array-like of numbers
        Scores to classify. The notebook passes one pandas Series (a single
        score column) at a time; NumPy arrays work as well.

    Returns
    -------
    list of int
        One label per input value, in input order: 0 when the value's
        z-score is <= 0 (at or below the mean), 1 otherwise.

    Notes
    -----
    Missing values are not handled specially: a NaN z-score fails the
    ``<= 0`` comparison and is therefore labelled 1.
    """
    # zscore may hand back a plain ndarray rather than a pandas object
    # (this depends on the input type and, presumably, the SciPy version),
    # so normalise with np.asarray instead of relying on ``.values``.
    z_scores = np.asarray(zscore(df))
    return [0 if z <= 0.0 else 1 for z in z_scores]

In [33]:
# Participants to keep: only uids that appear in the tracking-feature export.
# Duplicate uids are dropped so the inner merges below can only filter score
# rows, never multiply them.
valid_uids = pd.read_csv("../out/our_tracking_features.csv")[["uid"]].drop_duplicates()


def _with_binary_classes(scores, prefix):
    """Return a copy of ``scores`` with ``<prefix>_class_{kg,pre,post}`` columns.

    Each added column holds the 0/1 labels that ``std_classification``
    produces for the score column of the same name (kg, pre, post).
    """
    labelled = scores.copy()
    for column in ("kg", "pre", "post"):
        labelled[f"{prefix}_class_{column}"] = std_classification(labelled[column])
    return labelled


# MCQ scores (tab-separated file), restricted to the valid participants.
mcq_scores = _with_binary_classes(
    pd.read_csv("../data/mcq_scores.tsv", sep="\t").merge(valid_uids, on="uid", how="inner"),
    prefix="mcq",
)

# Essay scores, restricted to the valid participants.
essay_scores = _with_binary_classes(
    pd.read_csv("../data/essay_scores.csv").merge(valid_uids, on="uid", how="inner"),
    prefix="essay",
)

In [42]:
import re
# Correlation matrix of the MCQ and essay scores, printed as a LaTeX table.
# The merge suffixes label the overlapping pre/post/kg columns directly, so
# no regex-based renaming of pandas' default "_x"/"_y" suffixes is needed.
score_columns = ["uid", "pre", "post", "kg"]
score_correlations = (
    pd.merge(
        mcq_scores[score_columns],
        essay_scores[score_columns],
        on="uid",
        how="inner",
        suffixes=("_mcq", "_essay"),
    )
    .drop(columns=["uid"])  # uid is only the join key, not a score
    .corr()
    .round(2)
)
print(score_correlations.to_latex(index=False, float_format="{:.2f}".format))

\begin{tabular}{rrrrrr}
\toprule
pre_mcq & post_mcq & kg_mcq & pre_essay & post_essay & kg_essay \\
\midrule
1.00 & 0.44 & -0.60 & 0.43 & 0.16 & -0.13 \\
0.44 & 1.00 & 0.46 & 0.26 & 0.32 & 0.13 \\
-0.60 & 0.46 & 1.00 & -0.19 & 0.12 & 0.24 \\
0.43 & 0.26 & -0.19 & 1.00 & 0.22 & -0.44 \\
0.16 & 0.32 & 0.12 & 0.22 & 1.00 & 0.78 \\
-0.13 & 0.13 & 0.24 & -0.44 & 0.78 & 1.00 \\
\bottomrule
\end{tabular}



# Deviation

In [34]:
# Class balance: number of rows (counted via uid) in each 0/1 class, reported
# for the pre, post and kg scores, MCQ first and essay second each time.
for _phase in ("pre", "post", "kg"):
    for _kind, _frame in (("mcq", mcq_scores), ("essay", essay_scores)):
        _class_col = f"{_kind}_class_{_phase}"
        print(_frame[[_class_col, "uid"]].groupby(_class_col).count().reset_index())

   mcq_class_pre  uid
0              0   63
1              1   43
   essay_class_pre  uid
0                0   57
1                1   49
   mcq_class_post  uid
0               0   54
1               1   52
   essay_class_post  uid
0                 0   54
1                 1   52
   mcq_class_kg  uid
0             0   66
1             1   40
   essay_class_kg  uid
0               0   46
1               1   60


In [35]:
# Joint distribution of the MCQ and essay "pre" classes: rows are matched on
# uid, then counted per (mcq_class_pre, essay_class_pre) combination.
(
    mcq_scores[["uid", "mcq_class_pre"]]
    .merge(essay_scores[["uid", "essay_class_pre"]], on="uid", how="inner")
    .pivot_table(index=["mcq_class_pre", "essay_class_pre"], aggfunc="size")
)

mcq_class_pre  essay_class_pre
0              0                  42
               1                  21
1              0                  15
               1                  28
dtype: int64

In [36]:
# Joint distribution of the MCQ and essay "post" classes: rows are matched on
# uid, then counted per (mcq_class_post, essay_class_post) combination.
(
    mcq_scores[["uid", "mcq_class_post"]]
    .merge(essay_scores[["uid", "essay_class_post"]], on="uid", how="inner")
    .pivot_table(index=["mcq_class_post", "essay_class_post"], aggfunc="size")
)

mcq_class_post  essay_class_post
0               0                   30
                1                   24
1               0                   24
                1                   28
dtype: int64

In [37]:
# Joint distribution of the MCQ and essay "kg" classes: rows are matched on
# uid, then counted per (mcq_class_kg, essay_class_kg) combination.
(
    mcq_scores[["uid", "mcq_class_kg"]]
    .merge(essay_scores[["uid", "essay_class_kg"]], on="uid", how="inner")
    .pivot_table(index=["mcq_class_kg", "essay_class_kg"], aggfunc="size")
)

mcq_class_kg  essay_class_kg
0             0                 32
              1                 34
1             0                 14
              1                 26
dtype: int64