In [1]:
import numpy as np
import pandas as pd
from datetime import date, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score,cohen_kappa_score
import pickle
from scipy import stats
import pylab
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean

In [2]:
with open('low_diff_log.pickle','rb') as f:
    df = pickle.load(f)

In [70]:
# limitation: 
# unsupervised, no ground truth, only face validity
# vaguely defined columns
# we assume their ability are around the same level in some cases

In [71]:
#assume context: low ability, difficult problem
#assume three rules
# analyze on student-step level

In [187]:
# rules:
# 1. whether they have more than normal (over 75% quantile) attempt counts over this question by this step
# 2. whether they skip a problem at this step, in less than normal attempt counts (less than 50% quantile)
# 3. whether they ask for hint at this step, in less than normal attempt counts (less than 50% quantile)

In [4]:
df = df.reset_index()

In [55]:
problem_dist = df.groupby(['problemId'])['attemptCount'].describe()

In [56]:
problem_dist

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,95.0,2.421053,1.372829,1.0,1.00,2.0,3.50,6.0
4,85.0,2.094118,1.150813,1.0,1.00,2.0,3.00,5.0
5,57.0,1.000000,0.000000,1.0,1.00,1.0,1.00,1.0
15,59.0,2.084746,1.149035,1.0,1.00,2.0,3.00,5.0
16,48.0,1.000000,0.000000,1.0,1.00,1.0,1.00,1.0
33,88.0,1.806818,0.882238,1.0,1.00,2.0,2.00,4.0
34,108.0,2.500000,1.748163,1.0,1.00,2.0,3.00,10.0
37,109.0,2.522936,1.424627,1.0,1.00,2.0,4.00,7.0
38,59.0,1.677966,0.954855,1.0,1.00,1.0,2.00,5.0
39,82.0,1.963415,0.999322,1.0,1.00,2.0,3.00,4.0


In [70]:
#problem_dist
# define such rule: at 75% quantile of attempts compared to other students in this problem
resilient_rule_attempt_thresholds = {pid:problem_dist.loc[pid]['75%'] for pid in problem_dist.index}
resilient_rule_attempt_avg = {pid:problem_dist.loc[pid]['mean'] for pid in problem_dist.index}
resilient_rule_attempt_outlier = {pid:problem_dist.loc[pid]['mean']+2*problem_dist.loc[pid]['std'] for pid in problem_dist.index}
#problem_dist.loc[3]['50%']
df['attemptDeviation'] = np.zeros(len(df))
for i,row in df.iterrows():
    try:
        if row.attemptCount>resilience_avg[row.problemId]:
            if row.attemptCount>resilient_rule_attempt_thresholds[row.problemId]:
                if row.attemptCount>resilient_rule_attempt_outlier[row.problemId]:
                    res = 3
                else:
                    res = 2
            else:
                res = 1
        else:
            res = 0
        #res = max(0, row.attemptCount - resilience_avg[row.problemId])
        df.at[i,'attemptDeviation'] = res
    except KeyError:
        print(i,'never corrected answered by these students')

In [71]:
df['skip'] = np.zeros(len(df))

In [72]:
for i,row in df.iterrows():
    if i<len(df)-1:
        if df.loc[i,'correct']==0 and i<len(df) and df.loc[i+1,'problemId']!=df.loc[i,'problemId'] and df.loc[i,'ITEST_id']==df.loc[i+1,'ITEST_id']:
            df.at[i,'skip'] = 3

In [73]:
sids = df.ITEST_id.unique()

In [74]:
obs = df.loc[:,['attemptDeviation','skip','hintCount']].values.tolist()

In [11]:
lengths = []
for sid in sids:
    lengths.append(len(df.loc[df.ITEST_id==sid]))

In [12]:
from hmmlearn import hmm

In [75]:
remodel = hmm.GaussianHMM(n_components=5, n_iter=500).fit(obs, lengths)

In [76]:
hidden = remodel.predict(obs)

In [77]:
df['hidden_res_level'] = hidden

In [78]:
df.head()

Unnamed: 0,level_0,index,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,hint,hintCount,hintTotal,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount,RES_FRUSTRATED,attemptDeviation,skip,hidden_res_level
0,0,2484,64,0.334038,pattern-finding,104051191,1,1,0,0,0,0,1,0.0,0,0,0.009561,0.0,0.0,4
1,1,2485,64,0.334038,pattern-finding,104051192,1,1,0,0,0,0,1,0.0,0,0,0.009561,0.0,0.0,4
2,2,2486,64,0.334038,pattern-finding,104051193,1,1,0,0,0,0,1,0.0,0,0,0.009561,0.0,0.0,4
3,3,2487,64,0.334038,pattern-finding,104051194,1,1,0,0,0,0,1,0.0,0,0,0.009561,0.0,0.0,4
4,4,2496,64,0.334038,pattern-finding,104051191,2,1,0,0,0,0,0,0.0,0,0,0.009561,0.0,0.0,4


In [42]:
df = df.reset_index()

ValueError: cannot insert level_0, already exists

In [110]:
def agreement_measure(x,y):
    x = np.interp(x, (x.min(), x.max()), (-1, +1))
    #y = np.interp(y, (y.min(), y.max()), (-1, +1)) # y is already a rescaled value
    return euclidean(x,y)/len(x)

In [131]:
avg_agree = []
for sid in sids:
    df_sample = df.loc[df.ITEST_id==sid].rename(columns={"index": "old_index"})
    df_sample.reset_index(drop=True,inplace=True)
    df_sample = df_sample.reset_index()
    agree = agreement_measure(df_sample.hidden_res_level.values,df_sample.RES_FRUSTRATED.values)
    print(sid, agree)
    avg_agree.append(agree)
print(np.sum(avg_agree)/len(sids))

0.08250258287170137


In [92]:
with open('hmm_res_model_tuned.pickle','wb') as f:
    pickle.dump(df,f)