In [1]:
import numpy as np
import pandas as pd
from datetime import date, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import pickle
from scipy import stats
import pylab

In [2]:
with open('df_train.pickle','rb') as f:
    df_train = pickle.load(f)

In [3]:
df_train = df_train[['ITEST_id', 'AveCorrect','skill','problemId','attemptCount','correct','consecutiveErrorsInRow','hint','hintCount','hintTotal','frIsHelpRequest','totalFrPercentPastWrong','frPast5WrongCount','frPast8WrongCount','RES_FRUSTRATED']]

In [4]:
len(df_train)

942816

In [5]:
# Student can skip questions without need to keep attempting on them

In [6]:
df_train.groupby(['ITEST_id','skill']).head()

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,hint,hintCount,hintTotal,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount,RES_FRUSTRATED
0,8,0.483902,properties-of-geometric-figures,104051118,1,0,0,1,1,1,1,0.000000,0,0,0.000000
1,8,0.483902,properties-of-geometric-figures,104051119,1,1,0,0,0,0,1,0.000000,0,0,0.000000
2,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051120,1,0,0,0,0,0,0,0.000000,0,0,0.000000
3,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051120,2,0,1,0,0,0,0,0.000000,0,0,0.000000
4,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,1,1,0,0,0,1,0,1.000000,1,1,0.000000
5,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,2,0,0,1,1,1,0,1.000000,1,1,0.000000
6,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,3,1,0,0,1,1,0,1.000000,1,1,1.000000
7,8,0.483902,point-plotting,104051088,1,0,0,0,0,0,0,0.000000,0,0,0.000000
8,8,0.483902,transformations-rotations,104051089,1,0,0,0,0,0,1,0.000000,0,0,0.000000
9,8,0.483902,transformations-rotations,104051089,2,0,0,0,0,0,0,0.000000,0,0,0.000000


In [7]:
df_train.head(100)

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,hint,hintCount,hintTotal,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount,RES_FRUSTRATED
0,8,0.483902,properties-of-geometric-figures,104051118,1,0,0,1,1,1,1,0.000000,0,0,0.000000
1,8,0.483902,properties-of-geometric-figures,104051119,1,1,0,0,0,0,1,0.000000,0,0,0.000000
2,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051120,1,0,0,0,0,0,0,0.000000,0,0,0.000000
3,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051120,2,0,1,0,0,0,0,0.000000,0,0,0.000000
4,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,1,1,0,0,0,1,0,1.000000,1,1,0.000000
5,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,2,0,0,1,1,1,0,1.000000,1,1,0.000000
6,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,3,1,0,0,1,1,0,1.000000,1,1,1.000000
7,8,0.483902,point-plotting,104051088,1,0,0,0,0,0,0,0.000000,0,0,0.000000
8,8,0.483902,transformations-rotations,104051089,1,0,0,0,0,0,1,0.000000,0,0,0.000000
9,8,0.483902,transformations-rotations,104051089,2,0,0,0,0,0,0,0.000000,0,0,0.000000


In [8]:
sids = df_train.ITEST_id.unique()
pids = df_train.problemId.unique()
skills = df_train.skill.unique()

In [9]:
df_train_correct_log = df_train.loc[df_train.correct==1]    
# only count the times when this student was eventually correct

df_train_correct_log.head()

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,hint,hintCount,hintTotal,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount,RES_FRUSTRATED
1,8,0.483902,properties-of-geometric-figures,104051119,1,1,0,0,0,0,1,0.0,0,0,0.0
4,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,1,1,0,0,0,1,0,1.0,1,1,0.0
6,8,0.483902,sum-of-interior-angles-more-than-3-sides,104051121,3,1,0,0,1,1,0,1.0,1,1,1.0
10,8,0.483902,transformations-rotations,104051089,3,1,0,0,0,0,0,0.0,0,0,0.009561
11,8,0.483902,transformations-rotations,104051090,1,1,0,0,0,0,1,0.0,0,0,0.009561


In [10]:
overall_correctness = df_train.correct.value_counts()[1]/len(df_train)
overall_correctness

0.37268141397685234

In [11]:
problem_correctness = {pid:[0,0] for pid in pids}
problem_skippness = {pid:[0,0] for pid in pids}
for pid in pids:
    problem_log = df_train.loc[df_train.problemId==pid]
    value_count = problem_log.correct.value_counts()
    if 1 in value_count:
        problem_correctness[pid][0] = value_count[1]
    problem_correctness[pid][1] = len(problem_log)
    #for sid in sids:
        #student_problem_log = problem_log.loc[problem_log.ITEST_id==sid]
        #problem_skippness[pid][0]+=1
        #if not df_train.correct.value_counts()[1]:
            #problem_skippness[pid][1]+=1

In [12]:
problem_correctness

{104051118: [70, 634],
 104051119: [515, 918],
 104051120: [364, 1132],
 104051121: [540, 1431],
 104051088: [232, 825],
 104051089: [358, 1423],
 104051090: [328, 977],
 104051091: [280, 1626],
 104051092: [263, 978],
 104051074: [407, 768],
 104051204: [154, 793],
 104051205: [533, 1968],
 104051206: [503, 1188],
 104051207: [462, 1531],
 104051182: [115, 642],
 104051183: [422, 778],
 104051184: [373, 1378],
 104051185: [351, 883],
 104051186: [316, 1106],
 104051093: [343, 1064],
 104051094: [562, 1057],
 104051095: [502, 1234],
 104051096: [377, 1965],
 104051097: [350, 1115],
 104051098: [339, 842],
 104051099: [319, 479],
 104051100: [310, 587],
 104051101: [315, 472],
 104051122: [129, 733],
 104051125: [543, 1272],
 104051126: [590, 1721],
 104051113: [167, 845],
 104051215: [178, 904],
 104051216: [528, 2092],
 104051217: [485, 1739],
 104051218: [454, 1322],
 104051231: [119, 922],
 104051232: [528, 4948],
 104051233: [492, 1817],
 104051077: [257, 976],
 104051078: [562, 96

In [13]:
with open('question_difficulty.pickle','wb') as f:
    pickle.dump(problem_correctness,f)

In [64]:
low_students = df_train.loc[df_train.AveCorrect < df_train.AveCorrect.describe()['50%']]
low_student_ids = low_students.ITEST_id.unique()

In [68]:
len(low_student_ids)

670

In [58]:
problem_correct_rates = {}
for p,v in problem_correctness.items():
    if v[1]>=30:
        problem_correct_rates[p] = v[0]/v[1]
diff_threshold = np.quantile(list(problem_correct_rates.values()),0.5)

In [59]:
difficult_problems = [pid for pid in problem_correct_rates.keys() if problem_correct_rates[pid]>diff_threshold]

In [60]:
len(difficult_problems)

1418

In [155]:
df_train_low_difficult = df_train[df_train.ITEST_id.isin(low_student_ids) & df_train.problemId.isin(difficult_problems)]

In [148]:
# naively omit skipped problems, only count those ones that are eventually correct
df_train_low_difficult_correct = df_train_low_difficult.loc[df_train_low_difficult.correct==1]

In [149]:
len(df_train_low_difficult_correct)

67502

In [150]:
df_train_low_difficult.head()

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,hint,hintCount,hintTotal,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount,confidence(FRUSTRATED)
2484,64,0.334038,pattern-finding,104051191,1,1,0,0,0,0,1,0.0,0,0,0.091463
2485,64,0.334038,pattern-finding,104051192,1,1,0,0,0,0,1,0.0,0,0,0.091463
2486,64,0.334038,pattern-finding,104051193,1,1,0,0,0,0,1,0.0,0,0,0.091463
2487,64,0.334038,pattern-finding,104051194,1,1,0,0,0,0,1,0.0,0,0,0.091463
2496,64,0.334038,pattern-finding,104051191,2,1,0,0,0,0,0,0.0,0,0,0.091463


In [156]:
with open('low_diff_log.pickle','wb') as f:
    pickle.dump(df_train_low_difficult, f)

# Context: low abillity student facing with difficult problems
### intermediate goal: locate the resilient moment (of low ability students)
### moment: rolling on problem level, not transaction level

In [None]:
# ATTEMPTCOUNT as a latent feature

In [105]:
train, test = train_test_split(df_train_low_difficult_correct, test_size=0.4,random_state=22)

In [90]:
problem_dist = train.groupby(['problemId'])['attemptCount'].describe()
problem_dist

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,18.0,2.944444,1.893401,1.0,1.00,3.5,4.75,6.0
4,20.0,2.500000,1.432701,1.0,1.00,2.0,4.00,5.0
5,12.0,1.000000,0.000000,1.0,1.00,1.0,1.00,1.0
15,16.0,2.250000,1.483240,1.0,1.00,2.0,3.25,5.0
16,15.0,1.000000,0.000000,1.0,1.00,1.0,1.00,1.0
33,23.0,2.521739,0.994053,1.0,2.00,2.0,3.00,4.0
34,24.0,2.708333,2.176538,1.0,1.00,2.0,4.00,10.0
37,14.0,2.785714,1.672335,1.0,1.00,3.0,4.00,6.0
38,20.0,1.700000,1.174286,1.0,1.00,1.0,3.00,5.0
39,23.0,2.347826,1.228772,1.0,1.00,2.0,3.50,4.0


In [100]:
# define such rule: at 75% quantile of attempts compared to other students in this problem
resilient_rule_attempt_thresholds = {pid:problem_dist.loc[pid]['75%'] for pid in problem_dist.index}
problem_dist.loc[3]['50%']

4.75

In [101]:
resilient_rule_attempt_thresholds

{3: 4.75,
 4: 4.0,
 5: 1.0,
 15: 3.25,
 16: 1.0,
 33: 3.0,
 34: 4.0,
 37: 4.0,
 38: 3.0,
 39: 3.5,
 40: 2.0,
 41: 1.0,
 48: 3.0,
 49: 4.0,
 51: 4.0,
 58: 4.0,
 59: 3.5,
 73: 2.0,
 74: 1.0,
 83: 3.0,
 86: 1.0,
 112: 2.25,
 113: 2.5,
 114: 1.0,
 115: 1.0,
 116: 3.0,
 117: 1.0,
 119: 3.5,
 120: 1.0,
 127: 2.0,
 128: 3.0,
 132: 3.0,
 134: 3.0,
 140: 3.0,
 141: 3.0,
 142: 2.0,
 143: 3.0,
 163: 2.0,
 172: 4.0,
 177: 4.0,
 178: 1.0,
 184: 3.0,
 185: 1.0,
 202: 4.5,
 204: 4.0,
 219: 3.0,
 221: 4.0,
 222: 3.0,
 223: 1.0,
 227: 5.0,
 228: 3.5,
 229: 3.0,
 230: 4.0,
 232: 3.0,
 233: 1.75,
 234: 5.0,
 235: 3.0,
 237: 3.0,
 238: 4.0,
 247: 4.0,
 248: 1.0,
 249: 2.0,
 252: 1.0,
 326: 2.0,
 327: 2.0,
 328: 2.75,
 329: 3.0,
 330: 1.0,
 341: 2.0,
 342: 1.0,
 346: 3.5,
 354: 2.0,
 355: 3.0,
 357: 2.0,
 358: 5.0,
 361: 3.0,
 364: 2.0,
 365: 1.0,
 367: 3.0,
 368: 3.0,
 369: 1.0,
 395: 2.0,
 399: 3.0,
 408: 2.0,
 433: 5.0,
 435: 3.0,
 437: 3.0,
 438: 3.0,
 439: 4.0,
 440: 3.0,
 441: 1.0,
 442: 4.0,
 443: 4

In [106]:
train.head()

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount
212902,5628,0.334746,pythagorean-theorem,104051178,1,1,0,1,1.0,1,1
46684,4685,0.322816,noskill,3364,2,1,0,0,0.191176,3,4
7623,3863,0.302072,scientific-notation,2223,2,1,0,0,0.285714,1,2
243196,6057,0.219382,square-root,104051173,4,1,0,0,0.0,0,0
37889,7076,0.315029,symbolization-articulation,440,1,1,0,1,0.166667,0,1


In [108]:
test.head()

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount
101521,1450,0.326954,reading-graph,104050909,3,1,0,0,0.2,1,2
88080,1317,0.280645,square-root,104051073,1,1,0,1,0.125,0,1
178851,2570,0.272639,interpreting-numberline,104050208,1,1,0,0,0.0,0,0
85468,3731,0.301587,symbolization-articulation,104050263,2,1,0,0,0.25,2,2
147621,4582,0.27182,mean,104050918,2,1,0,0,0.25,1,1


In [116]:
y_pred = np.zeros(len(test))
j = 0
for i, row in test.iterrows():
    try:
        if row.attemptCount>=resilient_rule_attempt_thresholds[row.problemId]:
            y_pred[j] = 1
    except KeyError:
        print('key error') # by default, set to false
    j+=1

key error
key error
key error
key error
key error
key error
key error
key error
key error
key error
key error
key error


In [115]:
problem_dist_test = test.groupby(['problemId'])['attemptCount'].describe()
resilient_threshold_test_gt = {pid:problem_dist_test.loc[pid]['75%'] for pid in problem_dist_test.index}

In [117]:
y_test = np.zeros(len(test))
j = 0
for i, row in test.iterrows():
    try:
        if row.attemptCount>=resilient_threshold_test_gt[row.problemId]:
            y_test[j] = 1
    except KeyError:
        print('key error') # by default, set to false
    j+=1

In [119]:
compare = [i for i in range(len(test)) if y_test[i]==y_pred[i]]
len(compare)/len(test)

0.885374615754972

In [120]:
resilience_median = {pid:problem_dist.loc[pid]['50%'] for pid in problem_dist.index}

In [121]:
train['attemptDeviation'] = np.zeros(len(train))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [124]:
for i,row in train.iterrows():
    train.at[i,'attemptDeviation'] = row.attemptCount - resilience_median[row.problemId]

In [125]:
train.head()

Unnamed: 0,ITEST_id,AveCorrect,skill,problemId,attemptCount,correct,consecutiveErrorsInRow,frIsHelpRequest,totalFrPercentPastWrong,frPast5WrongCount,frPast8WrongCount,attemptDeviation
212902,5628,0.334746,pythagorean-theorem,104051178,1,1,0,1,1.0,1,1,-1.0
46684,4685,0.322816,noskill,3364,2,1,0,0,0.191176,3,4,1.0
7623,3863,0.302072,scientific-notation,2223,2,1,0,0,0.285714,1,2,-1.0
243196,6057,0.219382,square-root,104051173,4,1,0,0,0.0,0,0,1.0
37889,7076,0.315029,symbolization-articulation,440,1,1,0,1,0.166667,0,1,-2.0


In [126]:
# frPast5WrongCount as a feature

In [128]:
problem_dist_p5wc = train.groupby(['problemId'])['frPast5WrongCount'].describe()
problem_dist_p5wc

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
problemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,18.0,1.111111,0.676400,0.0,1.00,1.0,1.00,3.0
4,20.0,0.050000,0.223607,0.0,0.00,0.0,0.00,1.0
5,12.0,0.500000,0.674200,0.0,0.00,0.0,1.00,2.0
15,16.0,0.875000,0.619139,0.0,0.75,1.0,1.00,2.0
16,15.0,0.533333,0.639940,0.0,0.00,0.0,1.00,2.0
33,23.0,0.782609,0.518435,0.0,0.50,1.0,1.00,2.0
34,24.0,0.750000,0.442326,0.0,0.75,1.0,1.00,1.0
37,14.0,0.857143,0.534522,0.0,1.00,1.0,1.00,2.0
38,20.0,0.000000,0.000000,0.0,0.00,0.0,0.00,0.0
39,23.0,0.000000,0.000000,0.0,0.00,0.0,0.00,0.0


In [129]:
# define such rule: at 75% quantile of attempts compared to other students in this problem
resilient_rule_p5wc_thresholds = {pid:problem_dist_p5wc.loc[pid]['75%'] for pid in problem_dist_p5wc.index}
problem_dist_p5wc.loc[3]['50%']

1.0

In [None]:
train.loc[train.frPast5WrongCount>2