In [9]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from datascience import *
from pprint import pprint
import skimage as ski
import skimage.transform as sktrans
import skimage.io as skio
sns.set()

In [10]:
# Shell escape: list the files in the notebook's working directory.
!ls

02-prediction.ipynb          hmm_data.tsv
format_one.tsv               index.html
format_two.tsv               model.hmm
format_two_r5_prediction.tsv pred.hmm


In [11]:
# Shell escape: peek at the first lines of the raw response table
# before parsing it with pandas.
!head format_two.tsv

Anon Student Id	r0	r1	r2	r3	r4	r5
Stu_02ee1b3f31a6f6a7f4b8012298b2395e	1	1	0	1	1	1
Stu_1afbee9e3e83bb801b589108fb46028a	1	1	1	1	1	1
Stu_1b394ff128c045b7d4ad4f6e83933b72	1	1	1	1	1	1
Stu_1d6b91d4b593d7e4f5622275b3295663	1	0	1	1	0	0
Stu_2ebe6a7530ff11f2c0b9b807faf0a0a3	1	1	1	1	1	1
Stu_4e8a5be7f2663d2ccccfb59c684d5452	0	0	1	0	0	1
Stu_58d8c63d6e15a65f05b2e59e0a285d8f	1	0	1	1	1	1
Stu_671ccc3ef2c091b6b6d76b34e3d19a8f	1	1	1	1	1	1
Stu_706a76f06dfa563c7ea573d994ca5405	1	1	1	1	1	1


In [12]:
# Shell escape: peek at the first lines of the r5 prediction file.
!head format_two_r5_prediction.tsv

Anon Student Id	r5
Stu_02ee1b3f31a6f6a7f4b8012298b2395e	0
Stu_1afbee9e3e83bb801b589108fb46028a	0
Stu_1b394ff128c045b7d4ad4f6e83933b72	0
Stu_1d6b91d4b593d7e4f5622275b3295663	0
Stu_2ebe6a7530ff11f2c0b9b807faf0a0a3	0
Stu_4e8a5be7f2663d2ccccfb59c684d5452	0
Stu_58d8c63d6e15a65f05b2e59e0a285d8f	0
Stu_671ccc3ef2c091b6b6d76b34e3d19a8f	0
Stu_706a76f06dfa563c7ea573d994ca5405	0


In [13]:
# Parse the tab-separated response table: one row per student, with an
# 'Anon Student Id' column followed by the response columns r0..r5.
df = pd.read_csv('format_two.tsv', sep='\t')
df.head()

Unnamed: 0,Anon Student Id,r0,r1,r2,r3,r4,r5
0,Stu_02ee1b3f31a6f6a7f4b8012298b2395e,1,1,0,1,1,1
1,Stu_1afbee9e3e83bb801b589108fb46028a,1,1,1,1,1,1
2,Stu_1b394ff128c045b7d4ad4f6e83933b72,1,1,1,1,1,1
3,Stu_1d6b91d4b593d7e4f5622275b3295663,1,0,1,1,0,0
4,Stu_2ebe6a7530ff11f2c0b9b807faf0a0a3,1,1,1,1,1,1


In [14]:
# Extract the six response columns (r0..r5) as a plain 2-D NumPy array,
# one row per student.
#
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0,
# so it raises AttributeError on current pandas. `.values` returns the same
# ndarray and is available on both old and new pandas versions.
data = df[['r0', 'r1', 'r2', 'r3', 'r4', 'r5']].values
data[:5]

array([[1, 1, 0, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1]])

In [15]:
# Build the trailing column windows over indices 0..4 (r0..r4):
# the last column only, then the last two, ... up to all five.
# Index 5 (r5) is deliberately excluded from every window.
cols_to_avg = [np.arange(first_col, 5) for first_col in range(4, -1, -1)]
cols_to_avg

[array([4]),
 array([3, 4]),
 array([2, 3, 4]),
 array([1, 2, 3, 4]),
 array([0, 1, 2, 3, 4])]

In [16]:
# For each column window, compute the per-row (per-student) mean of the
# selected response columns. avgs[k] uses the window cols_to_avg[k].
avgs = [np.mean(data[:, window], axis=1) for window in cols_to_avg]
avgs[2]

array([ 0.66666667,  1.        ,  1.        ,  0.66666667,  1.        ,
        0.33333333,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  0.66666667,  0.33333333,
        1.        ,  0.66666667,  1.        ,  1.        ,  0.33333333,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  0.66666667,  1.        ])

In [17]:
# Turn each average into a 0/1 prediction by rounding. The list of
# per-window arrays is stacked into one 2-D array (one row per window).
# Note: NumPy rounds exact halves to the nearest even value, so an average
# of exactly 0.5 becomes 0, not 1.
predictions = np.asarray(avgs).round()
predictions[2]

array([ 1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.])

In [18]:
# Number of rows (students) in the response matrix.
n = len(data)
# Accuracy of each window's predictions: the fraction of rows whose
# prediction equals the value in column 5 (r5).
accuracies = [
    np.count_nonzero(window_pred == data[:, 5]) / n
    for window_pred in predictions
]
accuracies

[0.7931034482758621,
 0.7586206896551724,
 0.7931034482758621,
 0.8275862068965517,
 0.7931034482758621]

In [19]:
# Report each accuracy next to the window size that produced it
# (accuracies[0] used the last 1 column, accuracies[1] the last 2, ...).
for n_cols, acc in enumerate(accuracies, start=1):
    print('Acc using last {} columns: {}'.format(n_cols, acc))

Acc using last 1 columns: 0.7931034482758621
Acc using last 2 columns: 0.7586206896551724
Acc using last 3 columns: 0.7931034482758621
Acc using last 4 columns: 0.8275862068965517
Acc using last 5 columns: 0.7931034482758621


Looks like averaging the last 4 answers was the best on this data with an accuracy of 0.8276.

## Using BKT

We used the C++ BKT implementation to make predictions on the same dataset:

In [20]:
# Load the model output: a headerless, tab-separated file with two columns,
# labelled here as 'correct' and 'incorrect' (presumably the predicted
# probability of each outcome -- confirm against the tool that wrote it).
pred = pd.read_csv(
    'pred.hmm',
    sep='\t',
    header=None,
    names=['correct', 'incorrect'],
)
pred.head()

Unnamed: 0,correct,incorrect
0,0.50126,0.49874
1,0.728502,0.271498
2,0.788275,0.211725
3,0.773382,0.226618
4,0.795927,0.204073


In [29]:
# Keep every fifth row of the 'correct' column (positions 0, 5, 10, ...)
# and round it to a 0/1 prediction.
# NOTE(review): `[::5]` selects the FIRST row of each group of five. If
# pred.hmm holds five consecutive rows per student in opportunity order,
# this is each student's earliest prediction rather than the latest one;
# confirm the file layout and whether `[4::5]` was intended.
every_fifth = pred['correct'].iloc[::5]
bkt_predictions = every_fifth.round()
bkt_predictions.head()

0     1.0
5     1.0
10    1.0
15    1.0
20    1.0
Name: correct, dtype: float64

In [30]:
# Sanity check: the number of BKT predictions kept after subsampling.
bkt_predictions.shape[0]

29

In [32]:
# Fraction of rows where the rounded BKT prediction equals column 5 (r5).
# The comparison is positional: bkt_predictions and data[:, 5] must have
# the same length.
bkt_hits = np.count_nonzero(bkt_predictions == data[:, 5])
bkt_acc = bkt_hits / n
print('BKT Accuracy:', bkt_acc)

BKT Accuracy: 0.8275862068965517


So BKT did as well as averaging the last 4 answers (accuracy 0.8276 for both).