In [1]:
import pandas as pd
import numpy as np
import json

# Path of the raw interaction log (CSV) that the filtered frame below is built from.
filename = 'skill_builder_data_corrected.csv'
# The file is decoded as ISO-8859-1; low_memory=False makes pandas infer column
# dtypes from the whole file instead of chunk by chunk.
df = pd.read_csv(filename, encoding='ISO-8859-1', low_memory=False)
# Keep only rows with original == 1, attempt_count == 1 and a non-null skill_name.
# NOTE(review): df is not referenced again in the visible cells; the later cells
# read pre-computed .tsv files instead.
df = df[(df['original'] == 1) & (df['attempt_count'] == 1) & ~(df['skill_name'].isnull())]

In [34]:
# Preview the first rows of the per-student response table.
# NOTE(review): response_df is only assigned in the cell that reads 'correct.tsv'
# further down this notebook, so this cell fails on a fresh kernel unless that
# cell has been run first.
response_df.head()

Unnamed: 0,student_id,r0,r1,r2,r3,r4,r5,r6,r7,r8,...,r90,r91,r92,r93,r94,r95,r96,r97,r98,r99
0,64525,1,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
1,70363,1,1,1,1,0,1,1,1,1,...,1,1,1,1,1,1,1,0,0,1
2,70677,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
3,70695,1,1,0,0,1,1,0,1,1,...,1,1,1,0,1,1,0,0,0,1
4,70699,1,1,0,1,1,1,1,1,1,...,1,1,1,0,0,0,1,1,1,0


In [17]:
# Preview the first rows of the per-student skill-ID table.
# NOTE(review): skill_df is only assigned in the cell that reads 'skill.tsv'
# further down this notebook, so this cell fails on a fresh kernel unless that
# cell has been run first.
skill_df.head()

Unnamed: 0,student_id,s0,s1,s2,s3,s4,s5,s6,s7,s8,...,s90,s91,s92,s93,s94,s95,s96,s97,s98,s99
0,64525,1,1,2,2,2,2,2,2,2,...,15,15,15,15,16,16,17,18,18,18
1,70363,1,1,1,1,2,2,2,2,2,...,8,8,8,8,8,9,9,9,9,9
2,70677,1,1,5,5,5,5,5,5,5,...,12,12,12,12,12,12,12,12,12,15
3,70695,1,1,3,7,7,7,7,7,7,...,33,33,33,33,33,33,33,33,33,33
4,70699,1,1,2,6,6,6,6,6,6,...,42,42,42,44,44,44,45,46,46,46


In [2]:
# Load the three tab-separated tables in one pass; each file carries an
# 'Unnamed: 0' column that is discarded right after reading.
response_df, skill_df, assistment_df = (
    pd.read_csv(tsv_path, sep='\t').drop('Unnamed: 0', axis=1)
    for tsv_path in ('correct.tsv', 'skill.tsv', 'assistment_id.tsv')
)

# Skill name -> integer ID mapping; the JSON values are coerced to int.
with open('skill_dict.json', 'r', encoding='utf-8') as f:
    skill_dict = {name: int(raw_id) for name, raw_id in json.load(f).items()}

# One slot more than the number of named skills (original note: "including 0").
skill_num = 1 + len(skill_dict)

def one_hot(skill_matrix, vocab_size):
    '''
    One-hot encode a 2-D matrix of integer IDs.

    params:
        skill_matrix: 2-D integer ndarray shaped (student, sequence_len)
        vocab_size: size of the vocabulary (length of the one-hot axis)
    returns:
        a float ndarray shaped (student, sequence_len, vocab_size) holding 1.
        at [i, t, skill_matrix[i, t]] and 0. everywhere else
    '''
    n_students, n_steps = skill_matrix.shape[0], skill_matrix.shape[1]
    encoded = np.zeros((n_students, n_steps, vocab_size))
    # A column of student indices broadcast against a row of step indices
    # addresses every (student, step) pair at once; skill_matrix supplies the
    # slot on the last axis for each pair.
    student_idx = np.arange(n_students)[:, np.newaxis]
    step_idx = np.arange(n_steps)
    encoded[student_idx, step_idx, skill_matrix] = 1.
    return encoded

def dkt_one_hot(skill_matrix, response_matrix, vocab_size):
    '''
    Encode each (skill, response) pair as a single one-hot vector.

    params:
        skill_matrix: 2-D integer ndarray shaped (student, sequence_len)
        response_matrix: integer ndarray with the same shape as skill_matrix
        vocab_size: size of the skill vocabulary
    returns:
        a float ndarray shaped (student, sequence_len, 2 * vocab_size) with a
        1. in slot 2 * skill + response of every step (so, assuming responses
        are 0/1, each skill owns two adjacent slots)
    '''
    n_students, n_steps = skill_matrix.shape[0], skill_matrix.shape[1]
    encoded = np.zeros((n_students, n_steps, 2 * vocab_size))
    # Joint slot index for every (student, step) pair.
    joint_idx = 2 * skill_matrix + response_matrix
    student_idx = np.arange(n_students)[:, np.newaxis]
    step_idx = np.arange(n_steps)
    encoded[student_idx, step_idx, joint_idx] = 1.
    return encoded

def preprocess(skill_df, response_df, skill_num):
    '''
    Turn the skill and response tables into the arrays the models consume.

    The first column of each frame is skipped (it is student_id in the
    previews shown in this notebook); the remaining columns are the sequence.

    params:
        skill_df: frame of skill IDs, one row per student
        response_df: frame of responses, one row per student
        skill_num: vocabulary size handed to the one-hot encoders
    returns:
        (skill_array, response_array, skill_response_array): the one-hot
        skills, the raw response matrix, and the joint skill/response one-hot
    '''
    skill_ids = skill_df.iloc[:, 1:].values
    responses = response_df.iloc[:, 1:].values
    return (
        one_hot(skill_ids, skill_num),
        responses,
        dkt_one_hot(skill_ids, responses, skill_num),
    )
    

# Build the model inputs once: one-hot skills, raw responses, and the joint
# skill/response one-hot encoding.
skill_array, response_array, skill_response_array = preprocess(skill_df, response_df, skill_num)

In [38]:
# Number of named skills in skill_dict (skill_num is this value plus one).
len(skill_dict)

110

In [37]:
# Rows are students, columns are sequence positions.
response_array.shape

(584, 100)

In [26]:
# (student, sequence_len, vocab_size) as produced by one_hot.
skill_array.shape

(584, 100, 111)

In [32]:
# (student, sequence_len, 2 * vocab_size) as produced by dkt_one_hot.
skill_response_array.shape

(584, 100, 222)

In [3]:
import keras
from keras.layers import Input, Dense, LSTM, TimeDistributed, Lambda, multiply
from keras.models import Model
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

def build_skill2skill_model(input_shape, lstm_dim=32, dropout=0.0):
    '''
    Build and compile the skill-sequence model: an LSTM followed by a per-step
    softmax whose width equals the last entry of input_shape.

    params:
        input_shape: shape of one sample without the batch axis; called in
            this notebook as (sequence_len, vocab_size)
        lstm_dim: number of LSTM units
        dropout: dropout rate forwarded to the LSTM layer
    returns:
        the compiled keras Model (its summary is printed as a side effect)
    '''
    vocab_size = input_shape[-1]
    skills_in = Input(shape=input_shape, name='input skills')
    hidden_seq = LSTM(lstm_dim,
                      return_sequences=True,
                      dropout=dropout,
                      name='lstm layer')(skills_in)
    # The same Dense softmax is applied at every step of the LSTM output.
    step_probs = TimeDistributed(Dense(vocab_size, activation='softmax'),
                                 name='probability')(hidden_seq)
    model = Model(inputs=[skills_in], outputs=[step_probs])
    model.compile(optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.0),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

def reduce_dim(x):
    '''Collapse the last axis of x to length 1 by taking its maximum.'''
    return K.max(x, axis=-1, keepdims=True)

def build_dkt_model(input_shape, lstm_dim=32, dropout=0.0):
    '''
    Build and compile the DKT performance-prediction model.

    The network has two inputs: the joint skill/response encoding, and a
    per-step selector for the skill whose outcome is being predicted. An LSTM
    plus a per-step sigmoid layer scores every skill; the selector then picks
    out a single score per step.

    params:
        input_shape: (sequence_len, 2 * vocab_size) shape of one joint
            skill/response sample, without the batch axis
        lstm_dim: number of LSTM units
        dropout: dropout rate forwarded to the LSTM layer
    returns:
        the compiled keras Model taking [joint_input, next_skill] and emitting
        a (sequence_len, 1) output per sample (its summary is printed as a
        side effect)
    '''
    # Derive both sizes once, always from the same axes, so the Dense width
    # and the selector input cannot disagree.
    seq_len = input_shape[0]
    vocab_size = input_shape[-1] // 2

    input_skills = Input(shape=input_shape, name='input skills')
    lstm = LSTM(lstm_dim,
                return_sequences=True,
                dropout=dropout,
                name='lstm layer')(input_skills)
    dense = TimeDistributed(Dense(vocab_size, activation='sigmoid'), name='probability for each')(lstm)

    skill_next = Input(shape=(seq_len, vocab_size), name='next_skill_tested')
    # Assuming skill_next is one-hot, the product is zero everywhere except
    # the selected skill's slot; sigmoid outputs are positive, so the max over
    # the last axis returns exactly that slot's value.
    merged = multiply([dense, skill_next], name='multiply')
    reduced = Lambda(reduce_dim, output_shape=(seq_len, 1), name='reduce dim')(merged)

    model = Model(inputs=[input_skills, skill_next], outputs=[reduced])
    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

# The training cells feed steps 0..T-2 and predict steps 1..T-1, so the model
# sequence length is one less than the stored sequence length. Deriving it
# from the data avoids a hard-coded 99 that silently breaks if the sequence
# length changes.
model_seq_len = skill_array.shape[1] - 1

print('skill2skill')
skill2skill_model = build_skill2skill_model((model_seq_len, skill_num), lstm_dim=64)

print('dkt')
dkt_model = build_dkt_model((model_seq_len, 2 * skill_num), lstm_dim=64)
    

Using Theano backend.


skill2skill
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input skills (InputLayer)    (None, 99, 111)           0         
_________________________________________________________________
lstm layer (LSTM)            (None, 99, 64)            45056     
_________________________________________________________________
probability (TimeDistributed (None, 99, 111)           7215      
Total params: 52,271
Trainable params: 52,271
Non-trainable params: 0
_________________________________________________________________
dkt
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input skills (InputLayer)        (None, 99, 222)       0                                            
___________________________________________________________________________________

In [41]:
%%time

# train skill2skill
# Inputs are every step except the last; targets are the same one-hot sequence
# shifted by one step, so the model learns to predict the next skill.
# NOTE(review): this fits on all students (validation_split only holds out a
# fraction for validation), so the model has already seen any student later
# placed in a test set.
skill2skill_model.fit(skill_array[:, 0:-1],
                      skill_array[:, 1:],
                      epochs=20, 
                      batch_size=32, 
                      shuffle=True,
                      validation_split=0.2)

Train on 467 samples, validate on 117 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1min 15s, sys: 5.39 s, total: 1min 20s
Wall time: 2min 24s


<keras.callbacks.History at 0x129ea1d68>

In [42]:
%%time

# Train DKT. Inputs: the joint skill/response encoding for every step except
# the last, plus the one-hot skill of the following step. Target: the response
# at that following step, with a trailing axis added to match the model's
# (sequence_len, 1) output.
dkt_model.fit([skill_response_array[:, 0:-1], skill_array[:, 1:]],
              response_array[:, 1:, np.newaxis],
              epochs=20, 
              batch_size=32, 
              shuffle=True,
              validation_split=0.2)

Train on 467 samples, validate on 117 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 1min 22s, sys: 7.04 s, total: 1min 29s
Wall time: 1min 31s


<keras.callbacks.History at 0x12c76ae48>

## Question 1

What were the 5 most common and 5 least common skills in this dataset? What percentage of responses are associated with the most common skill?

In [55]:
# Invert skill_dict (name -> ID) into a lookup frame indexed by ID with a
# single 'Name' column.
skillname_df = pd.DataFrame(list(skill_dict.items()), columns=['Name', 'ID']).set_index('ID')
skillname_df.head()

Unnamed: 0_level_0,Name
ID,Unnamed: 1_level_1
95,Write Linear Equation from Graph
17,Interior Angles Triangle
41,Least Common Multiple
9,Mean
91,Computation with Real Numbers


In [73]:
# Count how often each skill ID occurs across all students and positions,
# then attach the skill names.
skill_counts = (skill_df
 .iloc[:, 1:]            # skip the first column; keep the sequence columns
 .unstack()              # flatten the frame into one long Series of IDs
 .value_counts()         # occurrences per ID, most frequent first
 .rename('count')
 .to_frame()
 .join(skillname_df)     # index-on-index join: skill ID -> Name
)

In [74]:
# skill_counts is ordered by descending count, so head() gives the most common skills.
print('Top 5 skills:')
skill_counts.head()

Top 5 skills:


Unnamed: 0,count,Name
7,4579,Table
30,4379,Conversion of Fraction Decimals Percents
8,3466,Venn Diagram
2,3404,Circle Graph
33,2833,Ordering Fractions


In [75]:
# skill_counts is ordered by descending count, so tail() gives the least common skills.
print('Bottom 5 skills:')
skill_counts.tail()

Bottom 5 skills:


Unnamed: 0,count,Name
83,3,Volume Rectangular Prism
84,2,Volume Sphere
109,2,Solving Inequalities
82,2,Volume Cylinder
80,1,Surface Area Cylinder


In [77]:
# Count of the first (most frequent) row divided by the total count over all
# skills. The result is a fraction in [0, 1]; multiply by 100 for a percentage.
print('Proportion of responses for most common skill:')
skill_counts.iloc[0, 0] / skill_counts['count'].sum()

Proportion of responses for most common skill:


0.078407534246575344

## Question 2

Train the sequence prediction model using a randomly selected 70% (training set) of students' data and predict on the remaining 30% (test set). What was the overall accuracy of skill prediction in the test set? What were the top 5 hardest and easiest to predict skills? Describe the metric you chose to represent hard/easy prediction. 

In [84]:
from sklearn.model_selection import train_test_split

# 70/30 split along axis 0, i.e. whole students go to either train or test.
# Inputs are all steps but the last; targets are the sequence shifted by one.
# random_state pins the shuffle so the split (and every accuracy reported from
# it) is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    skill_array[:, 0:-1], skill_array[:, 1:], test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((408, 99, 111), (176, 99, 111))

In [None]:
%%time



# train skill2skill
skill2skill_model.fit(...,
                      ...,
                      epochs=20, 
                      batch_size=32, 
                      shuffle=True,
                      validation_split=0.2)

## Question 3

Modify parameters of the network to increase accuracy (e.g. number of hidden nodes, optimizer, number of RNN layers, number of epochs, creating a validation set and stopping training when the validation set accuracy decreases). What were your accuracy results with respect to the hyper parameters you tuned?

## Question 4

Train a performance prediction model (DKT) using the same 70/30% split and report the accuracy and AUC of prediction on the 30% test set.

## Question 5

Tune the hyperparameters of this model to improve accuracy and report your improvement with respect to the tuned parameters. Which led to the most significant improvement?

## Question 6

[Extra credit 1 (worth 30%)]: Repeat the model experiments above, but using 5-fold cross-validation at the student level instead of a single train/test hold-out.

## Question 7

[Extra credit 2 (worth additional 30%)]: Modify either the DKT or the sequence prediction model to predict both the skill and the correctness, given skill_correctness as the input sequence. Train using a 70/30 split or cross-validation and tune the parameters of this model. How does the accuracy of this model compare to the DKT and sequence prediction models?

## Question 8

[Extra credit 3 (worth 30%)]: Re-create the DKT skill prerequisite analysis by creating a skill-by-skill matrix with the influence calculation for each pair. Use your choice of cutoff and software to create the prerequisite skill graph visualization.

## Question 9

[Extra credit 4 (worth 30%)]: Re-tune the sequence and performance prediction models to use the assignment ID instead of the skill ID as the input. Keep the output the same as it was before. Compare accuracies between the two different inputs for each model.