In [4]:
import os
import sys
import numpy as np

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Masking, Dense
from keras.layers.recurrent import LSTM

from keras import backend as K
from sklearn.metrics import roc_auc_score

import tensorflow as Th

import random
import math
import argparse
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style

from keras.layers.wrappers import TimeDistributed

def read_file(dataset_path):
    seqs_by_student = {}
    problem_ids = {}
    next_problem_id = 0
    with open(dataset_path, 'r') as f:
        for line in f:
            student, problem, is_correct = line.strip().split(' ')
            student = int(student)
            if student not in seqs_by_student:
                seqs_by_student[student] = []
            if problem not in problem_ids:
                problem_ids[problem] = next_problem_id
                next_problem_id += 1
            seqs_by_student[student].append((problem_ids[problem], int(is_correct == '1')))
    
    sorted_keys = sorted(seqs_by_student.keys())
    return [seqs_by_student[k] for k in sorted_keys], next_problem_id


def load_dataset(dataset, split_file):
    seqs, num_skills = read_file(dataset)
    
    with open(split_file, 'r') as f:
        student_assignment = f.read().split(' ')
    
    print(seqs)
    training_seqs = [seqs[i] for i in xrange(0, len(seqs)) if student_assignment[i] == '1']
    testing_seqs = [seqs[i] for i in xrange(0, len(seqs)) if student_assignment[i] == '0']

def main():
    
    dataset = '/Users/subleenkaur/Downloads/2012-2013-data-with-predictions-4-final/my_assistments.txt'
    split_file = '/Users/subleenkaur/Downloads/2012-2013-data-with-predictions-4-final/my_assistments_split.txt'
    hidden_units = 128
    batch_size = 5
    time_window = 100
    epochs = 50

    model_file = dataset + '.model_weights'
    history_file = dataset + '.history'
    preds_file = dataset + '.preds'

    overall_loss = [0.0]
    preds = []
    history = []
    
    training_seqs = ['1','2','4']
    testing_seqs = ['3','5']
    
    print(type(overall_loss))

    num_skills = 3
    training_seqs = len(training_seqs)
    testing_seqs = len(testing_seqs)
    
    # load dataset
    #training_seqs, testing_seqs, num_skills = load_dataset(dataset, split_file)
    #print "Training Sequences: %d" % len(training_seqs)
    #print "Testing Sequences: %d" % len(testing_seqs)
    #print "Number of skills: %d" % num_skills
    
    def loss_function(y_true, y_pred):
        skill = y_true[:,:,0:num_skills]
        obs = y_true[:,:,num_skills]
        rel_pred = Th.reduce_sum(y_pred * skill, 2)
        
        # keras implementation does a mean on the last dimension (axis=-1) which
        # it assumes is a singleton dimension. But in our context that would
        # be wrong.
        return K.binary_crossentropy(rel_pred, obs)
    
    # build model
    model = Sequential()
    
    # ignore padding
    model.add(Masking(-1.0, batch_input_shape=(batch_size, time_window, 3*2)))

    # lstm configured to keep states between batches
    model.add(LSTM(input_dim = num_skills*2, 
                   output_dim = hidden_units, 
                   return_sequences=True,
                   batch_input_shape=(batch_size, time_window, 3*2),
                   stateful = True
    ))
    
    # readout layer. TimeDistributedDense uses the same weights for all
    # time steps.
    model.add(TimeDistributed(Dense(input_dim = hidden_units, 
        output_dim = num_skills, activation='sigmoid')))
   


    # optimize with rmsprop which dynamically adapts the learning
    # rate of each weight.
    model.compile(loss=loss_function,optimizer='rmsprop')
  

    # training function
    def trainer(X, Y):
        print(model.train_on_batch(X,Y))
        val = np.asscalar(model.train_on_batch(X,Y))
        overall_loss[0] += val
  

    # prediction
    def predictor(X, Y):
        batch_activations = model.predict_on_batch(X)
        skill = Y[:,:,0:num_skills]
        obs = Y[:,:,num_skills]
        y_pred = np.squeeze(np.array(batch_activations))
        
        rel_pred = np.sum(y_pred * skill, axis=2)
        
        for b in xrange(0, X.shape[0]):
            for t in xrange(0, X.shape[1]):
                if X[b, t, 0] == -1.0:
                    continue
                preds.append((rel_pred[b][t], obs[b][t]))
    
    
    
    # call when prediction batch is finished
    # resets LSTM state because we are done with all sequences in the batch
    def finished_prediction_batch(percent_done):
        model.reset_states()
    
    
    # similiar to the above
    def finished_batch(percent_done):
        print "(%4.3f %%) %f" % (percent_done, overall_loss[0])
        model.reset_states()
    
    
    # run the model
    for e in xrange(0, epochs):
        model.reset_states()
        
        # train
        run_func(training_seqs, num_skills, trainer, batch_size, time_window, finished_batch)
        
        model.reset_states()
        
        # test
        run_func(testing_seqs, num_skills, predictor, batch_size, time_window, finished_prediction_batch)
        
        # compute AUC
        auc = roc_auc_score([p[1] for p in preds], [p[0] for p in preds])
        
        # log
        history.append((overall_loss[0], auc))
        
        # save model
        model.save_weights(model_file, overwrite=True)
        print "==== Epoch: %d, Test AUC: %f" % (e, auc)
        
        # reset loss
        overall_loss[0] = 0.0
        
        # save predictions and plot graph
        with open(preds_file, 'w') as f:
            f.write('was_heldout\tprob_recall\tstudent_recalled\n')
            for pred in preds:
                f.write('1\t%f\t%d\n' % (pred[0], pred[1]))
        
        with open(history_file, 'w') as f:
            for h in history:
                f.write('\t'.join([str(he) for he in h]))
                f.write('\n')
        
        # clear preds
        preds = []
        
def run_func(seqs, num_skills, f, batch_size, time_window, batch_done = None):
    
    print('seqs')
    print(seqs)
    seqs =[[(0, 1), (1, 1), (2, 0)], 
           [(0, 0), (1, 1), (2, 1)], 
           [(0, 1), (1, 0), (2, 1)], 
           [(0, 1), (1, 1), (1, 1)], 
           [(0, 0), (1, 0), (2, 1)]]
    
    assert(min([len(s) for s in seqs]) > 0)
    
    # randomize samples
    seqs = seqs[:]
    random.shuffle(seqs)
    
    processed = 0
    for start_from in xrange(0, len(seqs), batch_size):
       end_before = min(len(seqs), start_from + batch_size)
       x = []
       y = []
       for seq in seqs[start_from:end_before]:
           x_seq = []
           y_seq = []
           xt_zeros = [0 for i in xrange(0, num_skills*2)]
           ct_zeros = [0 for i in xrange(0, num_skills+1)]
           xt = xt_zeros[:]
           for skill, is_correct in seq:
               x_seq.append(xt)
               
               ct = ct_zeros[:]
               ct[skill] = 1
               ct[num_skills] = is_correct
               y_seq.append(ct)
               
               # one hot encoding of (last_skill, is_correct)
               pos = skill * 2 + is_correct
               xt = xt_zeros[:]
               xt[pos] = 1
               
           x.append(x_seq)
           y.append(y_seq)
       
       maxlen = max([len(s) for s in x])
       maxlen = round_to_multiple(maxlen, time_window)
       # fill up the batch if necessary
       if len(x) < batch_size:
            for e in xrange(0, batch_size - len(x)):
                x_seq = []
                y_seq = []
                for t in xrange(0, time_window):
                    x_seq.append([-1.0 for i in xrange(0, num_skills*2)])
                    y_seq.append([0.0 for i in xrange(0, num_skills+1)])
                x.append(x_seq)
                y.append(y_seq)
        
       X = pad_sequences(x, padding='post', maxlen = maxlen, dim=num_skills*2, value=-1.0)
       Y = pad_sequences(y, padding='post', maxlen = maxlen, dim=num_skills+1, value=-1.0)
        
       for t in xrange(0, maxlen, time_window):
           f(X[:,t:(t+time_window),:], Y[:,t:(t+time_window),:])
           
       processed += end_before - start_from
       
       # reset the states for the next batch of sequences
       if batch_done:
           batch_done((processed * 100.0) / len(seqs))
        
def round_to_multiple(x, base):
    return int(base * math.ceil(float(x)/base))

# https://groups.google.com/forum/#!msg/keras-users/7sw0kvhDqCw/QmDMX952tq8J
def pad_sequences(sequences, maxlen=None, dim=1, dtype='int32',
    padding='pre', truncating='pre', value=0.):
    '''
        Override keras method to allow multiple feature dimensions.
        @dim: input feature dimension (number of features per timestep)
    '''
    lengths = [len(s) for s in sequences]

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    x = (np.ones((nb_samples, maxlen, dim)) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError("Truncating type '%s' not understood" % padding)

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError("Padding type '%s' not understood" % padding)
    return x


In [5]:
main()

<type 'list'>




seqs
3
0.693065
(100.000 %) 0.683660
seqs
2
==== Epoch: 0, Test AUC: 0.650000
seqs
3
0.682026
(100.000 %) 0.665668
seqs
2
==== Epoch: 1, Test AUC: 0.700000
seqs
3
0.674965
(100.000 %) 0.651343
seqs
2
==== Epoch: 2, Test AUC: 0.690000
seqs
3
0.66871
(100.000 %) 0.637414
seqs
2
==== Epoch: 3, Test AUC: 0.690000
seqs
3
0.662744
(100.000 %) 0.623193
seqs
2
==== Epoch: 4, Test AUC: 0.690000
seqs
3
0.656918
(100.000 %) 0.608650
seqs
2
==== Epoch: 5, Test AUC: 0.690000
seqs
3
0.651236
(100.000 %) 0.594200
seqs
2
==== Epoch: 6, Test AUC: 0.690000
seqs
3
0.645803
(100.000 %) 0.580488
seqs
2
==== Epoch: 7, Test AUC: 0.690000
seqs
3
0.640785
(100.000 %) 0.567847
seqs
2
==== Epoch: 8, Test AUC: 0.690000
seqs
3
0.636309
(100.000 %) 0.555735
seqs
2
==== Epoch: 9, Test AUC: 0.690000
seqs
3
0.632337
(100.000 %) 0.543114
seqs
2
==== Epoch: 10, Test AUC: 0.690000
seqs
3
0.628699
(100.000 %) 0.529307
seqs
2
==== Epoch: 11, Test AUC: 0.690000
seqs
3
0.625226
(100.000 %) 0.514128
seqs
2
==== Epoch: 12, Tes