In [1]:
import pandas as pd
import numpy as np
import scipy
import time
import random
import sys
# Import T. Kipf's GCN implementation
# https://github.com/tkipf/keras-gcn
sys.path.append('./keras-gcn/')
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical
import keras.backend as K
import tensorflow as tf
from kegra.layers.graph import GraphConvolution
from kegra.utils import *
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Load the response table (stored under key 'hist') and the adjacency table.
# NOTE(security): read_pickle can execute arbitrary code while loading;
# only use it on a pickle file that comes from a trusted source.
X = pd.read_hdf('history_small.hdf', key='hist') 
A = pd.read_pickle('adjacency_small.pkl') 
# Reorder the adjacency columns by sorted label. Only the columns are
# reordered here; the rows keep their stored order.
# NOTE(review): confirm the row index of A (and of X) is already in that same order.
A = A[sorted(A.columns)]

In [3]:
# Sanity check: X and the square matrix A should have the same number of rows.
X.shape, A.shape

((878, 290), (878, 878))

## Data preprocessing

In [4]:
# Recode the raw responses in place so that 0 can stand for "missing".
# The order matters: 1 -> 10 has to run before 0 -> 1, otherwise the freshly
# written 1s would be promoted to 10 as well.
# NOTE: this mutates X in place and is not idempotent; re-running the cell
# without reloading X applies the recoding a second time.
X[X==1] = 10 # valid
X[X==0] = 1 # not valid
X[X==-100] = 0 # missing
# Display the distinct codes that remain after recoding.
np.unique(X)

array([ 0,  1, 10])

In [5]:
# Flatten the recoded response table column by column (order='F'): the first
# X.shape[0] entries of y are column 0 for every row, then column 1, and so on.
# len(y) = n*m = 878*290.
# NOTE(review): the original note says the column-wise fill is meant to group
# the responses (valid / not valid / missing) per user_id over all exercise_id;
# confirm which axis of X holds users and which holds exercises.
y = X.values.ravel(order = 'F')
# np.where(y!=0)[0].shape  -> 8821 non-zero entries (1 or 10)

In [6]:
# One-hot encode every response column into exactly two indicator columns,
# "<col>_1" (code 1, not valid) and "<col>_10" (code 10, valid).
# Entries coded 0 (missing) end up as 0 in both indicators.
indicator_frames = []
for col in X:
    # astype(int) gives a plain integer dtype whatever get_dummies returns
    # (uint8 or bool depending on the pandas version).
    # reindex keeps only the levels 1 and 10, in that order, and creates an
    # all-zero column for any level that never occurs in this column. This
    # also covers a column that contains neither level (all missing), which
    # previously raised a KeyError.
    dummies = (
        pd.get_dummies(X[col])
        .astype(int)
        .reindex(columns=[1, 10], fill_value=0)
    )
    indicator_frames.append(dummies)

# keys=X.columns produces (original column, level) pairs as column labels ...
X = pd.concat(indicator_frames, axis=1, keys=X.columns)
# ... which are then flattened to "<original column>_<level>" strings.
X.columns = [str(int(col[0]))+'_'+str(col[1]) for col in X.columns.values]

In [7]:
# Convert to the container types used from here on: a SciPy CSR sparse matrix
# for the adjacency and a NumPy matrix for the features.
A = scipy.sparse.csr_matrix(A.values)
X = np.asmatrix(X)

In [8]:
# Confirm shapes and container types before building the model.
print(X.shape, A.shape, y.shape) # X: n x 2m, A: n x n, y: n*m
print(type(X), type(A), type(y)) 

(878, 580) (878, 878) (254620,)
<class 'numpy.matrixlib.defmatrix.matrix'> <class 'scipy.sparse.csr.csr_matrix'> <class 'numpy.ndarray'>


## Training GCN

In [9]:
def sample_mask(idx, l):
    """Create a boolean mask that is True at the given positions.

    Parameters
    ----------
    idx : index or sequence of indices
        Positions to mark; anything NumPy accepts as an index
        (int, list, range, integer array).
    l : int or tuple of int
        Length (or shape) of the mask to create.

    Returns
    -------
    numpy.ndarray of bool
        Array of shape ``l`` that is True at ``idx`` and False elsewhere.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

def get_splits(y, n_nodes=878):
    """Build the train / validation / test labels, indices and training mask.

    All three splits currently cover the same first ``n_nodes`` entries of
    ``y``, so the validation and test scores are computed on the training
    data.
    NOTE(review): the disjoint ranges range(878, 1756) and range(1756, 2634)
    were left commented out in the original; confirm whether held-out splits
    are intended.

    Parameters
    ----------
    y : numpy.ndarray
        Flat label vector; it must contain at least ``n_nodes`` entries.
    n_nodes : int, optional
        Number of leading entries of ``y`` used for every split.
        Defaults to 878, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask)``
        where the ``y_*`` are label arrays, the ``idx_*`` are ``range``
        objects and ``train_mask`` is an all-True boolean array of length
        ``n_nodes``.
    """
    idx_train = range(n_nodes)
    idx_val = range(n_nodes)
    idx_test = range(n_nodes)

    # Indexing with a range makes an independent copy for each split.
    y_train = y[idx_train]
    y_val = y[idx_val]
    y_test = y[idx_test]

    # Builtin `bool` instead of the removed `np.bool` alias (NumPy >= 1.24).
    train_mask = np.ones(n_nodes, dtype=bool)
    return y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask

In [10]:
y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y)

In [11]:
# Hyper-parameters.
# NOTE(review): FILTER and MAX_DEGREE are not read anywhere in the visible
# cells; they are kept in case a later cell uses them.
FILTER = 'localpool'
MAX_DEGREE = 2
SYM_NORM = True
NB_EPOCH = 200
PATIENCE = 10

# Row-normalize X so that every row sums to 1.
# A row with no observed response sums to 0; dividing by it would put NaNs
# into the features, so such rows are divided by 1 and stay all-zero.
row_sums = np.asarray(X.sum(1)).reshape(-1, 1)
row_sums[row_sums == 0] = 1
X = X / row_sums

# preprocess_adj is not defined in this notebook (presumably it comes from
# `from kegra.utils import *`); it receives the sparse adjacency and SYM_NORM.
A_ = preprocess_adj(A, SYM_NORM)
support = 1
# Model inputs: node features plus the preprocessed adjacency.
graph = [X, A_]
# Sparse placeholder through which the adjacency is fed to the graph layers.
G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]

In [12]:
# Two graph-convolution layers with dropout before each of them:
# 16 hidden units with ReLU, then a single output unit per node.
X_in = Input(shape=(X.shape[1],))
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', W_regularizer=l2(5e-4))([H]+G)
H = Dropout(0.5)(H)
# The output layer has ONE unit. A softmax over a single unit is always
# exactly 1.0, so the prediction would be constant and nothing could be
# learned. A sigmoid gives a per-node probability, which is what the
# binary cross-entropy loss expects.
Y = GraphConvolution(1, support, activation='sigmoid')([H]+G)

In [13]:
def loss_(y_true, y_pred):
    """Binary cross-entropy averaged over the observed entries only.

    Label coding used by this notebook: 0 = missing, 1 = not valid,
    10 = valid. Entries coded 0 are excluded from the loss; the remaining
    codes are mapped to binary targets (10 -> 1.0, 1 -> 0.0), because
    binary cross-entropy expects targets in [0, 1].

    Parameters
    ----------
    y_true : tensor
        Labels coded 0 / 1 / 10, with the same shape as ``y_pred``.
    y_pred : tensor
        Predicted probabilities in [0, 1].

    Returns
    -------
    tensor
        Scalar mean cross-entropy over the entries whose label is not 0
        (0.0 when every entry is missing).
    """
    # The original indexed with tf.gather using the output of tf.where.
    # For a 2-D y_true that index has (row, col) pairs, so every selection
    # also pulled in row 0. A multiplicative mask avoids index handling and
    # keeps the shapes of y_true and y_pred aligned.
    observed = K.cast(K.not_equal(y_true, 0), K.floatx())
    target = K.cast(K.equal(y_true, 10), K.floatx())
    per_entry = K.binary_crossentropy(target, y_pred)
    # Guard the denominator so an all-missing batch gives 0 instead of NaN.
    n_observed = K.maximum(K.sum(observed), 1.0)
    return K.sum(per_entry * observed) / n_observed

In [14]:
# Compile model
# Inputs are the feature input plus the graph input(s) in G; training uses
# the custom loss_ defined in this notebook and Adam with learning rate 0.01.
model = Model(inputs=[X_in]+G, outputs=Y)
model.compile(loss=loss_, optimizer=Adam(lr=0.01))

In [None]:
# Early-stopping bookkeeping: number of non-improving epochs so far, the most
# recent predictions, and the best validation loss seen.
wait = 0
preds = None
best_val_loss = 99999

# Fit
for epoch in range(1, NB_EPOCH+1):
    t = time.time()
    # Each fit() call runs a single epoch (epochs=1) with the batch size set
    # to the number of rows of A and shuffling disabled.
    model.fit(graph, y_train, sample_weight=train_mask,
              batch_size=A.shape[0], epochs=1, shuffle=False, verbose=1)
    # Predictions are kept in `preds` so the test cell can reuse them.
    preds = model.predict(graph, batch_size=A.shape[0])
    
    # Train / validation scores
    # NOTE(review): evaluate_preds is not defined in this notebook (presumably
    # it comes from `from kegra.utils import *`); confirm it accepts the 1-D
    # integer labels and single-column predictions used here.
    train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val],
                                                   [idx_train, idx_val])
    print("Epoch: {:04d}".format(epoch),
          "train_loss= {:.4f}".format(train_val_loss[0]),
          "train_acc= {:.4f}".format(train_val_acc[0]),
          "val_loss= {:.4f}".format(train_val_loss[1]),
          "val_acc= {:.4f}".format(train_val_acc[1]),
          "time= {:.4f}".format(time.time() - t))

    # Early stopping
    # `wait` is checked before it is incremented, so training stops on the
    # (PATIENCE + 1)-th consecutive epoch without a new best validation loss.
    if train_val_loss[1] < best_val_loss:
        best_val_loss = train_val_loss[1]
        wait = 0
    else:
        if wait >= PATIENCE:
            print('Epoch {}: early stopping'.format(epoch))
            break
        wait += 1

In [None]:
# Testing
# Scores `preds`, i.e. the predictions left over from the last epoch that the
# training loop executed.
# NOTE(review): get_splits gives idx_test the same index range as idx_train,
# so this is not a held-out score.
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print("Test set results:",
      "loss= {:.4f}".format(test_loss[0]),
      "accuracy= {:.4f}".format(test_acc[0]))