In [26]:
# Import T. Kipf's GCN implementation
# https://github.com/tkipf/keras-gcn
import pandas as pd
import numpy as np
import scipy
import time
import random
import sys
import keras
sys.path.append('./keras-gcn/')
from keras.losses import mean_absolute_error
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical
import keras.backend as K
import tensorflow as tf
from kegra.layers.graph import GraphConvolution
from kegra.utils import *
%matplotlib inline

In [27]:
X = pd.read_hdf('history_small.hdf', key='hist') 
A = pd.read_pickle('adjacency_small.pkl') 
A = A[sorted(A.columns)]

In [28]:
X.shape, A.shape

((878, 290), (878, 878))

## Data preprocess

In [29]:
X[X==1] = 10 #10 #valid
X[X==0] = 1 #not valid
X[X==-100] = 0 #missing
X = X.astype(float)
np.unique(X)

array([  0.,   1.,  10.])

In [30]:
y = X.values.flatten()
y[y==0]=np.nan

#### To find index of student that answered to most questions

In [31]:
for col in X:
    if (len(X[col][X[col]!=0])> 500):
        print(col)
        break

123482.0


In [32]:
np.where(X[123482]!=0)[0].shape

(657,)

In [33]:
np.where(X.columns==123482.0)

(array([80]),)

### output : flattened input

In [34]:
#y = X.values.ravel(order = 'F') #len(y) =n*m : 878*290 #rempli en colonne de sorte que 
                                        #per user_id, the responses to all the exercise_id (valid/not valid/missing)
#np.where(y!=0)[0].shape #8821 non null entries (1 or 10)
#y[y==0] = np.nan

In [35]:
l = []
for col in X : 
    a = pd.get_dummies(X[col])
    if (1 not in a.columns) & (10 in a.columns):
        a[1] = np.zeros(a.shape[0],dtype=int)
    if (1 in a.columns) & (10 not in a.columns):
        a[10] = np.zeros(a.shape[0],dtype=int)
    l.append(a[[1,10]])
X = pd.concat(l, axis=1, keys=X.columns)
X.columns = [str(int(col[0]))+'_'+str(col[1]) for col in X.columns.values]

In [36]:
A = scipy.sparse.csr_matrix(A.values)
X = np.asmatrix(X)

In [37]:
print(X.shape, A.shape, y.shape) #n*2m, n*m
print(type(X), type(A), type(y)) 

(878, 580) (878, 878) (254620,)
<class 'numpy.matrixlib.defmatrix.matrix'> <class 'scipy.sparse.csr.csr_matrix'> <class 'numpy.ndarray'>


## Training GCN

In [38]:
#def get_splits(y):
#    idx_train = range(878)
#    idx_val = range(878)#range(878, 1756)
#    idx_test = range(878)#range(1756, 2634)
#    y_train = np.zeros(len(idx_train), dtype=np.int32)
#    y_val = np.zeros(len(idx_val), dtype=np.int32)
#    y_test = np.zeros(len(idx_test), dtype=np.int32)
#    y_train = y[idx_train]
#    y_val = y[idx_val]
#    y_test = y[idx_test]
#    train_mask = np.array(np.ones(len(range(878))), dtype=np.bool)
#    return y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask

#y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y)

In [39]:
FILTER = 'localpool'  
MAX_DEGREE = 2 
SYM_NORM = True  
NB_EPOCH = 200
PATIENCE = 10  

# Normalize X
X = X/X.sum(1).reshape(-1, 1)

A_ = preprocess_adj(A, SYM_NORM)
support = 1
graph = [X, A_]
G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]

In [40]:
y_ = pd.get_dummies(y).as_matrix()

In [41]:
#np.where(y!=0)[0][878:1756].shape
#idx_train = np.where(y!=0)[0][:878]
#idx_val = np.where(y!=0)[0][878:1756].shape
#idx_test = np.where(y!=0)[0][1756:2634].shape

In [42]:
idx_train = range(70240,71118)
idx_val = range(70240,71118)
idx_test = range(70240,71118)

y_train = y_[idx_train]
y_test = y_[idx_test]
y_val = y_[idx_val]

#list_ind = np.where(y_train!=0)[0]
#graph_train = [graph[0][list_ind], graph[1][list_ind][:,list_ind]]

In [55]:
X_in = Input(shape=(X.shape[1],))
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', W_regularizer=l2(5e-4))([H]+G)
H = Dropout(0.5)(H)
Y = GraphConvolution(y_.shape[1], support, activation='softmax')([H]+G)

In [56]:
#b = loss_(y_train, y_train)
#sess = tf.Session()
#type(b.eval(session=sess))

In [62]:
def loss_(y_true, y_pred):
    list_ind = np.where(y_train!=np.nan)[0]
    mask = np.zeros((y_train.shape[0],1))
    mask[list_ind] = 1
    #mask = keras.utils.to_categorical(mask)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)

In [63]:
# Compile model
model = Model(inputs=[X_in]+G, outputs=Y)
model.compile(loss=loss_, optimizer=Adam(lr=0.01))

In [64]:
#def accuracy(preds, labels):
#    return np.mean(np.equal(np.argmax(labels, 1), np.argmax(preds, 1)))

#def evaluate_preds(preds, labels, indices):

#    split_loss = list()
#    split_acc = list()

#    for y_split, idx_split in zip(labels, indices):
#        split_loss.append(categorical_crossentropy(preds[idx_split], y_split[idx_split]))
#        split_acc.append(accuracy(preds[idx_split], y_split[idx_split]))

#    return split_loss, split_acc

In [65]:
train_mask = sample_mask(np.where(y_train!=0)[0], y_train.shape[0])

In [66]:
wait = 0
preds = None
best_val_loss = 99999

# Fit
for epoch in range(1, NB_EPOCH+1):
    t = time.time()
    model.fit(graph, y_train, sample_weight=train_mask,
              batch_size=A.shape[0], epochs=1, shuffle=False, verbose=1)
    preds = model.predict(graph, batch_size=A.shape[0])
    
    # Train / validation scores
    #train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val],
    #                                               [idx_train, idx_val])
    #print("Epoch: {:04d}".format(epoch),
    #      "train_loss= {:.4f}".format(train_val_loss[0]),
    #      "train_acc= {:.4f}".format(train_val_acc[0]),
    #      "val_loss= {:.4f}".format(train_val_loss[1]),
    #      "val_acc= {:.4f}".format(train_val_acc[1]),
    #      "time= {:.4f}".format(time.time() - t))

    # Early stopping
    #if train_val_loss[1] < best_val_loss:
    #    best_val_loss = train_val_loss[1]
    #    wait = 0
    #else:
    #    if wait >= PATIENCE:
    #        print('Epoch {}: early stopping'.format(epoch))
    #        break
    #    wait += 1

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


KeyboardInterrupt: 

In [67]:
preds[:,1]

array([ 0.51252425,  0.51201069,  0.51208144,  0.51253653,  0.5118041 ,
        0.51229376,  0.51236653,  0.51231217,  0.5128516 ,  0.51212335,
        0.51289302,  0.51288599,  0.51230735,  0.51267272,  0.51229095,
        0.51287019,  0.51290262,  0.51276314,  0.51281542,  0.51243496,
        0.51257396,  0.51234871,  0.51234627,  0.51270264,  0.51264107,
        0.51303852,  0.51271826,  0.51289207,  0.51253009,  0.51228434,
        0.51204091,  0.51244682,  0.51232362,  0.51215893,  0.51301086,
        0.51243871,  0.51278168,  0.51219296,  0.51138586,  0.51297837,
        0.51298803,  0.51266664,  0.51276958,  0.5126183 ,  0.51244164,
        0.51271659,  0.51268536,  0.51269186,  0.51270908,  0.51299679,
        0.51302654,  0.51273096,  0.51258332,  0.51214987,  0.51280242,
        0.51203448,  0.51253247,  0.51257855,  0.51229656,  0.51260686,
        0.51299316,  0.51301652,  0.51249737,  0.51299036,  0.51263297,
        0.51297849,  0.51299918,  0.51243758,  0.51301086,  0.51

In [None]:
preds = model.predict(graph, batch_size=A.shape[0])

In [223]:
# Testing
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print("Test set results:",
      "loss= {:.4f}".format(test_loss[0]),
      "accuracy= {:.4f}".format(test_acc[0]))

Test set results: loss= 0.6414 accuracy= 0.0023
