In [38]:
import pandas as pd
import numpy as np
import scipy
import time
import random
import sys
# Import T. Kipf's GCN implementation
# https://github.com/tkipf/keras-gcn
sys.path.append('./keras-gcn/')
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical

from kegra.layers.graph import GraphConvolution
from kegra.utils import *
%matplotlib inline

In [39]:
# Answer history; later cells look a student up with `X[student]`.
X = pd.read_hdf('history_small.hdf', key='hist')

# Node adjacency, with its columns rearranged into sorted label order.
adjacency_raw = pd.read_pickle('adjacency_small.pkl')
column_order = sorted(adjacency_raw.columns)
A = adjacency_raw.loc[:, column_order]

### One student and the whole graph

In [40]:
# Pick one student's column and keep only the entries that hold a real answer
# (-100 is the placeholder for "no answer").
student = 142954.0
X_k_ = X[student]
answered = X_k_ != -100
X_k = X_k_[answered]
ex_k = X_k.index

In [41]:
# Two-class one-hot targets: column 0 <-> answer 0, column 1 <-> answer 1.
# Nodes whose value is -100 end up with an all-zero row, because the -100 dummy
# column is dropped by the `[[0, 1]]` selection.
# Three-class variant (-100 / 0 / 1), kept for reference:
#   y = pd.get_dummies(X_k_).values
#
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
y = pd.get_dummies(X_k_)[[0, 1]].values

# NOTE: `A` and `X` are rebound here from DataFrames to array types, so this cell
# cannot be re-run without first re-running the data-loading cell, and later cells
# can no longer use pandas indexing on `A` / `X`.
A = scipy.sparse.csr_matrix(A.values)
X = np.asmatrix(X_k_.to_frame().values)

In [42]:
# Sanity check: shapes first, then container types, of adjacency / features / targets.
arrays = (A, X, y)
print(*[arr.shape for arr in arrays])
print(*[type(arr) for arr in arrays])

(878, 878) (878, 1) (878, 2)
<class 'scipy.sparse.csr.csr_matrix'> <class 'numpy.matrixlib.defmatrix.matrix'> <class 'numpy.ndarray'>


In [43]:
# Row positions of the nodes that carry a real answer (anything other than -100).
ind = np.where(X != -100)[0]
ind

array([327, 328, 330, 333, 338, 340, 347, 349, 352, 355, 358, 364, 371,
       372, 380, 381, 385, 387, 436, 442])

In [44]:
def get_splits(y, n_train=15, n_val=5, n_test=100):
    """Randomly split the graph nodes into train / validation / test sets.

    Train and validation nodes are drawn (without overlap) from the labelled
    node positions held in the notebook-level array ``ind``; test nodes are drawn
    from all remaining positions ``0 .. y.shape[0] - 1``.

    Parameters
    ----------
    y : numpy.ndarray
        Target matrix with one row per node.
    n_train, n_val, n_test : int, optional
        Number of nodes in each split. The defaults reproduce the sizes that
        were previously hard-coded.

    Returns
    -------
    tuple
        ``(y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask)``.
        Each ``y_*`` is an int32 array shaped like ``y`` that is zero everywhere
        except on the rows of its split; each ``idx_*`` is a sorted list of node
        positions; ``train_mask`` is ``sample_mask(idx_train, y.shape[0])``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a split asks for more nodes than are
        available.

    Notes
    -----
    In this notebook the test nodes are exactly the nodes without an answer,
    whose rows in ``y`` are all zero, so ``y_test`` carries no label
    information and metrics computed on it should not be read as accuracy.
    """
    # Use the real node count instead of the former hard-coded 878.
    n_nodes = y.shape[0]

    labelled = list(ind)
    labelled_lookup = set(labelled)

    idx_train = sorted(random.sample(labelled, n_train))
    train_lookup = set(idx_train)
    idx_val = sorted(random.sample(
        [k for k in labelled if k not in train_lookup], n_val))
    idx_test = sorted(random.sample(
        [k for k in range(n_nodes) if k not in labelled_lookup], n_test))

    y_train = np.zeros(y.shape, dtype=np.int32)
    y_val = np.zeros(y.shape, dtype=np.int32)
    y_test = np.zeros(y.shape, dtype=np.int32)
    y_train[idx_train] = y[idx_train]
    y_val[idx_val] = y[idx_val]
    y_test[idx_test] = y[idx_test]

    train_mask = sample_mask(idx_train, n_nodes)
    return y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask

In [45]:
# Seed Python's RNG so that the random node split is the same on every
# Restart & Run All (the split is drawn with `random.sample`).
random.seed(42)
y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y)

In [46]:
# TODO: decide whether the node features X need to be normalized before training.

In [47]:
# Define parameters
# NOTE(review): DATASET, FILTER and MAX_DEGREE are not read by any cell visible in
# this notebook; they look like leftovers from the keras-gcn example script and
# 'cora' is not the data loaded above -- confirm before relying on them.
DATASET = 'cora'
FILTER = 'localpool'  # 'chebyshev'
MAX_DEGREE = 2  # maximum polynomial degree
SYM_NORM = True  # symmetric (True) vs. left-only (False) normalization
NB_EPOCH = 200  # upper bound on the number of training epochs
PATIENCE = 10  # early stopping patience

In [48]:
# One support matrix is used: the adjacency preprocessed according to SYM_NORM.
support = 1
A_ = preprocess_adj(A, SYM_NORM)

# Data fed to the model, in input order: node features first, then adjacency.
graph = [X, A_]

# Sparse placeholder through which the preprocessed adjacency enters the model.
adjacency_input = Input(shape=(None, None), batch_shape=(None, None), sparse=True)
G = [adjacency_input]

In [55]:
# Two graph-convolution layers, each preceded by dropout; the last one emits a
# softmax over the target classes.
n_features = X.shape[1]
n_classes = y.shape[1]

X_in = Input(shape=(n_features,))
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', W_regularizer=l2(5e-4))([H] + G)
H = Dropout(0.5)(H)
Y = GraphConvolution(n_classes, support, activation='softmax')([H] + G)

In [56]:
# Assemble the model from the feature input plus the adjacency input(s) and compile it
model_inputs = [X_in] + G
optimizer = Adam(lr=0.01)
model = Model(inputs=model_inputs, outputs=Y)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [57]:
# State carried across epochs by the early-stopping logic
best_val_loss = 99999
wait = 0
preds = None

# The whole graph is processed as one batch
full_batch = A.shape[0]
epoch_report = ("Epoch: {:04d} train_loss= {:.4f} train_acc= {:.4f} "
                "val_loss= {:.4f} val_acc= {:.4f} time= {:.4f}")

# Fit
for epoch in range(1, NB_EPOCH + 1):

    # Log wall-clock time
    t = time.time()

    # Single training iteration (we mask nodes without labels for loss calculation)
    model.fit(graph, y_train, sample_weight=train_mask,
              batch_size=full_batch, epochs=1, shuffle=False, verbose=0)

    # Predict on full dataset
    preds = model.predict(graph, batch_size=full_batch)

    # Train / validation scores
    train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val],
                                                   [idx_train, idx_val])
    train_loss, val_loss = train_val_loss
    train_acc, val_acc = train_val_acc
    print(epoch_report.format(epoch, train_loss, train_acc,
                              val_loss, val_acc, time.time() - t))

    # Early stopping: reset the counter on improvement, otherwise stop once the
    # counter has reached PATIENCE
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        wait = 0
    elif wait >= PATIENCE:
        print('Epoch {}: early stopping'.format(epoch))
        break
    else:
        wait += 1

# Testing (uses the predictions of the last epoch that ran)
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print("Test set results: loss= {:.4f} accuracy= {:.4f}".format(test_loss[0],
                                                               test_acc[0]))

Epoch: 0001 train_loss= 16.2609 train_acc= 0.7333 val_loss= 11.9722 val_acc= 0.8000 time= 0.9538
Epoch: 0002 train_loss= 15.7679 train_acc= 0.7333 val_loss= 11.6092 val_acc= 0.8000 time= 0.0674
Epoch: 0003 train_loss= 15.2761 train_acc= 0.7333 val_loss= 11.2471 val_acc= 0.8000 time= 0.0872
Epoch: 0004 train_loss= 14.7866 train_acc= 0.7333 val_loss= 10.8867 val_acc= 0.8000 time= 0.0880
Epoch: 0005 train_loss= 14.3000 train_acc= 0.7333 val_loss= 10.5285 val_acc= 0.8000 time= 0.0843
Epoch: 0006 train_loss= 13.9094 train_acc= 0.7333 val_loss= 10.2409 val_acc= 0.8000 time= 0.0894
Epoch: 0007 train_loss= 13.4924 train_acc= 0.7333 val_loss= 9.9339 val_acc= 0.8000 time= 0.0901
Epoch: 0008 train_loss= 13.0622 train_acc= 0.7333 val_loss= 9.6171 val_acc= 0.8000 time= 0.1007
Epoch: 0009 train_loss= 12.6242 train_acc= 0.7333 val_loss= 9.2946 val_acc= 0.8000 time= 0.1129
Epoch: 0010 train_loss= 12.1737 train_acc= 0.7333 val_loss= 8.9630 val_acc= 0.8000 time= 0.1134
Epoch: 0011 train_loss= 11.7618 tr

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [58]:
# Softmax score of class index 1 for every node
[row[1] for row in preds]

[0.37097481,
 0.37660295,
 0.37587154,
 0.37080279,
 0.37766799,
 0.37366334,
 0.37290075,
 0.37310949,
 0.36749214,
 0.37451833,
 0.36692548,
 0.36714947,
 0.37222794,
 0.36967167,
 0.37276649,
 0.36733121,
 0.36696759,
 0.36808732,
 0.367964,
 0.3720597,
 0.3707681,
 0.37176546,
 0.37223953,
 0.36846694,
 0.36900148,
 0.36560354,
 0.36901215,
 0.36683512,
 0.3707839,
 0.37318113,
 0.37498513,
 0.37095124,
 0.37210292,
 0.37378004,
 0.36589602,
 0.37163067,
 0.36798376,
 0.37338459,
 0.38131407,
 0.36630675,
 0.36618635,
 0.36944836,
 0.36863381,
 0.36945993,
 0.37089714,
 0.36902812,
 0.36884251,
 0.36929286,
 0.36850244,
 0.36604825,
 0.36576456,
 0.36841658,
 0.36959633,
 0.37376618,
 0.36794689,
 0.37534907,
 0.37000507,
 0.36972141,
 0.37251389,
 0.36936039,
 0.36592695,
 0.36582875,
 0.37035659,
 0.36613145,
 0.3691898,
 0.36625984,
 0.36589381,
 0.37096214,
 0.36582479,
 0.36927199,
 0.36766285,
 0.36901879,
 0.36893088,
 0.3769539,
 0.376425,
 0.3770858,
 0.37792671,
 0.375743

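Class-index legend for `preds`:

- three-class encoding (the commented-out `y`): index 0 = -100 (NaN), index 1 = 0 (not valid), index 2 = 1 (valid)
- two-class encoding (the `y` used here): index 0 = 0 (not valid), index 1 = 1 (valid)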

In [59]:
# Index of the highest-scoring class for every node
l = list(np.argmax(preds, axis=1))

In [60]:
# Distinct class indices that were predicted at least once
distinct_classes = set(l)
list(distinct_classes)

[0]

#### TESTS

In [None]:
student = 142954.0
# `X` was rebound to an (n_nodes, 1) numpy matrix earlier in the notebook, so it can
# no longer be indexed by student id. Read the history frame again under its own
# name instead of relying on whatever `X` currently holds.
history = pd.read_hdf('history_small.hdf', key='hist')
X_k_ = history[student]
# Keep only the entries with a real answer (-100 marks "no answer").
answered = X_k_ != -100
X_k = X_k_[answered]
ex_k = X_k.index

In [None]:
n_samples = 15  # nodes whose loss is kept by the training mask
full = 20       # nodes in the sub-graph

# `A` was converted to a scipy csr matrix earlier in the notebook and has no
# `.index` any more, so the labelled adjacency frame is read again here.
adjacency = pd.read_pickle('adjacency_small.pkl')

# `idx_full` used to be referenced before any cell defined it. The features and
# targets below are the first `full` answered entries, so the matching node ids
# are the first `full` entries of `ex_k`.
# NOTE(review): inferred from the surrounding code -- confirm this is the intent.
idx_full = list(ex_k[:full])

X_train = X_k[:full].values.reshape(full, 1)
y_train = X_k[:full].values.reshape(full, 1)

# Select rows and columns with the same id list so that adjacency rows, adjacency
# columns and feature rows all share one node order.
A_train = adjacency.loc[idx_full, idx_full]
A_train = scipy.sparse.csr_matrix(A_train.values)

# Use n_samples / full instead of repeating the literals 15 and 20.
train_mask = sample_mask(range(n_samples), A_train.shape[0]).reshape(full, 1)

In [None]:
# Shapes of adjacency, features, targets and mask, in that order
tuple(arr.shape for arr in (A_train, X_train, y_train, train_mask))

In [None]:
# Normalize X_train: divide by the total sum and make it a single-column matrix
n_rows = len(X_train)
X_train = (X_train / X_train.sum()).reshape((n_rows, 1))

# Graph inputs: one support matrix, symmetric normalization switched on
support = 1
A_ = preprocess_adj(A_train, True)
graph = [X_train, A_]
G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]
X_in = Input(shape=(X_train.shape[1],))

# Define model architecture
n_outputs = y_train.shape[1]
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', W_regularizer=l2(5e-4))([H] + G)
H = Dropout(0.5)(H)
Y = GraphConvolution(n_outputs, support, activation='softmax')([H] + G)

# Compile model
model_inputs = [X_in] + G
model = Model(inputs=model_inputs, outputs=Y)
model.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy')

In [None]:
# One full-batch training pass; the sample_weight=train_mask argument is left out here
fit_options = dict(batch_size=A_train.shape[0], epochs=1, shuffle=False, verbose=0)
h = model.fit(graph, y_train, **fit_options)

In [None]:
# `X` was rebound to an (n_nodes, 1) numpy matrix earlier in the notebook, so it can
# no longer be indexed by student id. Read the history frame again under its own
# name instead of relying on whatever `X` currently holds.
history = pd.read_hdf('history_small.hdf', key='hist')
student = 142954.0  # np.random.choice(history.columns)
X_k_ = history[student]
# Keep only the entries with a real answer (-100 marks "no answer").
answered = X_k_ != -100
X_k = X_k_[answered]
ex_k = X_k.index

# The student must have more than c + 1 answered entries.
c = 15
assert c + 1 < len(X_k), "student has too few answered entries"

In [None]:
def one_hot(x):
    """Encode a binary answer as a two-element one-hot list.

    Parameters
    ----------
    x : scalar
        The answer, 0 or 1 (any value comparing equal to them, e.g. ``1.0``).

    Returns
    -------
    list of int
        ``[1, 0]`` when ``x == 0`` and ``[0, 1]`` when ``x == 1``. A new list is
        returned on every call.

    Raises
    ------
    ValueError
        If ``x`` is neither 0 nor 1. Such values used to fall through and
        return ``None`` implicitly, which turns into NaN when assigned into a
        float array.
    """
    if x == 0:
        return [1, 0]
    if x == 1:
        return [0, 1]
    raise ValueError("one_hot expects 0 or 1, got {!r}".format(x))

In [None]:
n_samples = 15

# `A` was converted to a scipy csr matrix earlier in the notebook and has no
# `.index` any more, so the labelled adjacency frame is read again here.
adjacency = pd.read_pickle('adjacency_small.pkl')

# n_samples answered nodes for training plus one further answered node for testing;
# the test node sits at position n_samples of idx_full.
idx_train = random.sample(list(ex_k), n_samples)
idx_test = random.sample([k for k in ex_k if k not in idx_train], 1)
idx_full = np.hstack((idx_train, idx_test))

X_train = X_k.loc[idx_full].values
#X_train = to_categorical(X_train)
#y_train = X_k.loc[idx_test]
#y_train = np.array(y_train).reshape((len(y_train), 1))

# Only the test node gets a one-hot target; every other row stays zero.
y_train = np.zeros((X_train.shape[0], 2))
y_train[n_samples] = one_hot(X_train[n_samples])

# Select rows AND columns in idx_full order. The previous
# `A[A.index.isin(idx_full)][sorted(idx_full)]` left the rows in the frame's own
# order and the columns in sorted order, while X_train / y_train follow the
# (random) idx_full order, so features and labels were attached to the wrong nodes.
A_train = adjacency.loc[idx_full, idx_full]
A_train = scipy.sparse.csr_matrix(A_train.values)

In [None]:
# Shapes of adjacency, features and targets, in that order
tuple(arr.shape for arr in (A_train, X_train, y_train))

In [None]:
# Boolean mask: True for the first n_samples nodes, False for the single last node.
# NOTE(review): the only non-zero row of y_train is the last node, which this mask
# switches off -- confirm that the mask is not meant to be the other way round.
train_mask = np.array([True] * n_samples + [False])

In [None]:
# Normalize X_train: divide by the total sum and make it a single-column matrix
n_rows = len(X_train)
X_train = (X_train / X_train.sum()).reshape((n_rows, 1))

In [None]:
# One support matrix: the sub-graph adjacency, preprocessed with the symmetric flag set
support = 1
A_ = preprocess_adj(A_train, True)

# Data fed to the model, in input order: node features first, then adjacency
graph = [X_train, A_]

# Sparse placeholder through which the preprocessed adjacency enters the model
adjacency_input = Input(shape=(None, None), batch_shape=(None, None), sparse=True)
G = [adjacency_input]

In [None]:
n_features = X_train.shape[1]
n_outputs = y_train.shape[1]
X_in = Input(shape=(n_features,))

# Define model architecture: dropout -> graph conv (relu) -> dropout -> graph conv (softmax)
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', W_regularizer=l2(5e-4))([H] + G)
H = Dropout(0.5)(H)
Y = GraphConvolution(n_outputs, support, activation='softmax')([H] + G)

# Compile model
model_inputs = [X_in] + G
model = Model(inputs=model_inputs, outputs=Y)
model.compile(optimizer=Adam(lr=0.01), loss='categorical_crossentropy')

In [None]:
# One full-batch training pass, with per-node loss weights taken from train_mask
fit_options = dict(sample_weight=train_mask, batch_size=A_train.shape[0],
                   epochs=1, shuffle=False, verbose=0)
h = model.fit(graph, y_train, **fit_options)

In [None]:
# Model outputs for the whole sub-graph, computed in a single batch
n_graph_nodes = A_train.shape[0]
preds = model.predict(graph, batch_size=n_graph_nodes)

In [None]:
# Display the raw outputs returned by model.predict for the sub-graph.
preds