In [22]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Here we initialize the parameters for showing "better" plots

In [23]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from skimage.util import montage
from IPython.display import Image, display, SVG, clear_output, HTML
plt.rcParams["figure.figsize"] = (6, 6)
plt.rcParams["figure.dpi"] = 125
plt.rcParams["font.size"] = 14
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
plt.style.use('ggplot')
sns.set_style("whitegrid", {'axes.grid': False})
plt.rcParams['image.cmap'] = 'gray' # grayscale looks better
import networkx as nx
def draw_graph_mpl(g, pos=None, ax=None, layout_func=nx.drawing.layout.kamada_kawai_layout, draw_labels=True):
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(20, 20))
    else:
        fig = None
    if pos is None:
        pos = layout_func(g)
    node_color = []
    node_labels = {}
    shift_pos = {}
    for k in g:
        node_color.append(g.nodes[k].get('color', 'green'))
        node_labels[k] = g.nodes[k].get('label', k)
        shift_pos[k] = [pos[k][0], pos[k][1]]
    
    edge_color = []
    edge_width = []
    for e in g.edges():
        edge_color.append(g.edges[e].get('color', 'black'))
        edge_width.append(g.edges[e].get('width', 0.5))
    nx.draw_networkx_edges(g, pos, style='--', edge_color=edge_color, width=edge_width, alpha=0.5, ax=ax)
    nx.draw_networkx_nodes(g, pos, node_color=node_color, node_shape='p', node_size=300, alpha=0.75, ax=ax)
    if draw_labels:
        nx.draw_networkx_labels(g, shift_pos, labels=node_labels, ax=ax)
    ax.autoscale()
    return fig, ax, pos

In [1]:
import numpy as np
import pandas as pd

from keras import Input, Model
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from keras.regularizers import l2

from spektral.datasets import mnist
from spektral.utils.sparse import sp_matrix_to_sp_tensor
from spektral.utils import normalized_laplacian
from spektral.data import MixedLoader, BatchLoader
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import sparse_categorical_accuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

from spektral.data import MixedLoader
from spektral.datasets.mnist import MNIST
from spektral.layers import GATConv, GCNConv, GeneralConv, GlobalSumPool
from spektral.utils.sparse import sp_matrix_to_sp_tensor

# Parameters
batch_size = 50  # Batch size
epochs = 1000  # Number of training epochs
patience = 10  # Patience for early stopping
l2_reg = 5e-4  # Regularization rate for l2



In [2]:
data = MNIST()
adj=data.a

# Create filter for GCN and convert to sparse tensor.
data.a = GeneralConv.preprocess(data.a)
data.a = sp_matrix_to_sp_tensor(data.a)

# Train/valid/test split
data_tr, data_te = data[:-10000], data[-10000:]
data_tr, data_va = data_tr[:-10000], data_tr[-10000:]

# We use a MixedLoader since the dataset is in mixed mode
loader_tr = MixedLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = MixedLoader(data_va, batch_size=batch_size)
loader_te = MixedLoader(data_te, batch_size=batch_size)

# Specify device to be used by TensorFlow
device = '/device:GPU:0'

# Build model
class Net(Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.conv1 = GeneralConv(20, activation="elu", kernel_regularizer=l2(l2_reg), use_bias=True)
        self.conv2 = GeneralConv(20, activation="elu", kernel_regularizer=l2(l2_reg), use_bias=True)
        self.flatten = Flatten()
        self.fc1 = Dense(256, activation="relu")
        self.fc2 = Dense(10, activation="softmax")  # MNIST has 10 classes

    def call(self, inputs):
        x, a = inputs
        with tf.device(device):
            x = self.conv1([x, a])
            x = self.conv2([x, a])
            output = self.flatten(x)
            output = self.fc1(output)
            output = self.fc2(output)

        return output

# Create model
with tf.device(device):
    model = Net()
    optimizer = Adam()
    loss_fn = SparseCategoricalCrossentropy()
    fltr = normalized_laplacian(adj)

# Training function
@tf.function
def train_on_batch(inputs, target):
    with tf.device(device):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss = loss_fn(target, predictions) + sum(model.losses)
            acc = tf.reduce_mean(sparse_categorical_accuracy(target, predictions))

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, acc

# Evaluation function
def evaluate(loader):
    step = 0
    results = []
    for batch in loader:
        step += 1
        inputs,target = batch
        
        with tf.device(device):
            predictions = model(inputs, training=False)
            loss = loss_fn(target, predictions)
            acc = tf.reduce_mean(sparse_categorical_accuracy(target, predictions))
            results.append((loss, acc, len(target)))  # Keep track of batch size
            if step == loader.steps_per_epoch:
                results = np.array(results)
                return np.average(results[:, :-1], 0, weights=results[:, -1])
            


# Setup training
best_val_loss = 10000
current_patience = patience


We note that there is a strong bottleneck in training time given by the Dataset structure that is slow to open, each epoch therefore can not take less time then what is needed to loop over the loader_tr dataset for one epoch: aka 17sec on this device


In [17]:
import time
start_time=time.time()
step=0
for batch in loader_tr:
    step += 1
    if step==50000:
        break
print(step, time.time()-start_time)  


50000 17.80702519416809


By comparison, if we had to access only x_train, that are the numpy arrays containing the values of each pixel the time elapsed is close to 0. Thus the datastructure is clearly not efficient. I am going to check for similar structures on pytorch how the efficiency performs.

In [24]:
import time
start_time=time.time()
step=0
for x in data_tr:
    step += 1
    a=x.x
    b=x.y
    c=x.a
    
    if step==50000:
        break
print(step, time.time()-start_time)  


50000 0.07930469512939453


In [25]:

# Training loop
results_tr = []
step=0
import time
start_time=time.time()
for batch in loader_tr:
    step += 1
  
    # Training step
    inputs, target = batch

    loss, acc = train_on_batch(inputs, target)
    results_tr.append((loss, acc, len(target)))
    
    if step == loader_tr.steps_per_epoch:
        results_va = evaluate(loader_va)
        if results_va[0] < best_val_loss:
            best_val_loss = results_va[0]
            current_patience = patience
            results_te = evaluate(loader_te)
        else:
            current_patience -= 1
            if current_patience == 0:
                print("Early stopping")
                break

        # Print results
        results_tr = np.array(results_tr)
        results_tr = np.average(results_tr[:, :-1], 0, weights=results_tr[:, -1])
        elapsed_time = time.time() - start_time

        print(
            "Train loss: {:.4f}, acc: {:.4f} | "
            "Valid loss: {:.4f}, acc: {:.4f} | "
            "Test loss: {:.4f}, acc: {:.4f} | "
            "Elapsed time: {:.2f}s".format(
                *results_tr, *results_va, *results_te, elapsed_time
            )
        )

        # Reset epoch
        results_tr = []
        step = 0

Train loss: 1.3263, acc: 0.8488 | Valid loss: 0.3932, acc: 0.8930 | Test loss: 0.3919, acc: 0.8908 | Elapsed time: 16.45s
Train loss: 0.3316, acc: 0.9091 | Valid loss: 0.3551, acc: 0.9019 | Test loss: 0.3388, acc: 0.9008 | Elapsed time: 29.60s
Train loss: 0.2968, acc: 0.9190 | Valid loss: 0.4556, acc: 0.8612 | Test loss: 0.3388, acc: 0.9008 | Elapsed time: 40.28s
Train loss: 0.2502, acc: 0.9279 | Valid loss: 0.6306, acc: 0.8419 | Test loss: 0.3388, acc: 0.9008 | Elapsed time: 51.08s
Train loss: 0.2449, acc: 0.9317 | Valid loss: 0.3061, acc: 0.9130 | Test loss: 0.3145, acc: 0.9134 | Elapsed time: 64.42s
Train loss: 0.2386, acc: 0.9324 | Valid loss: 1.1207, acc: 0.7823 | Test loss: 0.3145, acc: 0.9134 | Elapsed time: 75.31s
Train loss: 0.2254, acc: 0.9376 | Valid loss: 0.2274, acc: 0.9296 | Test loss: 0.2299, acc: 0.9318 | Elapsed time: 88.69s
Train loss: 0.2129, acc: 0.9414 | Valid loss: 0.2729, acc: 0.9224 | Test loss: 0.2299, acc: 0.9318 | Elapsed time: 99.36s
Train loss: 0.2097, acc:

Comments: It is unclear to me why the accuracy of the validation oscillates so wildly while the accuracy of test increases steadily. I would expect them to be correlated, and in general I don't understand why the validation loss can drop so low when the GNN is labelling so precisely the other dataset values.
Having said that it seems that test accuracy is quite high, or at least acceptable I would say. This comes as no surprise, as the flatten layer brings 4mln parameters roughly (compared to 12 milions of the notebook that ispired this layer structure, it is still a bit better). Still need a fix the model.summary, because it is not possible to see the output shape of each layer. This is due to the way 


In [27]:
model.summary(print_fn=None,
    expand_nested=True,
    show_trainable=True,
    layer_range=None)

Model: "net"
____________________________________________________________________________
 Layer (type)                Output Shape              Param #   Trainable  
 general_conv (GeneralConv)  multiple                  120       Y          
                                                                            
 general_conv_1 (GeneralConv  multiple                 500       Y          
 )                                                                          
                                                                            
 flatten (Flatten)           multiple                  0         Y          
                                                                            
 dense (Dense)               multiple                  4014336   Y          
                                                                            
 dense_1 (Dense)             multiple                  2570      Y          
                                                               

In [80]:
from spektral.layers import GATConv, GCNConv, GeneralConv, GlobalSumPool, GlobalAvgPool


In the following we substitute GeneralConv with  GCNConv. This implies that also flatten has to be abbandoned in favour of GlobalSumPool or GlobalAvgPool, or in general a pooling layer. Still not clear though why this happens, I will pose the question on stackoverflow or will ask in some discord channel.

In [81]:
data = MNIST()
adj=data.a

# Create filter for GCN and convert to sparse tensor.
data.a = GeneralConv.preprocess(data.a)
data.a = sp_matrix_to_sp_tensor(data.a)

# Train/valid/test split
data_tr, data_te = data[:-10000], data[-10000:]
data_tr, data_va = data_tr[:-10000], data_tr[-10000:]

# We use a MixedLoader since the dataset is in mixed mode
loader_tr = MixedLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = MixedLoader(data_va, batch_size=batch_size)
loader_te = MixedLoader(data_te, batch_size=batch_size)

# Specify device to be used by TensorFlow
device = '/device:GPU:0'

# Build model
class Net(Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.conv1 = GCNConv(32, activation="elu", kernel_regularizer=l2(l2_reg), use_bias=True)
        self.conv2 = GCNConv(32, activation="elu", kernel_regularizer=l2(l2_reg), use_bias=True)
        self.conv3 = GCNConv(32, activation="elu", kernel_regularizer=l2(l2_reg), use_bias=True)
        self.flatten = GlobalAvgPool()
        self.fc1 = Dense(512, activation="relu")
        self.fc2 = Dense(10, activation="softmax")  # MNIST has 10 classes

    def call(self, inputs):
        x, a = inputs
        with tf.device(device):
            x = self.conv1([x, a])
            x = self.conv2([x, a])
            output = self.flatten(x)
            output = self.fc1(output)
            output = self.fc2(output)

        return output

# Create model
with tf.device(device):
    model = Net()
    optimizer = Adam()
    loss_fn = SparseCategoricalCrossentropy()
    

# Training function
@tf.function
def train_on_batch(inputs, target):
    with tf.device(device):
        with tf.GradientTape() as tape:

            predictions = model(inputs, training=True)
            loss = loss_fn(target, predictions) + sum(model.losses)
            acc = tf.reduce_mean(sparse_categorical_accuracy(target, predictions))

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, acc

# Evaluation function
def evaluate(loader):
    step = 0
    results = []
    for batch in loader:
        step += 1
        inputs,target = batch
        
        with tf.device(device):
            predictions = model(inputs, training=False)
            loss = loss_fn(target, predictions)
            acc = tf.reduce_mean(sparse_categorical_accuracy(target, predictions))
            results.append((loss, acc, len(target)))  # Keep track of batch size
            if step == loader.steps_per_epoch:
                results = np.array(results)
                return np.average(results[:, :-1], 0, weights=results[:, -1])
            


# Setup training
best_val_loss = 10000
current_patience = patience


# Training loop
results_tr = []
step=0
import time
start_time=time.time()
for batch in loader_tr:
    step += 1
  
    # Training step
    inputs, target = batch

    loss, acc = train_on_batch(inputs, target)
    results_tr.append((loss, acc, len(target)))
    
    if step == loader_tr.steps_per_epoch:
        results_va = evaluate(loader_va)
        if results_va[0] < best_val_loss:
            best_val_loss = results_va[0]
            current_patience = patience
            results_te = evaluate(loader_te)
        else:
            current_patience -= 1
            if current_patience == 0:
                print("Early stopping")
                break

        # Print results
        results_tr = np.array(results_tr)
        results_tr = np.average(results_tr[:, :-1], 0, weights=results_tr[:, -1])
        elapsed_time = time.time() - start_time

        print(
            "Train loss: {:.4f}, acc: {:.4f} | "
            "Valid loss: {:.4f}, acc: {:.4f} | "
            "Test loss: {:.4f}, acc: {:.4f} | "
            "Elapsed time: {:.2f}s".format(
                *results_tr, *results_va, *results_te, elapsed_time
            )
        )

        # Reset epoch
        results_tr = []
        step = 0

  np.random.shuffle(a)


(<tf.Tensor 'inputs:0' shape=(50, 784, 1) dtype=float64>, <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000168D25817C0>) Tensor("target:0", shape=(50,), dtype=uint8)
(<tf.Tensor 'inputs:0' shape=(50, 784, 1) dtype=float64>, <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000168D1C13490>) Tensor("target:0", shape=(50,), dtype=uint8)
Train loss: 1.8824, acc: 0.2907 | Valid loss: 1.7656, acc: 0.3269 | Test loss: 1.7564, acc: 0.3290 | Elapsed time: 14.05s
Train loss: 1.7414, acc: 0.3539 | Valid loss: 1.6225, acc: 0.3813 | Test loss: 1.6232, acc: 0.3806 | Elapsed time: 28.19s
Train loss: 1.6071, acc: 0.4077 | Valid loss: 1.5323, acc: 0.4304 | Test loss: 1.5170, acc: 0.4379 | Elapsed time: 40.61s
Train loss: 1.5463, acc: 0.4317 | Valid loss: 1.4696, acc: 0.4662 | Test loss: 1.4613, acc: 0.4622 | Elapsed time: 53.96s
Train loss: 1.5185, acc: 0.4433 | Valid loss: 1.4430, acc: 0.4748 | Test loss: 1.4467, acc: 0.4693 | Elapsed time: 67.02s
Train loss

Train loss: 1.2490, acc: 0.5491 | Valid loss: 1.2094, acc: 0.5564 | Test loss: 1.2014, acc: 0.5641 | Elapsed time: 691.53s
Train loss: 1.2487, acc: 0.5478 | Valid loss: 1.1827, acc: 0.5705 | Test loss: 1.2014, acc: 0.5641 | Elapsed time: 699.82s
Train loss: 1.2461, acc: 0.5490 | Valid loss: 1.2128, acc: 0.5449 | Test loss: 1.2014, acc: 0.5641 | Elapsed time: 708.15s
Train loss: 1.2533, acc: 0.5483 | Valid loss: 1.1908, acc: 0.5664 | Test loss: 1.2014, acc: 0.5641 | Elapsed time: 716.86s
Train loss: 1.2420, acc: 0.5510 | Valid loss: 1.1509, acc: 0.5769 | Test loss: 1.1738, acc: 0.5696 | Elapsed time: 729.34s
Train loss: 1.2448, acc: 0.5507 | Valid loss: 1.1750, acc: 0.5685 | Test loss: 1.1738, acc: 0.5696 | Elapsed time: 737.96s
Train loss: 1.2435, acc: 0.5533 | Valid loss: 1.1585, acc: 0.5731 | Test loss: 1.1738, acc: 0.5696 | Elapsed time: 746.89s
Train loss: 1.2350, acc: 0.5552 | Valid loss: 1.2056, acc: 0.5607 | Test loss: 1.1738, acc: 0.5696 | Elapsed time: 755.66s
Train loss: 1.22

Train loss: 1.1436, acc: 0.5912 | Valid loss: 1.3294, acc: 0.5128 | Test loss: 1.1012, acc: 0.5961 | Elapsed time: 1300.03s
Early stopping


Comments: The accuracy here stabilized close to 60%. This is most likely a problem in network expressivity, the absence of the flatten layer reduces drastically the number of parameters involved.

In [None]:
model.summary(print_fn=None,
    expand_nested=True,
    show_trainable=True,
    layer_range=None)