# Installing dependencies

In [1]:
import sys

!{sys.executable} -m pip install -U dm-sonnet==1.23
!{sys.executable} -m pip install --upgrade tfp-nightly
!{sys.executable} -m pip install tf-nightly
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas

Requirement already up-to-date: dm-sonnet==1.23 in /Users/raouldias/anaconda3/lib/python3.7/site-packages (1.23)
You should consider upgrading via the '/Users/raouldias/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Requirement already up-to-date: tfp-nightly in /Users/raouldias/anaconda3/lib/python3.7/site-packages (0.12.0.dev20200819)
You should consider upgrading via the '/Users/raouldias/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/raouldias/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/raouldias/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Users/raouldias/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [20]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Imports

In [21]:
import tensorflow as tf
import tensorflow_probability as tfp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import datetime

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Input
from tensorflow_probability import distributions as tfd

# Loading Dataset

In [22]:
DATAPATH = '/Users/raouldias/Desktop/Masters Project/Datasets/brenda_hierarchical.csv'
dataframe = pd.read_csv(DATAPATH, delimiter=',', header=None, skiprows=1, names= ['ec_num', 'organism', 'km_value','substrate', 'comments'])

# Pre-Processing

In [23]:
dataframe = dataframe.drop(columns= ['comments'])
dataframe.drop_duplicates(keep='first', inplace= True)


# Splitting the data into training, testing and validation sets

In [24]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'training examples')
print(len(val), 'validation examples')
print(len(test), 'testing examples')

10889 training examples
2723 validation examples
3404 testing examples


# Seperating labels from data

In [25]:
train_x = train[['ec_num', 'organism', 'substrate']].copy()
test_x = test[['ec_num', 'organism', 'substrate']].copy()
train_y = train['km_value'].copy()
test_y = test['km_value'].copy()

# Functions to map categorical variables to integers

In [26]:
def categorify(df, cat_vars):
    categories = {}
    for cat in cat_vars:
        df[cat] = df[cat].astype("category").cat.as_ordered()
        categories[cat] = df[cat].cat.categories
    return categories

def apply_test(test,categories):
    for cat, index in categories.items():
        test[cat] = pd.Categorical(test[cat],categories=categories[cat],ordered=True)

# Function to produce optimal embedding layer size

In [27]:
def get_emb_sz(cat_col,categories_dict):
    num_classes = len(categories_dict[cat_col])
    return int(min(600,round(1.6*num_classes**0.56)))

# Function to unpack output of Mixture Density Network

In [28]:
def slice_parameter_vectors(parameter_vector):
    """ Returns an unpacked list of paramter vectors.
    """
    print(parameter_vector)
    alpha = parameter_vector[:, :, :c]
    mu = parameter_vector[:, :, c:c*2]
    sigma = parameter_vector[:, :, c*2:c*3]
    
    return alpha,mu,sigma

# Custom loss function to compute the negative log-likelihood of y given the mixture parameters

In [29]:
def gnll_loss(y, parameter_vector):
    """ Computes the mean negative log-likelihood loss of y given the mixture parameters.
    """
    alpha, mu, sigma = slice_parameter_vectors(parameter_vector) #Unpack parameter vectors
    gm = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=alpha),
        components_distribution=tfd.Normal(
            loc=mu,       
            scale=sigma))
    
    log_likelihood = gm.log_prob(tf.transpose(y))                 # Evaluate log-probability of y
    
    return -tf.reduce_mean(log_likelihood, axis=-1) 

# Non-negative Exponential Linear Unit  activation function

In [30]:
def nnelu(input):
    """ Computes the Non-Negative Exponential Linear Unit
    """
    return tf.add(tf.constant(1, dtype=tf.float32), tf.nn.elu(input))

tf.keras.utils.get_custom_objects().update({'nnelu': tf.keras.layers.Activation(nnelu)})

# Coverting the categorical variables to integers

In [31]:
cat_vars = ['ec_num', 'organism', 'substrate']
categories = categorify(train_x, cat_vars)
apply_test(test_x,categories)
for cat in cat_vars:
    train_x[cat] = train_x[cat].cat.codes+1
    test_x[cat] = test_x[cat].cat.codes+1

# Construction of Mixture Density Network

In [37]:
c = 4
no_parameters = 3
num_ec = len(dataframe.ec_num.unique())
num_organism = len(dataframe.organism.unique())
num_substrate = len(dataframe.substrate.unique())

ec_input = Input(shape= (1,),name = 'ec')
organism_input = Input(shape= (1,), name = 'organism')
substrate_input = Input(shape= (1,), name = 'substrate')

ec_features = layers.Embedding(num_ec, get_emb_sz('ec_num', categories))(ec_input)
organism_features = layers.Embedding(num_organism ,get_emb_sz('organism', categories))(organism_input)
substrate_features = layers.Embedding(num_substrate,get_emb_sz('substrate', categories))(substrate_input)

x = layers.concatenate([ec_features, organism_features, substrate_features])

h1 = layers.Dense(100, activation='relu', name='h1')(x)
h2 = layers.Dense(100, activation='relu', name='h2')(h1)

alphas = layers.Dense(c, activation="softmax", name="alphas")(h2)   # Create vector for alpha (softmax constrained)
mus = layers.Dense(c, name="mus")(h2)                               # Create vector for mus
sigmas = layers.Dense(c, activation="nnelu", name="sigmas")(h2)     # Create vector sigmas (nnelu constrained)
pvector = layers.concatenate([alphas, mus, sigmas], name='output') 

model = tf.keras.Model([ec_input, organism_input, substrate_input], pvector)

model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ec (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
organism (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
substrate (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 69)        61617       ec[0][0]                         
_______________________________________________________________________________________

In [38]:
model.compile(loss=gnll_loss, optimizer = "Adam")

# Preparing input for model

In [39]:
input_dict = {'ec': train_x['ec_num'].values,
             'organism': train_x['organism'].values,
             'substrate': train_x['substrate'].values}

# Logging for tensorboard

In [40]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0)

In [41]:
model.fit(input_dict, train_y, epochs=500, callbacks=[tensorboard_callback])

Epoch 1/500
Tensor("functional_5/output/concat:0", shape=(None, 1, 12), dtype=float32)
Tensor("functional_5/output/concat:0", shape=(None, 1, 12), dtype=float32)
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
E

Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/

Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 239/500
Epoch 240/500
Epoch 241/500
Epoch 242/500
Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500

KeyboardInterrupt: 

In [19]:
%tensorboard --logdir logs/fit