# Node-embedding neural network for NCI (National Cancer Institute)

In [1]:
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

import os
import networkx as nx
import numpy as np
import pandas as pd
from tensorflow import keras

from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
from stellargraph.data import UnsupervisedSampler
from stellargraph.data import BiasedRandomWalk
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.layer import Node2Vec, link_classification

from stellargraph import datasets
from IPython.display import display, HTML

%matplotlib inline

### Dataset and Objective

- The dataset is the NCI ontology, which contains parent nodes and their relationship content.

- The objective of the node-embedding neural network is to create a node-to-vector (Node2Vec) model that can be used for:

    - link prediction

In [2]:
# Read the raw edge list: every line of NCI.txt is split on whitespace into
# a list of string tokens (a blank line yields an empty list).
with open("NCI.txt", "r") as myfile:
    NCI = [line.split() for line in myfile]

NCI

[['545', '545'],
 ['5239604', '5239521'],
 ['5239604', '5237887'],
 ['5239604', '5237886'],
 ['5239604', '5237885'],
 ['5239604', '5237884'],
 ['5239604', '5237883'],
 ['5239604', '5237882'],
 ['5239604', '5237881'],
 ['5239604', '5237880'],
 ['5239604', '5237879'],
 ['5239604', '5237878'],
 ['5239604', '5237877'],
 ['5239604', '5237876'],
 ['5239604', '5237875'],
 ['5239604', '5237874'],
 ['5239604', '5237873'],
 ['5239604', '5237872'],
 ['5239604', '5237871'],
 ['5239604', '5237870'],
 ['5239604', '5237869'],
 ['5239604', '5237868'],
 ['5239604', '5237867'],
 ['5239604', '5237866'],
 ['5239604', '5237865'],
 ['5239604', '5237864'],
 ['5239604', '5237863'],
 ['5239604', '5237862'],
 ['5239604', '5237861'],
 ['5239604', '5237860'],
 ['5239604', '5237859'],
 ['5239604', '5237858'],
 ['5239604', '5237857'],
 ['5239604', '5237856'],
 ['5239604', '5237855'],
 ['5239604', '5237854'],
 ['5239604', '5237853'],
 ['5239604', '5237852'],
 ['5239604', '5237851'],
 ['5239604', '5237850'],
 ['52396

In [3]:
# Build the edge table from the tokenised lines. Each record is mapped onto
# three columns; the third one ('drop') is not needed and is discarded
# immediately, leaving only the 'source' and 'target' columns.
#
# Lines with fewer than three tokens are padded with missing values by
# pandas, so some rows may still have an empty 'source' and/or 'target' here.
# They are deliberately kept at this point: the original `data.dropna()` call
# discarded its result (dropna returns a new frame), so it never removed
# anything. Incomplete rows are inspected and removed in the following cells.
data = pd.DataFrame(NCI, columns=['source', 'target', 'drop']).drop(columns='drop')

data

Unnamed: 0,source,target
0,545,545
1,5239604,5239521
2,5239604,5237887
3,5239604,5237886
4,5239604,5237885
...,...,...
170665,5239592,2698128
170666,5239592,2346798
170667,5239592,2986404
170668,5239592,3247294


In [4]:
# Locate incomplete edges: build a boolean mask of missing cells, reduce it
# to one flag per row, and select the rows where any column is missing.
is_NaN = data.isna()
row_has_NaN = is_NaN.any(axis="columns")
rows_with_NaN = data.loc[row_has_NaN]

rows_with_NaN

Unnamed: 0,source,target
89665,1552622.0,
93986,,
106763,,
115862,,
130914,2984040.0,
130931,3641637.0,
132537,5204241.0,
143183,3831043.0,
149893,2986410.0,


In [5]:
# Remove every edge that is missing its 'source' or 'target'. Selecting the
# rows by content instead of by hard-coded index labels keeps this cell
# correct if the input file changes, and makes it safe to re-run (dropping
# fixed labels a second time raises KeyError because they no longer exist).
# The original index labels of the remaining rows are preserved.
data = data.dropna(subset=['source', 'target'])

In [6]:
# Sanity check after cleaning: recompute the missing-value mask and show any
# rows that still contain a missing 'source' or 'target' (expected: none).
is_NaN = data.isna()
row_has_NaN = is_NaN.any(axis="columns")
rows_with_NaN = data.loc[row_has_NaN]

rows_with_NaN

Unnamed: 0,source,target


In [69]:
# Turn the cleaned edge table into a StellarGraph object and print its
# textual summary (node/edge counts and types).
G = StellarGraph(edges=data)

graph_summary = G.info()
print(graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 151065, Edges: 170661

 Node types:
  default: [151065]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [170661]
        Weights: all 1 (default)
        Features: none


In [8]:
# Random-walk settings passed to the walker below.
walk_number = 50  # value for the walker's `n` argument
walk_length = 2   # value for the walker's `length` argument

# Biased (node2vec-style) random walker over the graph.
#   p: 1/p defines the probability of returning to the source node
#   q: 1/q defines the probability of moving to a node away from the source node
walker = BiasedRandomWalk(G, n=walk_number, length=walk_length, p=0.5, q=2.0)

# Walks are started from the 'source' column of the edge table.
node = data['source']
unsupervised_samples = UnsupervisedSampler(G, nodes=node, walker=walker)

In [27]:
# Training hyper-parameters: samples per batch, number of passes over the
# data, and dimensionality of the learned node embeddings.
batch_size, epochs, emb_size = 10000, 1, 512

# Link generator that feeds batches to the model, and the Node2Vec
# embedding layer sized from that generator.
generator = Node2VecLinkGenerator(G, batch_size)
node2vec = Node2Vec(emb_size, generator=generator)

In [23]:
# Obtain the Node2Vec input and output tensors; they are used below as the
# Keras model inputs (x_inp) and as the input to the link classifier (x_out).
x_inp, x_out = node2vec.in_out_tensors()

In [24]:
# Link-classification head: the two node embeddings of each pair are combined
# with the "dot" method and passed through a sigmoid to give a single score.
edge_classifier = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="dot",
)
prediction = edge_classifier(x_out)

link_classification: using 'dot' method to combine node embeddings into edge embeddings


In [25]:
# Assemble the end-to-end Keras model from the Node2Vec input tensors to the
# link-prediction output, trained as a binary classifier.
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases reject.
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
target_embedding (Embedding)    (None, 1, 512)       77345280    input_5[0][0]                    
__________________________________________________________________________________________________
context_embedding (Embedding)   (None, 1, 512)       77345280    input_6[0][0]                    
____________________________________________________________________________________________

In [28]:
# Train the embedding model on pairs produced from the random-walk sampler.
# `workers` is the number of threads used to prepare batches (threads, since
# use_multiprocessing=False); a handful is enough to keep the model fed,
# whereas thousands of threads only add scheduling and memory overhead.
history = model.fit(
    generator.flow(unsupervised_samples),
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)



In [29]:
# Persist the trained model to the 'embedding.model' path (relative to the
# notebook's working directory) so it can be reloaded without retraining.
model.save('embedding.model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: embedding.model\assets
