In [1]:
from google.colab import drive
drive.mount('/content/drive/')

ModuleNotFoundError: No module named 'google.colab'

In [129]:
import os
os.chdir("/content/drive/My Drive/eksthesis-final/")

In [130]:
!pip install stellargraph
!sudo apt-get install libmetis-dev
!pip install metis
!pip install matplotlib

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libmetis-dev is already the newest version (5.1.0.dfsg-5).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [131]:
import networkx as nx
import pandas as pd
import itertools
import json
import os

import numpy as np

from networkx.readwrite import json_graph

from sklearn.preprocessing import StandardScaler

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import ClusterNodeGenerator,FullBatchLinkGenerator, GraphSAGELinkGenerator
from stellargraph.layer import GCN,LinkEmbedding,GraphSAGE,link_classification
from stellargraph import globalvar


from tensorflow.keras import backend as K

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from keras.layers import Dense
from stellargraph import datasets
from IPython.display import display, HTML
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import datetime
import copy
import time
import pickle
from tensorflow.keras.callbacks import TensorBoard
import metis

from os import path

In [132]:
randomSeed = 234
tf.random.set_seed(randomSeed)
np.random.seed(randomSeed)
random.seed(randomSeed)

number_of_clusters = 12  # the number of clusters/subgraphs
clusters_per_batch = 2  # combine two cluster per batch
random_clusters = True  # Set to False if you want to use METIS for clustering

log_callback = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")+"/"
log_dir = "logs/history/"
tensorboard_callback=TensorBoard(log_dir=log_callback,
                         histogram_freq=1,
                         write_graph=True,
                         write_images=True,
                         update_freq='epoch',
                         profile_batch=2,
                         embeddings_freq=1)

class TimeHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)



class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


In [133]:
class GraphCLGCNLGenerator():
    def __init__(self,G, dataset_name, model_name, batch_size = 20, num_samples = [20,10], clusters_no = 10, cpb = 2, lam =0.1,to_cluster=False):
        self.dataset_name = dataset_name
        self.clusters_no = clusters_no
        self.cpb = cpb
        self.lam =lam
        self.model_name = model_name
        self.batch_size = batch_size
        self.num_samples = num_samples
        self.inital_metrices ={"Name":[],"Metrices":[]};
        self.clusters = self.clusterGenerator(G,to_cluster)
        self.graph = self.linkDatasetGenerator(G)
        self.generator = self.graphGenerator(self.graph)
        self.createBatch()

    def clusterGenerator(self, graphs,randClust = False):
        if randClust:
            # We don't have to specify the cluster because the CluserNodeGenerator will take
            # care of the random clustering for us.
            self.clusters = self.clusters_no
        else:
            # We are going to use the METIS clustering algorith,
            print("Graph clustering using the METIS algorithm.")

            lil_adj = graphs.to_adjacency_matrix().tolil()
            adjlist = [tuple(neighbours) for neighbours in lil_adj.rows]

            edgecuts, parts = metis.part_graph(adjlist, self.clusters_no)
            parts = np.array(parts)
            clusters = []
            cluster_ids = np.unique(parts)
            for cluster_id in cluster_ids:
                mask = np.where(parts == cluster_id)
                clusters.append(node_ids[mask])
        return clusters

    def linkDatasetGenerator(self, graphs):
        edge_splitter_test = EdgeSplitter(graphs)
        self.G_test, self.edge_ids_test, self.edge_labels_test = edge_splitter_test.train_test_split(
            p=0.1, method="global", keep_connected=True
        )

        edge_splitter_val = EdgeSplitter(self.G_test)
        self.G_val, self.edge_ids_val, self.edge_labels_val = edge_splitter_val.train_test_split(
            p=0.05, method="global", keep_connected=True, seed=10
        )

        edge_splitter_train = EdgeSplitter(self.G_val)
        self.G_train, self.edge_ids_train, self.edge_labels_train = edge_splitter_train.train_test_split(
            p=0.15, method="global", keep_connected=True, seed=200
        )
        return self.G_train

    def graphGenerator(self, st_graph):
        if self.model_name in ["CLGCN","FGCN"]:
            return FullBatchLinkGenerator(st_graph, method="gcn")
        elif self.model_name == "SAGEL":
            return GraphSAGELinkGenerator(st_graph, self.batch_size, self.num_samples)


    def createModel(self,input_shape=[20,10], act_fun="relu",drp=0.5):
        self.dropout=drp
        self.input_shape = str(input_shape[0])+str(input_shape[1])
        if self.model_name in ["FGCN","CLGCN"]:
            graph_network = GCN(
                layer_sizes=input_shape, activations=[act_fun, act_fun], generator=self.generator, dropout=drp
            )
            x_inp, x_out = graph_network.in_out_tensors()
            prediction = LinkEmbedding(activation="relu", method="ip")(x_out)


        elif self.model_name == "SAGEL":
            graph_network = GraphSAGE(layer_sizes=input_shape, generator=self.generator, bias=True, dropout=drp)
            x_inp, x_out = graph_network.in_out_tensors()
            prediction = link_classification(
                output_dim=1, output_act="relu", edge_embedding_method="ip"
            )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

    def optimize(self,optimizer = "ADAM", lr=0.001, loss = "keras.losses.binary_crossentropy", metrics = ["acc"]):
        self.learning_rate = lr

        if optimizer == "ADAM":
            self.optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)
        elif optimizer == "SGD":
            self.optimizer = keras.optimizers.SGD(learning_rate=self.learning_rate)
        self.loss = loss
        self.metric = metrics
        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss,
            metrics=self.metric,
        )
    def storeMetric(self,name, stage , data_flow):
        initial_metrics = self.model.evaluate(data_flow)
        self.inital_metrices["Name"].append(name+" "+stage)
        self.inital_metrices["Name"].append(initial_metrics)
        self.inital_metrices["Metrices"].append(self.model.metrics_names)

        print("\nTrain Set Metrics of the initial (untrained) model:")
        for name, val in zip(self.model.metrics_names,initial_metrics):
            print("\t{}: {:0.4f}".format(name, val))

    def createBatch(self):
        batches = []
        q= self.cpb
        if q > 1:
            # combine clusters
            cluster_indices = list(range(len(self.clusters)))
            random.shuffle(cluster_indices)

            for i in range(0, len(cluster_indices) - 1, q):
                cc = cluster_indices[i: i + q]
                tmp = []
                for l in cc:
                    tmp.extend(list(self.clusters[l]))
                batches.append(tmp)
        else:
            batches = copy.deepcopy(self.clusters)
        self.batches = batches

    def getCluster(self,i,target_edges_ids,edges_label):
        g_node_list = list(self.batches[i])
        target_edges_ids = target_edges_ids.tolist()
        edges_label = edges_label.tolist()
        flatten_node_ids = np.asarray(target_edges_ids).reshape(-1)
        unique_nodes = np.unique(flatten_node_ids)
        target_nodes_cluster = list(set(list(unique_nodes)).intersection(set(g_node_list)))
        # find edges_ids and edges label in batch
        cluster_edges_ids = []
        cluster_edges_label = []
        for index, edge in enumerate(target_edges_ids):
            if set(edge).issubset(set(target_nodes_cluster)):
                cluster_edges_ids.append(edge)
                cluster_edges_label.append(edges_label[index])
        cluster_edges_ids = np.asarray(cluster_edges_ids)
        cluster_edges_label = np.asarray(cluster_edges_label)

        return cluster_edges_ids, cluster_edges_label

    def storeInstance(self,hstry,time_data):
        self.folder_name = log_dir+self.model_name+"/"+self.dataset_name.upper()+"/"
        if not os.path.exists(self.folder_name):
            os.makedirs(self.folder_name)
        # try:
        #     # Create target Directory
        #     os.makedirs(folder_name)
        #     print("Directory ", folder_name, " Created ")
        # except FileExistsError:
        #     print("Directory ",folder_name, " already exists")
        filenames=self.folder_name+"history-lr="+str(self.learning_rate)+"-drop="+str(self.dropout)+"-input="+str(self.input_shape)+".csv"
        datas =pd.DataFrame(hstry.history)
        datas["time"]=time_data
        datas.to_csv(filenames)

        # with open(filename, 'wb') as file_pi:
        #     pickle.dump(hstry.history, file_pi)


    def train(self,epochs):
        train_gen = self.graphGenerator(self.G_train)
        val_gen = self.graphGenerator(self.G_val)
        timer = AverageMeter("Timer")
        if self.model_name in ["FGCN","SAGEL"]:
            train_flow = train_gen.flow(self.edge_ids_train, self.edge_labels_train)
            val_flow = val_gen.flow(self.edge_ids_val, self.edge_labels_val)
            time_callback = TimeHistory()
            start = time.time()
            history = self.model.fit(
                train_flow, epochs=epochs, validation_data=val_flow, verbose=1, shuffle=False, callbacks=[tensorboard_callback,time_callback]
            )
            timer.update(time.time() - start)
            times = time_callback.times
            history.params["time"]=timer.sum
            self.storeInstance(history,times)
            print("Total Time Elapsed = {0:.4f}".format(timer.sum))

        else:
            hist = None
            once= True
            max_accuracy = 0
            time_arr = []
            for epoch in range(epochs):
                trainLosses = AverageMeter('Loss')
                trainAcc = AverageMeter('Acc@1')
                valLosses = AverageMeter('Loss')
                valAcc = AverageMeter('Acc@1')
                local_timer = 0


                for i in range(len(self.batches)):
                    train_data = self.getCluster(i, self.edge_ids_train, self.edge_labels_train)
                    val_data = self.getCluster(i, self.edge_ids_val, self.edge_labels_val)
                    train_flow = train_gen.flow(train_data[0], train_data[1])
                    val_flow = val_gen.flow(val_data[0], val_data[1])
                    start = time.time()
                    history = self.model.fit(
                        train_flow, validation_data=val_flow, verbose=0, shuffle=True
                    )
                    temp_time = time.time()-start
                    local_timer+=temp_time
                    timer.update(temp_time)
                    trainAcc.update(history.history['acc'][0],len(train_data[1]))
                    trainLosses.update(history.history['loss'][0],len(train_data[1]))
                    valAcc.update(history.history['val_acc'][0], len(val_data[1]))
                    valLosses.update(history.history['val_loss'][0], len(val_data[1]))


                if once:
                    hist= copy.deepcopy(history)
                    hist.history['acc'][0]=trainAcc.avg
                    hist.history['loss'][0] = trainLosses.avg
                    hist.history['val_acc'][0] = valAcc.avg
                    hist.history['val_loss'][0] = valLosses.avg
                    once = False
                else:
                    hist.epoch.append(epoch)
                    hist.history['acc'].append(trainAcc.avg)
                    hist.history['loss'].append(trainLosses.avg)
                    hist.history['val_acc'].append(valAcc.avg)
                    hist.history['val_loss'].append(valLosses.avg)

                if trainAcc.avg>max_accuracy:
                    max_accuracy = trainAcc.avg
                    if not os.path.exists(log_callback):
                        os.mkdir(log_callback)

                    #self.model.save(log_callback)


                print("Epoch "+str(epoch+1)+"/"+str(epochs))
                time_arr.append(local_timer)
                print("1/1 [=============================] - {0:2.0f}s {0:0.4f}ms/step - loss: {1:f} - acc: {2:f} - val_loss: {3:f} - val_acc: {4:f}".format(local_timer,trainLosses.avg,trainAcc.avg,valLosses.avg,valAcc.avg))
            hist.params["Time"]=timer.sum
            hist.params["epochs"] = epochs
            self.storeInstance(hist,time_arr)
            print(hist.params)
            print("Total Time Elapsed = {0:.4f}".format(timer.sum))




    def test(self, G_graph,edges,labels):
        test_gen = self.graphGenerator(G_graph)
        self.test_flow = test_gen.flow(edges, labels)
        self.model.evaluate(self.test_flow)

    def get_f1_score(self,y_true, y_pred):

        y_true = np.array(y_true, dtype=np.float64)
        y_pred = np.array(y_pred, dtype=np.float64)

        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        recall = true_positives / (possible_positives + K.epsilon())
        f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
        return f1_val

    def metrices(self, ytrue, ypred):
        from sklearn.metrics import roc_curve, auc
        score = self.get_f1_score(ytrue, ypred)
        nn_fpr_keras, nn_tpr_keras, nn_thresholds_keras = roc_curve(ytrue, ypred)
        auc_keras = auc(nn_fpr_keras, nn_tpr_keras)

        filename = self.folder_name+"f1_roc-lr="+str(self.learning_rate)+"-drop="+str(self.dropout)+"-input="+str(self.input_shape)+".csv"
        datas_auc = pd.DataFrame(columns=['F1','AUC'])
        datas_auc.loc[len(datas_auc)]=[float(score),float(auc_keras)]
        datas_auc.to_csv(filename)
        print('F1 Score for {}-{} : {:.4f}'.format(self.model_name,self.dataset_name,score))
        print('Area under ROC for {}-{} : {:.4f}'.format(self.model_name,self.dataset_name,auc_keras))





In [140]:
dataset = "cora"  # can also select 'pubmed'

if dataset == "cora":
    G, labels = datasets.Cora().load()
    display(HTML(datasets.Cora().description))
elif dataset == "pubmed":
    G, labels = datasets.PubMedDiabetes().load()
    display(HTML(datasets.PubMedDiabetes().description))
elif dataset == "citeseer":
    data = datasets.CiteSeer()
    G, node_subjects = data.load(largest_connected_component_only=True)
    display(HTML(datasets.CiteSeer().description))
G.info()

if dataset == "cora":
    train_size = 140
elif dataset == "pubmed":
    train_size = 60

node_ids = np.array(G.nodes())


model = GraphCLGCNLGenerator(G,model_name="FGCN",dataset_name=dataset, clusters_no = 10, cpb = 2, lam =0.1)

loss_fun = keras.losses.binary_crossentropy
model.createModel(input_shape =[64,64],drp=0.3)

model.optimize("ADAM",1e-2, loss_fun, ["acc"])
#model.model.summary()
print(model.model_name,model.dataset_name)
model.train(100)

model.test(model.G_test,model.edge_ids_test,model.edge_labels_test)
pred = model.model.predict(model.test_flow).ravel()
model.metrices(model.edge_labels_test,pred)


Graph clustering using the METIS algorithm.
** Sampled 542 positive and 542 negative edges. **
** Sampled 244 positive and 244 negative edges. **
** Sampled 696 positive and 696 negative edges. **
Using GCN (local pooling) filters...
FGCN cora
Using GCN (local pooling) filters...
Using GCN (local pooling) filters...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-140-316e7ea9b141>", line 31, in <module>
    model.train(100)
  File "<ipython-input-133-18ff9fc23dda>", line 171, in train
    train_flow, epochs=epochs, validation_data=val_flow, verbose=1, shuffle=False, callbacks=[tensorboard_callback,time_callback]
  File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/usr/lib/python3.7/posixpath.py", line 475, in relpath
    start_list = [x for x in abspath(start).split(sep) if x]
  File "/usr/lib/python3.7/posixpath.py", line 383, in abspath
    cwd = os.getcwd()
FileNotFoundError: [Errno 2] No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  Fi

FileNotFoundError: ignored

In [136]:
os.chdir("/content/drive/My Drive/eksthesis-final/logs/history/FGCN")
!python plot.py
os.chdir("/content/drive/My Drive/eksthesis-final/")

history-lr=0.001-drop=0.3-input=2010.csv
f1_roc-lr=0.001-drop=0.3-input=2010.csv
history-lr=0.001-drop=0.4-input=2010.csv
f1_roc-lr=0.001-drop=0.4-input=2010.csv
history-lr=0.001-drop=0.35-input=2010.csv
f1_roc-lr=0.001-drop=0.35-input=2010.csv
history-lr=0.001-drop=0.45-input=2010.csv
f1_roc-lr=0.001-drop=0.45-input=2010.csv
history-lr=0.001-drop=0.5-input=2010.csv
f1_roc-lr=0.001-drop=0.5-input=2010.csv
history-lr=0.01-drop=0.3-input=2010.csv
f1_roc-lr=0.01-drop=0.3-input=2010.csv
history-lr=0.0001-drop=0.3-input=2010.csv
f1_roc-lr=0.0001-drop=0.3-input=2010.csv
history-lr=0.0001-drop=0.35-input=2010.csv
f1_roc-lr=0.0001-drop=0.35-input=2010.csv
history-lr=0.01-drop=0.35-input=2010.csv
f1_roc-lr=0.01-drop=0.35-input=2010.csv
history-lr=0.01-drop=0.4-input=2010.csv
f1_roc-lr=0.01-drop=0.4-input=2010.csv
history-lr=0.0001-drop=0.4-input=2010.csv
f1_roc-lr=0.0001-drop=0.4-input=2010.csv
history-lr=0.0001-drop=0.45-input=2010.csv
f1_roc-lr=0.0001-drop=0.45-input=2010.csv
history-lr=0.01-

In [137]:
del model

In [139]:
!ls

final-CLGCN.ipynb  final-SAGEL.ipynb  main.py	   requirements.txt
final-FGCN.ipynb   history.zip	      plot.py	   requirement.txt
final.ipynb	   logs		      __pycache__  source.ipynb
