In [1]:
import networkx as nx
import json
import random as rand
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


import numpy.linalg as linalg
import csv

In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive')
root = 'datasets/'

In [3]:
G = nx.read_edgelist(root+'/git_edges.csv', nodetype=int, delimiter=',').to_undirected()

features = dict()
with open(root+'git_features.json', 'r') as f:
    features = json.load(f)

classes = dict()
with open(root+'git_target.csv', 'r') as f:
    csv_file = csv.reader(f, delimiter=',')
    next(csv_file) #skip header
    
    for row in csv_file:
        i = int(row[0])
        classes[i] = int(row[2])

In [4]:
feats = set()

for k,v in features.items():
        feats.update(v)

feature_len = len(feats)

def get_features(nodeid):
    _feats = features[nodeid]
    feats = np.zeros(feature_len)
    for f in _feats:
        feats[f] = 1
        
    return feats

In [5]:
print("edges:", G.size(), "\t nodes:", len(G.nodes())) 

edges: 289003 	 nodes: 37700


In [26]:
"""

 Generate subgraph
 
"""

# simulate a graph with some missing node features
missing_features = False
missing_percent = 0.20
missing = set()

def generate_subgraph(nodes, selected, mapping, step_size):
    subgraph = G.subgraph(nodes).copy()
    subgraph_np = nx.to_numpy_matrix(subgraph)
    
    map_index1 = 0
    if mapping:  
        map_index1 = len(mapping)
        
    map_index2 = map_index1 + step_size
    
    print(map_index1, map_index2)
    
    mapping_new = dict(zip(selected,np.arange(map_index1, map_index2)))
    mapping.update(mapping_new)
    
    print("mapsize:",len(mapping))
    
    mapping_reverse = {value:key for key, value in mapping.items()}
    
    subgraph_r = nx.relabel_nodes(subgraph, mapping)
    subgraph_r_np = nx.to_numpy_matrix(subgraph_r)

    print("edges:", subgraph_r.size(), "\t nodes:", len(subgraph_r.nodes())) 

    x = np.zeros((np.shape(subgraph_r_np)[0], feature_len)) # features
    y = np.zeros((np.shape(subgraph_r_np)[0],2)) # classes
    
    print("max:",max(subgraph_r.nodes()))
    
    for node in subgraph_r.nodes():
        if (missing_features and rand.random() < missing_percent) or node in missing:
            x[node] = np.zeros(feature_len)
            missing.add(node)
        else:
            x[node] = get_features(str(mapping_reverse[node]))
        
        cl = classes[mapping_reverse[node]]
        val = np.zeros(2)
        val[cl] = 1
        
        y[node] = val
    
    del mapping_reverse, subgraph_r_np

    return subgraph_r, x, y, mapping

"""

 Feature Extraction

"""

def feature_extraction(subgraph, x):
    subgraph_np = torch.from_numpy(nx.to_numpy_matrix(subgraph).astype(np.float32)).to_sparse()
    I = torch.eye(len(subgraph))
    A = I + subgraph_np # G + identity
    A = A.to_sparse()
    D = np.zeros((np.shape(subgraph_np)[0])) # Degree matrix
    x = torch.from_numpy(x.astype(np.float32))


    for node in subgraph.nodes():
        D[node] = 1.0 / np.sqrt(subgraph.degree(node) + 1)

    D = torch.from_numpy(np.diag(D).astype(np.float32))
    D = D.to_sparse()
    
    S = D * A * D
    n_layers = 5
    #S_k = S

    r = x
    for _ in range(n_layers):
        r = torch.spmm(S, r)
  
    #print(S_k.shape, x.shape)
    
    #r = S_k @ x
    
    return r


In [8]:
nodes = G.nodes()
final_subgraphs = []
final_features = []
final_classes = []
visited = []
mapping = dict()
step_size = int(0.20 * len(G.nodes()))

while len(visited) < 1.0 * len(G.nodes()):
    selected = rand.sample(nodes, step_size)
    visited = visited + selected
    nodes = [ele for ele in nodes if ele not in selected]
    
    subgraph, x, y, mapping = generate_subgraph(visited, selected, mapping, step_size)
    final_subgraphs.append(subgraph)
    final_features.append(x)
    final_classes.append(y)
    
    

0 7540
mapsize: 7540
edges: 11940 	 nodes: 7540
max: 7539
7540 15080
mapsize: 15080
edges: 48016 	 nodes: 15080
max: 15079
15080 22620
mapsize: 22620
edges: 102189 	 nodes: 22620
max: 22619
22620 30160
mapsize: 30160
edges: 184881 	 nodes: 30160
max: 30159
30160 37700
mapsize: 37700
edges: 289003 	 nodes: 37700
max: 37699


In [9]:
len(final_subgraphs)

5

In [27]:
r_0 = feature_extraction(final_subgraphs[0], final_features[0])

In [28]:
class SGCModel(nn.Module):
    def __init__(self, feature_len):
        super(SGCModel, self).__init__()
        self.lin1 = nn.Linear(feature_len, 800)
        self.lin2 = nn.Linear(800,200)
        self.lin3 = nn.Linear(200,2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputX):
        out = self.sigmoid(self.lin3(F.relu(self.lin2(F.relu(self.lin1(inputX))))))
        return out



In [29]:
def validTestData(i, ss):
    r = feature_extraction(final_subgraphs[i], final_features[i])
    yGT = torch.from_numpy(final_classes[i].astype(np.float32))

    # 0: 0 - 7499
    # 1: 7500 - 14999
    # 2 : 15000 - 22499


    # 7500 ... 15000

    print(ss)
    x = r[ss:]
    y = yGT[ss:]
    
    # 0 ... 7500
    px = r[:ss]
    py = yGT[:ss]

    vx = torch.chunk(x, 2)[0]
    tx = torch.chunk(x, 2)[1]

    vy = torch.chunk(y, 2)[0]
    ty = torch.chunk(y, 2)[1]


    return px, py, vx, vy, tx, ty

In [None]:
"""

Online learning


"""
import copy

model = SGCModel(feature_len)
model_copy = None
moment = 0.90
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=moment)
num_epoch = 25
batch_num = 10

#loss_function = torch.nn.MSELoss()
loss_function = torch.nn.BCELoss()

xprev, yprev, validSet_x, validSet_y, testSet_x, testSet_y = validTestData(1, step_size)
train = r_0
trainSet = torch.chunk(train, batch_num)

ytrain = torch.from_numpy(final_classes[0].astype(np.float32))
trainGT = torch.chunk(ytrain, batch_num)

model.train()

"""
Weight warmup
"""
for epoch in range(0, 3):
    for batch_i in range(batch_num):
        y_predicted = model.forward(trainSet[batch_i])
        loss = loss_function(y_predicted, trainGT[batch_i])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
          #print('Warm Up Epoch:', epoch, 'batch', batch_i, loss.data.item())
    



#optimizerPW = torch.optim.SGD(model.parameters(), lr=0.001, momentum=moment)

lrs = [0.01, 0.005, 0.0025, 0.00125]
#lrs = [0.001, 0.0005, 0.00025]

current_step_size = step_size
for i in range(1,len(final_subgraphs)):
    xtrain = []
    ytrain = []

    if i == 1:
        xtrain = r_0
        ytrain = torch.from_numpy(final_classes[0].astype(np.float32))
    else:
        xprev, yprev, validSet_x, validSet_y, testSet_x, testSet_y = validTestData(i, current_step_size)
        xtrain = xprev
        ytrain = yprev

    #print(xtrain.shape, ytrain.shape)

    current_step_size += step_size

    trainSet = torch.chunk(xtrain, batch_num)
    trainGT = torch.chunk(ytrain, batch_num)

    model.train()

    optimizerPW = torch.optim.Adam(model.parameters(), lr=lrs[i-1])
    for epoch in range(0, num_epoch):
        lossAvg = 0
        batchesLoss = []
        for batch_i in range(batch_num):
            y_predicted = model.forward(trainSet[batch_i])

            #print(y_predicted.shape, trainGT[batch_i].shape)
            loss = loss_function(y_predicted, trainGT[batch_i])
            optimizerPW.zero_grad()
            loss.backward()
            optimizerPW.step()
            batchesLoss.append(loss.data.item())
            lossAvg += loss.data.item()
              ##TODO: Create a simple function to compute which values match ground truth manually, and print the "Training Accuracy" per epoch
        if epoch % 10 == 0:
            with torch.no_grad():
                model.eval()
                valOut = model.forward(validSet_x)
                loss = loss_function(valOut, validSet_y)

                #print(valOut.shape, validSet_x.shape)
                _,predicted = torch.max(valOut, 1)
                _,actual = torch.max(validSet_y,1)

                print("validation loss:", loss, "valid accuracy:", float((predicted == actual).sum().item() / len(predicted)))

            model.train()

        #print('Epoch[{}/{}], loss: {:.6f}'
              #.format(epoch + 1, num_epoch, lossAvg/batch_num))
    
    """
    NOW WE TEST
    """
    model.eval()

    _,predicted = torch.max(model.forward(testSet_x), 1)
    _,actual = torch.max(testSet_y,1)
    print("accuracy:", (predicted == actual).sum().item() / len(predicted))
    
    if(i == 1):
        model_copy = copy.deepcopy(model)

7540
validation loss: tensor(0.5430) valid accuracy: 0.7604774535809019
validation loss: tensor(0.7683) valid accuracy: 0.7819628647214855
validation loss: tensor(1.4375) valid accuracy: 0.48010610079575594
accuracy: 0.7981432360742705
15080
validation loss: tensor(0.7277) valid accuracy: 0.789920424403183
validation loss: tensor(0.7150) valid accuracy: 0.7976127320954907
validation loss: tensor(0.8862) valid accuracy: 0.8039787798408488
accuracy: 0.8180371352785146
22620
validation loss: tensor(0.7490) valid accuracy: 0.8092838196286473
validation loss: tensor(0.5905) valid accuracy: 0.8047745358090186
validation loss: tensor(0.6738) valid accuracy: 0.8058355437665783
accuracy: 0.8090185676392573


In [21]:
"""

Train once, test on all


"""

#lrs = [0.001, 0.0005, 0.00025]
lrs = [0.01, 0.0005, 0.00025]

current_step_size = step_size
for i in range(1,len(final_subgraphs)):
    _, _, _, _, testSet_x, testSet_y = validTestData(i, current_step_size)

    current_step_size += step_size
    
    """
    NOW WE TEST
    """
    model_copy.eval()

    _,predicted = torch.max(model_copy.forward(testSet_x), 1)
    _,actual = torch.max(testSet_y,1)
    print("accuracy:", (predicted == actual).sum().item() / len(predicted))

7540
accuracy: 0.7989389920424403
15080
accuracy: 0.8026525198938992
22620
accuracy: 0.7782493368700265
30160
accuracy: 0.7716180371352785
