In [1]:
import pandas as pd
import random
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import dgl
from dgl import function as fn
from dgl import DGLGraph

import torch as th
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx

In [2]:

# Choose the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")
# Report which device was picked.
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

GPU not available, CPU used


In [3]:
# Load the training table and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV -- confirm).
df = pd.read_csv("training_data.csv").drop(columns=['Unnamed: 0'])

In [4]:
# Target vector is the 'homebuyers_x' column; the feature matrix is every other column.
y = df['homebuyers_x'].values
X = df.drop('homebuyers_x', axis=1).values

# Hold out 16% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.16
)

In [5]:
def loss_function_1(y_pred, y_labels):
    """Mean squared error between predictions and labels.

    Parameters
    ----------
    y_pred : torch.Tensor, numpy.ndarray or sequence
        Predicted values.
    y_labels : torch.Tensor, numpy.ndarray or sequence
        Reference values, same size as ``y_pred``.

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor: the sum of squared differences divided by
        the length of the first dimension of ``y_pred``.
    """
    # th.sum rejects numpy arrays, so coerce both inputs first. as_tensor
    # returns an existing tensor unchanged, which keeps its autograd history.
    y_pred = th.as_tensor(y_pred)
    y_labels = th.as_tensor(y_labels)

    size = y_pred.shape[0]
    differences = y_pred - y_labels
    differences_square_sum = th.sum(differences * differences)

    return differences_square_sum / size

In [6]:
# Per-node update: linear projection followed by an optional activation
class NodeApplyModule(nn.Module):
    """Transforms the 'h' field of a batch of nodes.

    Parameters
    ----------
    in_feats : int
        Width of the incoming 'h' vectors.
    out_feats : int
        Width of the produced 'h' vectors.
    activation : callable or None
        Applied to the projected values; skipped when None.
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        """Read ``node.data['h']`` and return the new value as ``{'h': tensor}``."""
        projected = self.linear(node.data['h'])
        if self.activation is None:
            return {'h': projected}
        return {'h': self.activation(projected)}
    
# Single graph-convolution layer of the network
class GCN(nn.Module):
    """Aggregate over the graph, then apply a ``NodeApplyModule`` to every node.

    ``forward`` calls ``g.pull`` without message/reduce arguments, so it
    relies on functions that the caller has already registered on ``g``.
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        """Run one layer on graph ``g`` and return the resulting 'h' tensor.

        The 'h' field is written to ``g.ndata`` for the duration of the call
        and popped again before returning.
        """
        g.ndata['h'] = feature
        every_node = g.nodes()
        g.pull(every_node)             # aggregate incoming messages into 'h'
        g.apply_nodes(self.apply_mod)  # linear + activation on each node
        updated = g.ndata.pop('h')
        return updated
    
# Full network: four stacked GCN layers
class DeepGCN(nn.Module):
    """Stack of GCN layers with widths in_feats -> 256 -> 128 -> 64 -> out_feats.

    The first two layers use ReLU, the third uses sigmoid and the last one has
    no activation, so the output is raw per-class scores.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.gcn1 = GCN(in_feats, 256, th.relu)
        self.gcn2 = GCN(256, 128, th.relu)
        self.gcn3 = GCN(128, 64, th.sigmoid)
        self.gcn4 = GCN(64, out_feats, None)

    def forward(self, g, features):
        """Pass ``features`` through the four layers on graph ``g``."""
        hidden = features
        for layer in (self.gcn1, self.gcn2, self.gcn3):
            hidden = layer(g, hidden)

        return self.gcn4(g, hidden)

In [7]:

# Features as float32 and labels as int64 tensors, moved to the selected device.
ds_features = th.FloatTensor(X).to(device)
ds_labels = th.LongTensor(y).to(device)

# One node per data row. Every node gets a self-loop so that summing over
# incoming messages also includes the node's own features.
ds_g = DGLGraph()
ds_g.add_nodes(len(ds_features))

ds_g.add_edges(ds_g.nodes(), ds_g.nodes())

# Load the neighbour table and keep only rows whose three neighbour ids are
# valid node ids. reset_index() then exposes the original row position as the
# first column, 'index', which is used as the source node of that row's edges.
df_graph = pd.read_csv("graph.csv").drop('Unnamed: 0', axis = 1)
df_graph = df_graph[(df_graph['0'] < len(df)) & (df_graph['1'] < len(df)) & (df_graph['2'] < len(df))].reset_index()

# Add all edges in one batched call instead of one add_edges call per row.
# Row-major flattening keeps the edge order of the per-row loop:
# (row, n0), (row, n1), (row, n2), then the next row.
edge_dst = df_graph.iloc[:, 1:].to_numpy(dtype=np.int64)
edge_src = np.repeat(df_graph['index'].to_numpy(dtype=np.int64), edge_dst.shape[1])
if edge_dst.size:
    ds_g.add_edges(th.from_numpy(edge_src), th.from_numpy(edge_dst.reshape(-1)))

# Store features and true labels on the nodes so subgraphs can copy them.
ds_g.ndata['features'] = ds_features
ds_g.ndata['t_labels'] = ds_labels

# Message passing: each edge copies the source node's 'h' into message 'm';
# each node sums its incoming 'm' messages back into 'h'.
m_func = fn.copy_src(src='h', out='m')
m_reduce_func = fn.sum(msg='m', out='h')

In [8]:
# Shuffle the node ids so the train/test assignment is random.
c = th.randperm(len(ds_g.nodes())) #shuffle
rand_nodes = ds_g.nodes()[c]

# Position in the shuffled order where the training part ends.
split_at = int(len(ds_g) * .80)

# First 80% of the shuffled nodes form the training graph.
train_g = ds_g.subgraph(rand_nodes[:split_at])
train_g.copy_from_parent()

# The remaining 20% form the test graph. Slicing from the 20% mark instead
# would take 80% of the nodes and overlap the training set, which makes the
# reported accuracy meaningless.
test_g = ds_g.subgraph(rand_nodes[split_at:])
test_g.copy_from_parent()
# The model's pull() call uses the functions registered on the graph.
test_g.register_message_func(m_func)
test_g.register_reduce_func(m_reduce_func)


In [9]:
#constant parameters
# The loss used for training (nll_loss) needs every label id to be smaller
# than the number of output columns. Size the output layer by the largest
# label id rather than by the count of distinct labels; both agree when the
# labels are exactly 0..C-1.
NUMBER_OF_LABELS = int(ds_g.ndata["t_labels"].max().item()) + 1
EPOCH = 300

# Input width is the number of feature columns.
model = DeepGCN(ds_features.size()[1], NUMBER_OF_LABELS).to(device)

# Re-running this cell builds a fresh optimizer and discards Adam's running
# statistics, so run it only once per model.
opt = th.optim.Adam(model.parameters(), lr=1e-3)# only run once

In [10]:
def evaluate(model, g, features, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Puts ``model`` into eval mode (and leaves it there), runs it on
    ``(g, features)`` without gradient tracking, and compares the arg-max over
    dimension 1 with ``labels``.
    """
    model.eval()
    with th.no_grad():
        scores = model(g, features)
        predicted = th.max(scores, dim=1)[1]
        hits = (predicted == labels).sum()
    return hits.item() * 1.0 / len(labels)

In [None]:
hoops = 3  # hop count for neighbourhood sampling; not used by the random sampler below

# Test-set accuracy after every epoch. Collected in a list and converted once,
# because np.append inside the loop copies the whole array on every call.
accuracy_history = []

#train
for epoch in range(EPOCH):

    if epoch % 50 == 0:
        print(epoch)

    # evaluate() switches the model to eval mode at the end of each epoch,
    # so switch back to training mode before every optimisation step.
    model.train()

    # Draw a random 10% of the training nodes as this epoch's mini-batch graph.
    c = th.randperm(len(train_g.nodes())) #shuffle
    choices = train_g.nodes()[c]

    sub_graph = train_g.subgraph(choices[:int(len(train_g)*.10)])
    sub_graph.copy_from_parent()
    sub_graph.register_message_func(m_func)
    sub_graph.register_reduce_func(m_reduce_func)

    feats = sub_graph.ndata['features']
    labs = sub_graph.ndata['t_labels'] #true label

    # Forward pass, then negative log-likelihood on the log-softmax scores.
    out = model(sub_graph, feats)

    out = th.log_softmax(out, 1)
    loss = F.nll_loss(out, labs)

    opt.zero_grad()
    loss.backward()
    opt.step()

    accuracy_history.append(evaluate(model, test_g, test_g.ndata['features'], test_g.ndata['t_labels']))

# Kept under the original name `losses`; the values are test accuracies.
losses = np.array(accuracy_history)

plt.figure(figsize = (10, 8))
plt.plot(losses)
plt.xlabel("epoch")
plt.ylabel("test accuracy")
plt.grid()
plt.show()

24
49
