# GraphSAGE Classifier

### Setup and load environment including necessary modules and files

In [None]:
!pip install dgl dglgo -f https://data.dgl.ai/wheels/repo.html

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl import DGLGraph
from dgl.data import DGLDataset
from dgl.nn.pytorch import conv as dgl_conv
import numpy as np
import os
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import pickle
# Load the adjacency matrix and the keyword table it was built from.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open('/content/drive/MyDrive/hkust_colab/COMP4222/AdjacencyMatrices/200803_201103_AdjacencyMatrix_labeled.pickle', 'rb') as pickle_file:
    # The context manager closes the file handle as soon as loading finishes.
    adja_matrix, unique_keywords = pickle.load(pickle_file)

### Classifier Classes and Functions, and Dataset Pre-processing

In [None]:
class KeywordDataset(DGLDataset):
    """Single-graph node-classification dataset built from the keyword adjacency matrix.

    The dataset reads three module-level globals, which must be set before an
    instance is created:

    * ``adja_matrix``  - square adjacency matrix; one graph node per row.
    * ``pol_keywords`` - DataFrame whose index holds node ids and whose
      ``'party'`` column holds the class label already encoded as 0/1.
    * ``pol_ids``      - list of the labelled node ids, in the (possibly
      shuffled) order used for the train/validation/test split.

    After construction the instance exposes ``graph``, ``features``, ``labels``,
    ``num_labels`` and the three boolean masks ``train_mask``, ``val_mask`` and
    ``test_mask``. Only nodes listed in ``pol_ids`` are covered by a mask.
    """

    def __init__(self):
        super().__init__(name='keyword_data')

    def process(self):
        """Build the graph, node features, labels and split masks."""
        n_nodes = adja_matrix.shape[0]

        # Single binary feature per node: 1 for labelled (politician) nodes, 0 otherwise.
        pols = np.zeros((n_nodes, 1))
        for index in pol_ids:
            pols[index] = 1
        node_features = torch.from_numpy(pols)

        # Nodes without a party get a random placeholder label; they are never
        # selected by any mask, so the placeholder does not enter loss or accuracy.
        node_base_labels = np.random.randint(0, 2, n_nodes)

        # Look every label up by node id. Pairing ``pol_ids`` positionally with
        # the 'party' column would misassign labels whenever ``pol_ids`` has been
        # shuffled, because the column stays in the DataFrame's row order.
        party_by_id = pol_keywords['party'].to_dict()
        for index in pol_ids:
            node_base_labels[index] = party_by_id[index]
        node_labels = torch.from_numpy(node_base_labels)

        src, dst = np.nonzero(adja_matrix)

        self.graph = dgl.graph((src, dst), num_nodes=n_nodes)
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels

        # 40% / 30% / remainder split of the labelled nodes, in ``pol_ids`` order.
        n_train = int(len(pol_ids) * 0.4)
        n_val = int(len(pol_ids) * 0.3)

        def build_mask(node_ids):
            """Return a boolean mask over all nodes that is True at ``node_ids``."""
            mask = torch.zeros(n_nodes, dtype=torch.bool)
            if node_ids:
                mask[torch.as_tensor(node_ids, dtype=torch.long)] = True
            return mask

        train_mask = build_mask(pol_ids[:n_train])
        val_mask = build_mask(pol_ids[n_train:n_train + n_val])
        test_mask = build_mask(pol_ids[n_train + n_val:])

        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

        self.features = node_features
        self.labels = node_labels
        self.num_labels = 2
        self.train_mask = train_mask
        self.val_mask = val_mask
        self.test_mask = test_mask

    def __getitem__(self, i):
        # The dataset holds exactly one graph, so the index is ignored.
        return self.graph

    def __len__(self):
        return 1

class GraphSAGEModel(nn.Module):
    """Sequential stack of DGL ``SAGEConv`` layers.

    The stack is one input layer (``in_feats -> n_hidden``), ``n_layers - 1``
    hidden layers (``n_hidden -> n_hidden``) and one output layer
    (``n_hidden -> out_dim``). Every layer uses ``aggregator_type`` and
    ``dropout`` (as ``feat_drop``); ``activation`` is applied on all layers
    except the output layer, which has none.
    """

    def __init__(self,
                 in_feats,
                 n_hidden,
                 out_dim,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        super().__init__()

        def sage(n_in, n_out, act):
            # All layers share the aggregator and the feature dropout rate.
            return dgl_conv.SAGEConv(n_in, n_out, aggregator_type,
                                     feat_drop=dropout, activation=act)

        # Layers are created in input -> hidden -> output order.
        stack = [sage(in_feats, n_hidden, activation)]
        stack.extend(sage(n_hidden, n_hidden, activation)
                     for _ in range(n_layers - 1))
        stack.append(sage(n_hidden, out_dim, None))
        self.layers = nn.ModuleList(stack)

    def forward(self, g, features):
        """Run ``features`` through every layer on graph ``g`` and return the result."""
        hidden = features
        for sage_layer in self.layers:
            hidden = sage_layer(g, hidden)
        return hidden

class NodeClassification(nn.Module):
    """Wrap a graph-convolution model with a cross-entropy training loss.

    Args:
        gconv_model: callable ``(g, features) -> logits`` with one row per node.
        n_hidden: accepted for interface compatibility; not used.
        n_classes: accepted for interface compatibility; not used.
    """

    def __init__(self, gconv_model, n_hidden, n_classes):
        super().__init__()
        self.gconv_model = gconv_model
        self.loss_fcn = torch.nn.CrossEntropyLoss()

    def forward(self, g, features, train_mask, labels=None):
        """Return the cross-entropy loss over the nodes selected by ``train_mask``.

        Args:
            g: graph passed through to ``gconv_model``.
            features: node feature tensor passed through to ``gconv_model``.
            train_mask: boolean mask (or index) selecting the training nodes.
            labels: per-node class labels. When omitted, the labels stored on
                the graph as ``g.ndata['label']`` are used, so the method no
                longer depends on a module-level ``labels`` variable.
        """
        if labels is None:
            labels = g.ndata['label']
        logits = self.gconv_model(g, features)
        return self.loss_fcn(logits[train_mask], labels[train_mask])

def NCEvaluate(model, g, features, labels, test_mask):
    """Return the classification accuracy of ``model`` on the masked nodes.

    Switches ``model`` to evaluation mode, computes logits with
    ``model.gconv_model`` without tracking gradients, and compares the
    arg-max class of each node selected by ``test_mask`` with its label.

    Returns:
        Fraction of selected nodes predicted correctly, as a float.
    """
    model.eval()
    with torch.no_grad():
        masked_logits = model.gconv_model(g, features)[test_mask]
        expected = labels[test_mask]
        # Predicted class = index of the largest logit in each row.
        predicted = masked_logits.argmax(dim=1)
        n_correct = (predicted == expected).sum().item()
        accuracy = n_correct * 1.0 / len(expected)
    return accuracy

def Train(model, graph, features, train_mask, val_mask, labels, n_epochs):
    """Run ``n_epochs`` full-graph optimisation steps on ``model``.

    Each epoch puts the model in training mode, obtains the loss from
    ``model(graph, features, train_mask)``, and applies one step of the
    module-level ``optimizer``. ``val_mask`` and ``labels`` are accepted but
    not used while the per-epoch validation report below stays disabled.
    """
    for _epoch in range(n_epochs):
        model.train()
        # Forward pass returns the training loss directly.
        epoch_loss = model(graph, features, train_mask)
        optimizer.zero_grad()
        epoch_loss.backward()
        optimizer.step()
        # Disabled per-epoch validation report:
        #   acc = NCEvaluate(model, graph, features, labels, val_mask)
        #   print("Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f}"
        #       .format(epoch, loss.item(), acc))
def Test(model, graph, features, labels, test_mask):
    """Evaluate ``model`` on the nodes in ``test_mask``, print and return the accuracy."""
    accuracy = NCEvaluate(model, graph, features, labels, test_mask)
    print('Testing Accuracy:', accuracy)
    return accuracy

### Parameter tuning test script

In [None]:
# Hyperparameter grid: every combination is trained once and scored on the test mask.
hidden_opts = [1, 4, 8, 16, 64]
dropout_opts = [0.01, 0.1, 0.3, 0.5, 0.8]
agg_type_opts = ['mean', 'pool', 'gcn']
weight_decay_opts = [0, 5e-4, 5e-2]
lr_opts = [1e-2, 1e-3, 1e-4]

SEED = 123

# ---------------------------------------------------------------------------
# Data preparation. Nothing here depends on the hyperparameters, so it is done
# once instead of once per grid point.
# ---------------------------------------------------------------------------
random.seed(SEED)
# NumPy is seeded too because KeywordDataset draws placeholder labels from it.
np.random.seed(SEED)

# Work on an explicit copy so recoding the party column neither mutates nor
# depends on `unique_keywords`. The mapping is passed as a dict because the
# `inplace` argument of Series.replace is keyword-only in pandas >= 2.0.
pol_keywords = unique_keywords[unique_keywords.party.notnull()].copy()
pol_keywords['party'] = pol_keywords['party'].replace(
    {"Democratic Party": 0, "Republican Party": 1})
pol_ids = list(pol_keywords.index)
random.shuffle(pol_ids)

# KeywordDataset reads `adja_matrix`, `pol_keywords` and `pol_ids` from module scope.
dataset = KeywordDataset()

features = dataset.features.float()
in_feats = features.shape[1]
# `labels` stays a module-level name: NodeClassification.forward may read it.
labels = dataset.labels.long()
n_classes = dataset.num_labels

graph = dgl.remove_self_loop(dataset[0])
train_mask = graph.ndata['train_mask']
val_mask = graph.ndata['val_mask']
test_mask = graph.ndata['test_mask']

# Settings shared by every run.
n_layers = 2
n_epochs = 50

# NOTE(review): accuracy is measured on `test_mask` and `val_mask` is unused, so
# picking hyperparameters from results.txt amounts to tuning on the test set.
params = itertools.product(hidden_opts, dropout_opts, agg_type_opts, weight_decay_opts, lr_opts)
with open('/content/drive/MyDrive/hkust_colab/COMP4222/results.txt', "a") as results:
    for paramset in params:
        n_hidden, dropout, aggregator_type, weight_decay, lr = paramset

        # Re-seed torch so every configuration starts from the same RNG state
        # (weight initialisation and dropout); this makes runs comparable and
        # the results file reproducible.
        torch.manual_seed(SEED)

        gconv_model = GraphSAGEModel(in_feats,
                                     n_hidden,
                                     n_classes,
                                     n_layers,
                                     F.leaky_relu,
                                     dropout,
                                     aggregator_type)
        model = NodeClassification(gconv_model, n_hidden, n_classes)

        # `optimizer` stays a module-level name: Train() reads it from module scope.
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        Train(model, graph, features, train_mask, val_mask, labels, n_epochs)
        acc = Test(model, graph, features, labels, test_mask)
        results.write(str(paramset) + ",\t\tAcc: " + str(acc) + "\n")

# Harmonic Node Classification

### Setup and load environment including necessary modules and files

In [None]:
import pandas as pd
import numpy as np
import pickle
import networkx as nx
import matplotlib.pyplot as plt
import random
from networkx.algorithms import node_classification

# Load the adjacency matrix and the keyword table it was built from.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open('/content/drive/MyDrive/hkust_colab/COMP4222/AdjacencyMatrices/200803_201103_AdjacencyMatrix_labeled.pickle', 'rb') as pickle_file:
    # The context manager closes the file handle as soon as loading finishes.
    adja_matrix, unique_keywords = pickle.load(pickle_file)

# Keep only the keywords that carry a party label.
pol_keywords = unique_keywords[unique_keywords.party.notnull()]

### Harmonic classification test script with data preprocessing

In [None]:
# Harmonic-function label propagation, evaluated over repeated hold-out runs.
#
# Past results with different max_iter values:
#    3: R-acc = 0.626, D-acc = 0.716
#    5: R-acc = 0.579, D-acc = 0.735
#   10: R-acc = 0.519, D-acc = 0.770

N_TESTS = 10      # number of hold-out runs; the run number doubles as the sampling seed
N_HELD_OUT = 30   # labelled nodes hidden from the classifier in each run
MAX_ITER = 5      # iterations of the harmonic function

r_acc_cum = 0
d_acc_cum = 0
# Runs in which a party actually appeared in the held-out sample; accuracy for
# that party is undefined (0/0) in any other run.
r_runs = 0
d_runs = 0

# Build the graph once and copy it per run. `from_numpy_matrix` was removed in
# NetworkX 3.0; `from_numpy_array` is its replacement.
base_graph = nx.from_numpy_array(np.asarray(adja_matrix))

# First letter of each labelled node's party ('R' for Republican), keyed by node id.
party_initial = {node: party[0] for node, party in pol_keywords['party'].items()}

for test_num in range(1, N_TESTS + 1):
    KeywordGraph = base_graph.copy()

    # Label all but N_HELD_OUT nodes; a set gives O(1) membership tests.
    not_labelled = set(pol_keywords.sample(n=N_HELD_OUT, random_state=test_num).index)
    for node, initial in party_initial.items():
        if node not in not_labelled:
            KeywordGraph.nodes[node]["party"] = initial

    predicted = node_classification.harmonic_function(
        KeywordGraph, label_name="party", max_iter=MAX_ITER)

    repub_sample = 0
    democ_sample = 0
    repub_correct = 0
    democ_correct = 0
    # Graph nodes are 0..n-1 in matrix order, so predicted[node] is node's prediction.
    for node in not_labelled:
        truth = party_initial[node]
        is_correct = predicted[node] == truth
        if truth == 'R':
            repub_sample += 1
            repub_correct += int(is_correct)
        else:
            democ_sample += 1
            democ_correct += int(is_correct)

    # Guard against a held-out sample that contains no nodes of one party.
    if repub_sample:
        repub_acc = repub_correct / repub_sample
        r_acc_cum += repub_acc
        r_runs += 1
    else:
        repub_acc = float('nan')
    if democ_sample:
        democ_acc = democ_correct / democ_sample
        d_acc_cum += democ_acc
        d_runs += 1
    else:
        democ_acc = float('nan')

    print(f"Test No: {test_num}, Republican Acc: {repub_acc:f}, Democratic Acc: {democ_acc:f}")

# Average over the runs in which each accuracy was defined (all of them in the usual case).
r_acc_mean = r_acc_cum / r_runs if r_runs else float('nan')
d_acc_mean = d_acc_cum / d_runs if d_runs else float('nan')
print(f"{N_TESTS} Pass Results: Republican Acc: {r_acc_mean:f}, Democratic Acc: {d_acc_mean:f}")