In [15]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt

2.3.0+cu121
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [19]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import json
import os
import pickle
from google.colab import drive
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, GATConv, GINConv, MLP
import sklearn.metrics as metrics

In [20]:
def json_open(file_path):
  with open(file_path, 'r') as f:
    data = json.load(f)
  return data


def pickle_open(file_path):
  with open(file_path, 'rb') as f:
    data = pickle.load(f)
  return data

In [21]:
from google.colab import drive
drive.mount('/content/drive')
os.path.exists('/content/drive/Shareddrives/SEP/Data/edges.json')
path = '/content/drive/Shareddrives/SEP/Data'
edges = json_open(os.path.join(path,'edges.json'))
labels = json_open(os.path.join(path,'labels.json'))
keyword_embeddings = pickle_open(os.path.join(path,'keyword_embeddings.pkl'))
tweet_embeddings = pickle_open(os.path.join(path,'tweet_embeddings.pkl'))
user_embeddings = pickle_open(os.path.join(path,'user_embeddings.pkl'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# get all ids and add them to a dictionary with correspoding index
all_ids = list(list(user_embeddings['ids']) + list(tweet_embeddings['ids']) + list(keyword_embeddings['ids']))
ids_to_index = {ids: i for i, ids in enumerate(all_ids)}

# concatenate all embeddings and add convert to tensor
all_embeddings = np.concatenate([user_embeddings['embeddings'], tweet_embeddings['embeddings'], keyword_embeddings['embeddings']], axis=0)
x = torch.tensor(all_embeddings, dtype=torch.float)

# make edge index correspoding to source_id to target_id
edge_index = [[], []]
for edge in edges:
  source_id = edge['source_id']
  target_id = edge['target_id']
  edge_index[0].append(ids_to_index[source_id])
  edge_index[1].append(ids_to_index[target_id])

edge_index = torch.tensor(edge_index, dtype=torch.long)

# get all the lables in labels
labels_val = set()
for value in labels.values():
  if value not in labels_val:
    labels_val.add(value)

# from the ids list get the label corresponding to ids, if no id is found it is negative (keyword_ids are not in labels)
labels_list = [labels.get(ids, 'Negative') for ids in all_ids]

# make label values to numeric
labels_dict = { label: i for i, label in enumerate(labels_val)}
labels_list = [labels_dict[label] for label in labels_list]

y = torch.tensor(labels_list, dtype=torch.long)




In [23]:
# create mask
num_nodes = x.size(0)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
num_train = int(0.1 * num_nodes)
num_val = int(0.1 * num_nodes)
train_mask[:num_train] = True
val_mask[num_train:num_train + num_val] = True
test_mask[num_train + num_val:] = True

In [7]:
from torch_geometric.data import Data

data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

Data(x=[147824, 768], edge_index=[2, 729030], y=[147824], train_mask=[147824], val_mask=[147824], test_mask=[147824])


In [8]:
torch.save(data, os.path.join(os.getcwd(), 'drive', 'MyDrive','Summer Enrichment program ','twitter_data_homogeneous' ))


In [9]:
data = torch.load(os.path.join(os.getcwd(), 'drive', 'MyDrive','Summer Enrichment program ','twitter_data_homogeneous' ))

In [24]:
class GCN(nn.Module):
    def __init__(self, input_feats, num_hidden, num_classes, dropout = 0.5):
        super(GCN, self).__init__()

        self.gc1 = GCNConv(input_feats, num_hidden)
        self.gc2 = GCNConv(num_hidden, num_classes)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = F.relu(self.gc1(x, edge_index))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, edge_index)
        return F.log_softmax(x, dim=1)

class GAT(nn.Module):
    def __init__(self, features, hidden, classes, heads=4, dropout=0.5):
        super(GAT, self).__init__()

        self.dropout = dropout
        self.gat1 = GATConv(features, hidden, heads=heads, dropout=dropout)
        self.gat2 = GATConv(hidden * heads, classes, heads=1, concat=True, dropout=dropout)

    def forward(self, x, edge_index):
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gat2(x, edge_index)

        return F.log_softmax(x, dim=1)


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
model = GCN(input_feats=data.x.size(1), num_hidden=16, num_classes=data.y.max().item() + 1  ).to(device)
model_gat = GAT(features=data.x.size(1), hidden=16, classes=data.y.max().item() + 1).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
data.to(device)

Data(x=[147824, 768], edge_index=[2, 729030], y=[147824], train_mask=[147824], val_mask=[147824], test_mask=[147824])

In [27]:
def train(model, epoch, features, edge_index, labels, train_mask, val_mask):
    model.train()

    # training
    optimizer.zero_grad()
    output = model(features, edge_index)
    loss_train = F.nll_loss(output[train_mask], labels[train_mask])

    # accuarcy
    preds = output[train_mask].max(1)[1].type_as(labels[train_mask])
    correct = preds.eq(labels[train_mask]).double()
    correct = correct.sum()
    acc_train = correct / len(labels[train_mask])

    loss_train.backward()
    optimizer.step()

    # validation
    loss_val = F.nll_loss(output[val_mask], labels[val_mask])
    preds = output[val_mask].max(1)[1].type_as(labels[val_mask])
    correct = preds.eq(labels[val_mask]).double()
    correct = correct.sum()
    acc_val = correct / len(labels[val_mask])

    if (epoch + 1) % 100 == 0:
      print('Epoch: {:04d}'.format(epoch+1),
            'loss_train: {:.4f}'.format(loss_train.item()),
            'acc_train: {:.4f}'.format(acc_train.item()),
            'loss_val: {:.4f}'.format(loss_val.item()),
            'acc_val: {:.4f}'.format(acc_val.item()))


def test(model, features, edge_index, labels, test_mask):
    model.eval()
    output = model(features, edge_index)
    loss_test = F.nll_loss(output[test_mask], labels[test_mask])

    preds = output[test_mask].max(1)[1].type_as(labels[test_mask])
    correct = preds.eq(labels[test_mask]).double()
    correct = correct.sum()
    acc_test = correct / len(labels[test_mask])

    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
    return acc_test.item()

In [23]:
# gcn
test_acc_list = []
for i in range(5):
  for epoch in range(200):
      train(model, epoch, data.x, data.edge_index, data.y, data.train_mask, data.val_mask)
  acc_test = test(model, data.x, data.edge_index, data.y, data.test_mask)
  test_acc_list.append(acc_test)


Epoch: 0100 loss_train: 0.5271 acc_train: 0.8361 loss_val: 0.3187 acc_val: 0.9576
Epoch: 0200 loss_train: 0.5083 acc_train: 0.8351 loss_val: 0.3012 acc_val: 0.9570
Test set results: loss= 0.2793 accuracy= 0.9976
Epoch: 0100 loss_train: 0.4976 acc_train: 0.8349 loss_val: 0.3005 acc_val: 0.9526
Epoch: 0200 loss_train: 0.4931 acc_train: 0.8376 loss_val: 0.2891 acc_val: 0.9527
Test set results: loss= 0.2417 accuracy= 0.9976
Epoch: 0100 loss_train: 0.4852 acc_train: 0.8399 loss_val: 0.2992 acc_val: 0.9508
Epoch: 0200 loss_train: 0.4836 acc_train: 0.8395 loss_val: 0.2820 acc_val: 0.9499
Test set results: loss= 0.2319 accuracy= 0.9976
Epoch: 0100 loss_train: 0.4829 acc_train: 0.8402 loss_val: 0.2776 acc_val: 0.9502
Epoch: 0200 loss_train: 0.4791 acc_train: 0.8431 loss_val: 0.2774 acc_val: 0.9472
Test set results: loss= 0.2225 accuracy= 0.9976
Epoch: 0100 loss_train: 0.4819 acc_train: 0.8410 loss_val: 0.2695 acc_val: 0.9474
Epoch: 0200 loss_train: 0.4782 acc_train: 0.8422 loss_val: 0.2997 acc_

In [28]:
print('Test accuracy: {:.3f} +/- {:.3f}'.format(np.mean(test_acc_list), np.std(test_acc_list)))

Test accuracy: 0.998 +/- 0.000


In [None]:
# GAT
test_acc_gat_list = []
for i in range(5):
  for epoch in range(200):
      train(model_gat, epoch, data.x, data.edge_index, data.y, data.train_mask, data.val_mask)
  test_acc_gat = test(model_gat, data.x, data.edge_index, data.y, data.test_mask)
  test_acc_gat_list.append(test_acc_gat)

Epoch: 0100 loss_train: 1.3839 acc_train: 0.2496 loss_val: 1.3857 acc_val: 0.2389
Epoch: 0200 loss_train: 1.3841 acc_train: 0.2479 loss_val: 1.3852 acc_val: 0.2347
Test set results: loss= 1.3823 accuracy= 0.2076
Epoch: 0100 loss_train: 1.3878 acc_train: 0.2330 loss_val: 1.3885 acc_val: 0.2220
Epoch: 0200 loss_train: 1.3826 acc_train: 0.2527 loss_val: 1.3831 acc_val: 0.2436
Test set results: loss= 1.3823 accuracy= 0.2076
