In [3]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt

2.3.0+cu121
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
import json
from torch_geometric.data import Data
from torch_geometric.nn import MetaPath2Vec
import sklearn.metrics
import sklearn
import os
from google.colab import drive


In [5]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
data = torch.load(os.path.join(os.getcwd(), 'drive','MyDrive','Summer Enrichment program ', 'updated_twitter_hetero_data.pt'))

In [7]:
data


HeteroData(
  user={
    x=[28839, 768],
    y=[28839],
  },
  tweet={ x=[118697, 768] },
  keyword={ x=[288, 768] },
  (user, to -->, user)={ edge_index=[2, 201142] },
  (user, to -->, keyword)={ edge_index=[2, 64997] },
  (keyword, <-- to, user)={ edge_index=[2, 64997] },
  (user, to -->, tweet)={ edge_index=[2, 162894] },
  (tweet, <-- to, user)={ edge_index=[2, 162894] },
  (tweet, to -->, keyword)={ edge_index=[2, 299997] },
  (keyword, <-- to, tweet)={ edge_index=[2, 299997] }
)

In [8]:
metapath = [
    ('user', 'to -->', 'user'),
        ('user', 'to -->', 'keyword'),
        ('keyword', '<-- to', 'user'),
        ('user', 'to -->', 'tweet'),
        ('tweet', '<-- to', 'user'),
        ('user', 'to -->', 'tweet'),
        ('tweet', 'to -->', 'keyword'),
        ('keyword', '<-- to', 'tweet'),
        ('tweet', '<-- to', 'user'),
]


In [9]:
import torch
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('runing on gpu')
else:
    device = torch.device('cpu')

In [25]:

model = MetaPath2Vec(data.edge_index_dict, embedding_dim=256,
                     metapath=metapath, walk_length=10, context_size=4,
                     walks_per_node=10, num_negative_samples=5,
                     sparse=True).to(device)

loader = model.loader(batch_size=64, shuffle=True, num_workers=2)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.0005) # lr was 0.001, 0.0001

classifier = torch.nn.Sequential(
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 4)
).to(device)

classifier_optimizer = torch.optim.Adam(classifier.parameters(), lr=0.0005, weight_decay=1e-5) # lr 0.001

In [26]:

def train(epoch, train_perm, valid_perm, test_perm, log_steps=200, eval_steps=1000):
    model.train()
    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, Loss: {total_loss / log_steps:.4f}')
            total_loss = 0
        if (i + 1) % eval_steps == 0:
            acc, f1_score = test(test_perm)
            print(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, Acc: {acc:.4f}, f1 score: {f1_score:.4f}')

    y = data['user'].y
    classifier_optimizer.zero_grad()


    # forward prog and get outputs + accuracy on train mask
    user_embs = model('user')
    output = classifier(user_embs)

    _, predicted = torch.max(output[train_perm], dim=1)
    acc_train = sklearn.metrics.accuracy_score(y[train_perm].cpu().numpy(), predicted.cpu().detach().numpy())

    # loss and backprog and updated weight
    pred_loss = F.cross_entropy(output[train_perm], y[train_perm])
    pred_loss.backward()
    classifier_optimizer.step()

    # gain accuracy on val mask
    loss_val = F.cross_entropy(output[valid_perm], y[valid_perm])
    acc_val = sklearn.metrics.accuracy_score(y[valid_perm].cpu().numpy(), torch.max(output[valid_perm], dim=1)[1].cpu().detach().numpy())
    print('Train loss {:.4f}, Train Accuracy: {:.4f}'.format(pred_loss.item(), acc_train),
          'Val loss: {:.4f}, Val Accuracy: {:.4f}'.format(loss_val.item(), acc_val))

@torch.no_grad()
def test(test_perm):
    model.eval()
    # get embeddings and labels
    z = model("user")
    y = data['user'].y

    # get accuracy and f1_score on test perm
    _, predicted = torch.max(z[test_perm], dim=1)
    accuracy = sklearn.metrics.accuracy_score(y[test_perm].cpu().numpy(), predicted.cpu().detach().numpy())
    f1_score = sklearn.metrics.f1_score(y[test_perm].cpu().numpy(), predicted.cpu().detach().numpy(), average="weighted")
    return accuracy, f1_score

for i in range(5):
    accuracy = []
    f1_scores = []
    perm = torch.randperm(28839)
    # create trian, val, and test mask
    train_perm, valid_perm, test_perm = perm[:int(28839 * 0.1)], perm[int(28839 * 0.1):int(28839 * 0.2)], perm[int(28839 *0.2):]
    print('Run: {}'.format(i + 1))
    for epoch in range(1, 10):
        train(epoch, train_perm, valid_perm, test_perm)
        acc, f1_score = test(test_perm)
        print(f'Epoch: {epoch}, Accuracy: {acc:.4f}, F1 score {f1_score:.4f}')
        accuracy.append(acc)
        f1_scores.append(f1_score)
    break

Run: 1
Epoch: 1, Step: 00200/451, Loss: 13.4677
Epoch: 1, Step: 00400/451, Loss: 12.9269
Train loss 1.4170, Train Accuracy: 0.1637 Val loss: 1.4193, Val Accuracy: 0.1671
Epoch: 1, Accuracy: 0.0046, F1 score 0.0090
Epoch: 2, Step: 00200/451, Loss: 12.2983
Epoch: 2, Step: 00400/451, Loss: 11.9040
Train loss 1.3870, Train Accuracy: 0.2400 Val loss: 1.3873, Val Accuracy: 0.2490
Epoch: 2, Accuracy: 0.0046, F1 score 0.0090
Epoch: 3, Step: 00200/451, Loss: 11.4221
Epoch: 3, Step: 00400/451, Loss: 11.0994
Train loss 1.3583, Train Accuracy: 0.3541 Val loss: 1.3595, Val Accuracy: 0.3433
Epoch: 3, Accuracy: 0.0045, F1 score 0.0088
Epoch: 4, Step: 00200/451, Loss: 10.7118
Epoch: 4, Step: 00400/451, Loss: 10.4303
Train loss 1.3271, Train Accuracy: 0.4929 Val loss: 1.3287, Val Accuracy: 0.4743
Epoch: 4, Accuracy: 0.0045, F1 score 0.0088
Epoch: 5, Step: 00200/451, Loss: 10.0916
Epoch: 5, Step: 00400/451, Loss: 9.8572
Train loss 1.3003, Train Accuracy: 0.5994 Val loss: 1.3015, Val Accuracy: 0.5846
Epo

In [13]:
print(acc, f1_score)

0.007541608876560333 0.01470250676699084
