In [1]:
"""
pip install numpy pandas scikit-learn torch dgl
"""

'\npip install numpy pandas scikit-learn torch dgl\n'

In [125]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (12 kB)
Using cached scikit_learn-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl (12.1 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.5.1


In [1]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.data import CoraGraphDataset

import requests
import zipfile
import io
from sklearn.preprocessing import LabelEncoder

In [2]:
dgl.__version__

'1.1.2post1'

In [None]:
"""default dataloader and example codes """

dataset = CoraGraphDataset()
graph = dataset[0]
print(f"Number of nodes: {graph.num_nodes()}")
print(f"Number of edges: {graph.num_edges()}")
print(f"Node features shape: {graph.ndata['feat'].shape}")
print(f"Number of classes: {dataset.num_classes}")

print(f"dataset {type(dataset)}, graph {type(graph)}")

In [3]:
# Download the Cora dataset
# url = "https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz"
# response = requests.get(url)

# # Extract the dataset
# with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
#     zip_ref.extractall("cora")

# Load the dataset into DataFrames
content = pd.read_csv("../data/cora/cora.content", sep="\t", header=None)
citations = pd.read_csv("../data/cora/cora.cites", sep="\t", header=None)

In [85]:
"""
citations<===src-2-dst df
content<=== feature df
"""

'\ncitations<===src-2-dst df\ncontent<=== feature df\n'

In [4]:
unique_paper_ids = content[0].unique()

# Create a mapping from original IDs to new scaled IDs
id_mapping = {old_id: new_id for new_id, old_id in enumerate(unique_paper_ids)}

# Apply the mapping to the content DataFrame
content[0] = content[0].map(id_mapping)

# Apply the mapping to both source and target columns in the citations DataFrame
citations[0] = citations[0].map(id_mapping)
citations[1] = citations[1].map(id_mapping)

In [5]:
# Create a mapping from original IDs to new scaled IDs
feat_mapping = {
    old_id: new_id for new_id, old_id in enumerate(content.iloc[:, -1].unique())
}

# Apply the mapping to the content DataFrame
content.iloc[:, -1] = content.iloc[:, -1].map(feat_mapping)

In [88]:
# citations[0].values
# content

In [6]:
src = torch.from_numpy(citations[0].values)
dst = torch.from_numpy(citations[1].values)

In [7]:
data_dict = {("paper", "cites", "paper"): (src, dst)}
num_nodes_dict = {"paper": len(list(set(content.values[:, 0])))}

In [8]:
custom_graph = dgl.heterograph(data_dict, num_nodes_dict=num_nodes_dict)

In [9]:
print(type(custom_graph))
print(custom_graph)

<class 'dgl.heterograph.DGLGraph'>
Graph(num_nodes=2708, num_edges=5429,
      ndata_schemes={}
      edata_schemes={})


In [93]:
# content.values[:,-1]
# content

In [10]:
node_data = {}
node_data["paper"] = torch.tensor(np.array(content.values[:, 1:-1], dtype=np.int64))
node_data["label"] = torch.tensor(np.array(content.values[:, -1], dtype=np.int64))
node_data

{'paper': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'label': tensor([0, 1, 2,  ..., 5, 6, 0])}

In [11]:
node_data["paper"].shape

torch.Size([2708, 1433])

In [12]:
custom_graph.nodes["paper"].data["feat"] = node_data["paper"]
custom_graph.nodes["paper"].data["label"] = node_data["label"]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
Number of nodes: 2708
Number of edges: 10556
Node features shape: torch.Size([2708, 1433])
Number of classes: 7


dataset <class 'dgl.data.citation_graph.CoraGraphDataset'>, graph <class 'dgl.heterograph.DGLGraph'>


In [13]:
g = custom_graph

In [14]:
# Define the sizes for each split
num_users = g.num_nodes("paper")
train_size = int(0.6 * num_users)
val_size = int(0.2 * num_users)
test_size = num_users - train_size - val_size  # Remaining nodes for the test set

print(f"Train size: {train_size}, Validation size: {val_size}, Test size: {test_size}")

Train size: 1624, Validation size: 541, Test size: 543


In [15]:
# Initialize masks with zeros (False)
train_mask = torch.zeros(num_users, dtype=torch.bool)
val_mask = torch.zeros(num_users, dtype=torch.bool)
test_mask = torch.zeros(num_users, dtype=torch.bool)

# Randomly permute the node indices
indices = np.random.permutation(num_users)

# Assign the masks
train_mask[indices[:train_size]] = True
val_mask[indices[train_size : train_size + val_size]] = True
test_mask[indices[train_size + val_size :]] = True

# Add the masks to the graph as node data
g.nodes["paper"].data["train_mask"] = train_mask
g.nodes["paper"].data["val_mask"] = val_mask
g.nodes["paper"].data["test_mask"] = test_mask

In [16]:
print("Training mask:", g.nodes["paper"].data["train_mask"])
print("Validation mask:", g.nodes["paper"].data["val_mask"])
print("Test mask:", g.nodes["paper"].data["test_mask"])

# Verify the counts
print("Number of training nodes:", g.nodes["paper"].data["train_mask"].sum().item())
print("Number of validation nodes:", g.nodes["paper"].data["val_mask"].sum().item())
print("Number of test nodes:", g.nodes["paper"].data["test_mask"].sum().item())

Training mask: tensor([ True, False,  True,  ...,  True,  True, False])
Validation mask: tensor([False,  True, False,  ..., False, False, False])
Test mask: tensor([False, False, False,  ..., False, False,  True])
Number of training nodes: 1624
Number of validation nodes: 541
Number of test nodes: 543


In [17]:
from dgl.nn import GraphConv


class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, num_classes, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [18]:
"""dataloader to load huge datasets<--needs to correct later"""

from dgl.dataloading import NodeDataLoader

train_nids = g.nodes["paper"].data["train_mask"].nonzero().squeeze()
val_nids = g.nodes["paper"].data["val_mask"].nonzero().squeeze()

train_dataloader = NodeDataLoader(
    g,
    {"paper": train_nids},
    sampler=dgl.dataloading.MultiLayerNeighborSampler([15, 10]),
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

val_dataloader = NodeDataLoader(
    g,
    {"paper": val_nids},
    sampler=dgl.dataloading.MultiLayerNeighborSampler([15, 10]),
    batch_size=32,
    shuffle=False,
    num_workers=4,
)

ImportError: cannot import name 'NodeDataLoader' from 'dgl.dataloading' (/Users/hasan.iqbal/anaconda3/envs/gnn_env/lib/python3.9/site-packages/dgl/dataloading/__init__.py)

7

In [19]:
# model = GCN(g.ndata["feat"].shape[1], 16, dataset.num_classes)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model = GCN(g.ndata["feat"].shape[1], 16, len(set(list(g.ndata["label"].numpy()))))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [20]:
def train(model, graph, epochs):
    features = graph.ndata["feat"]
    labels = graph.ndata["label"]
    train_mask = graph.ndata["train_mask"]
    val_mask = graph.ndata["val_mask"]

    for epoch in range(epochs):
        model.train()
        logits = model(graph, features)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc = (logits[train_mask].argmax(1) == labels[train_mask]).float().mean()
        val_acc = (logits[val_mask].argmax(1) == labels[val_mask]).float().mean()

        print(
            f"Epoch {epoch+1} | Loss: {loss.item()} | Train Accuracy: {train_acc.item()} | Val Accuracy: {val_acc.item()}"
        )

In [21]:
train(model, custom_graph, 10000)

Epoch 1 | Loss: 1.954598307609558 | Train Accuracy: 0.16317734122276306 | Val Accuracy: 0.16081330180168152
Epoch 2 | Loss: 1.9205938577651978 | Train Accuracy: 0.4445812702178955 | Val Accuracy: 0.43992605805397034
Epoch 3 | Loss: 1.8902266025543213 | Train Accuracy: 0.47967979311943054 | Val Accuracy: 0.4731977880001068
Epoch 4 | Loss: 1.8538986444473267 | Train Accuracy: 0.511083722114563 | Val Accuracy: 0.5027726292610168
Epoch 5 | Loss: 1.813647985458374 | Train Accuracy: 0.546798050403595 | Val Accuracy: 0.5378928184509277
Epoch 6 | Loss: 1.771681308746338 | Train Accuracy: 0.5646551847457886 | Val Accuracy: 0.5489833354949951
Epoch 7 | Loss: 1.7285150289535522 | Train Accuracy: 0.5837438702583313 | Val Accuracy: 0.5619223713874817
Epoch 8 | Loss: 1.6844251155853271 | Train Accuracy: 0.5917487740516663 | Val Accuracy: 0.5656192302703857
Epoch 9 | Loss: 1.6398133039474487 | Train Accuracy: 0.5997536778450012 | Val Accuracy: 0.5748613476753235
Epoch 10 | Loss: 1.5950641632080078 | 

In [22]:
def evaluate(model, graph):
    model.eval()
    features = graph.ndata["feat"]
    labels = graph.ndata["label"]
    test_mask = graph.ndata["test_mask"]

    with torch.no_grad():
        logits = model(graph, features)
        test_acc = (logits[test_mask].argmax(1) == labels[test_mask]).float().mean()
        print(f"Test Accuracy: {test_acc.item()}")


evaluate(model, custom_graph)

Test Accuracy: 0.6243094205856323
