## DGL Tutorial Sandbox


In [5]:
import torch.nn.functional as F
import torch.nn as nn
import torch
import dgl.data
import dgl
import os

os.environ["DGLBACKEND"] = "pytorch"

In [6]:
dataset = dgl.data.CoraGraphDataset()

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [7]:
g = dataset[0]  # a dgl dataset object may contain one or multiple graphs
print(g.ndata)  # node features
print(g.edata)  # edge features
print(g.ndata['train_mask'])  # the training set mask
print(g.ndata['val_mask'])  # the validation set mask
print(g.ndata['test_mask'])  # the test set mask
print(g.ndata['label'])  # the ground truth labels
print(g.ndata['feat'])  # the node features

{'feat': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'label': tensor([3, 4, 4,  ..., 3, 3, 3]), 'val_mask': tensor([False, False, False,  ..., False, False, False]), 'test_mask': tensor([False, False, False,  ...,  True,  True,  True]), 'train_mask': tensor([ True,  True,  True,  ..., False, False, False])}
{}
tensor([ True,  True,  True,  ..., False, False, False])
tensor([False, False, False,  ..., False, False, False])
tensor([False, False, False,  ...,  True,  True,  True])
tensor([3, 4, 4,  ..., 3, 3, 3])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [8]:
g.ndata['feat'].shape
print(g.number_of_nodes())
print(g.number_of_edges())
print(g.edges())
print(g.nodes().shape)

2708
10556
(tensor([   0,    0,    0,  ..., 2707, 2707, 2707]), tensor([ 633, 1862, 2582,  ...,  598, 1473, 2706]))
torch.Size([2708])


In [9]:
from dgl.nn import GraphConv


class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h


def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']

    for e in range(10):
        # forward
        logits = model(g, features)

        # compute prediction
        pred = logits.argmax(1)

        # compute loss; Note that we use the mask to filter out invalid entries in the label tensor
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(
            'In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc
            )
        )


model = GCN(
    g.ndata['feat'].shape[1], 16, dataset.num_classes
)  # input feature size, hidden size, output feature size
train(g, model)

In epoch 0, loss: 1.946, val acc: 0.128 (best 0.128), test acc: 0.100 (best 0.100)
In epoch 1, loss: 1.941, val acc: 0.176 (best 0.176), test acc: 0.160 (best 0.160)
In epoch 2, loss: 1.932, val acc: 0.228 (best 0.228), test acc: 0.245 (best 0.245)
In epoch 3, loss: 1.923, val acc: 0.410 (best 0.410), test acc: 0.411 (best 0.411)
In epoch 4, loss: 1.914, val acc: 0.456 (best 0.456), test acc: 0.467 (best 0.467)
In epoch 5, loss: 1.902, val acc: 0.534 (best 0.534), test acc: 0.539 (best 0.539)
In epoch 6, loss: 1.889, val acc: 0.602 (best 0.602), test acc: 0.605 (best 0.605)
In epoch 7, loss: 1.875, val acc: 0.628 (best 0.628), test acc: 0.625 (best 0.625)
In epoch 8, loss: 1.861, val acc: 0.628 (best 0.628), test acc: 0.630 (best 0.625)
In epoch 9, loss: 1.845, val acc: 0.640 (best 0.640), test acc: 0.649 (best 0.649)


## Build the dataset


In [10]:
import urllib.request

import pandas as pd

urllib.request.urlretrieve(
    "https://data.dgl.ai/tutorial/dataset/members.csv", "./members.csv"
)
urllib.request.urlretrieve(
    "https://data.dgl.ai/tutorial/dataset/interactions.csv",
    "./interactions.csv",
)

('./interactions.csv', <http.client.HTTPMessage at 0x7fbce10aba60>)

In [11]:
members = pd.read_csv("./members.csv")
members.head()

Unnamed: 0,Id,Club,Age
0,0,Mr. Hi,44
1,1,Mr. Hi,37
2,2,Mr. Hi,37
3,3,Mr. Hi,40
4,4,Mr. Hi,30


In [12]:
interactions = pd.read_csv("./interactions.csv")
interactions.head()

Unnamed: 0,Src,Dst,Weight
0,0,1,0.043591
1,0,2,0.282119
2,0,3,0.370293
3,0,4,0.73057
4,0,5,0.821187


In [13]:
from dgl.data import DGLDataset
import torch
import dgl
import os

os.environ["DGLBACKEND"] = "pytorch"

In [14]:
# treats the members as nodes and interactions as edges.
# It takes age as a numerical feature of the nodes,
# affiliated club as the label of the nodes, and
# edge weight as a numerical feature of the edges.


class KarateClubDataset(DGLDataset):
    def __init__(self):
        super().__init__(name="karate_club")

    def process(self):
        nodes_data = pd.read_csv("./members.csv")
        edges_data = pd.read_csv("./interactions.csv")
        node_features = torch.from_numpy(nodes_data["Age"].to_numpy())
        node_labels = torch.from_numpy(
            nodes_data["Club"].astype("category").cat.codes.to_numpy()
        )

        # save the mapping rule from label id to raw labels
        mapping_rule = dict(
            zip(nodes_data["Club"].astype("category").cat.codes, nodes_data["Club"])
        )
        torch.save(mapping_rule, "mapping_rule.pt")

        edge_features = torch.from_numpy(edges_data["Weight"].to_numpy())
        edges_src = torch.from_numpy(edges_data["Src"].to_numpy())
        edges_dst = torch.from_numpy(edges_data["Dst"].to_numpy())

        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
        self.graph.ndata["feat"] = node_features
        self.graph.ndata["label"] = node_labels
        self.graph.edata["weight"] = edge_features

        # assign masks indicating whether a node/edge belongs to training/validation/test set.
        n_nodes = nodes_data.shape[0]
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train : n_train + n_val] = True
        test_mask[n_train + n_val :] = True
        self.graph.ndata["train_mask"] = train_mask
        self.graph.ndata["val_mask"] = val_mask
        self.graph.ndata["test_mask"] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1


dataset = KarateClubDataset()
graph = dataset[0]

  node_labels = torch.from_numpy(


In [15]:
mapping_dict = torch.load("mapping_rule.pt")
print(mapping_dict)

{0: 'Mr. Hi', 1: 'Officer'}


In [16]:
graph.ndata['label']

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int8)

In [17]:
mapping_dict[0]

'Mr. Hi'

In [18]:
labels = [mapping_dict[int(label)] for label in graph.ndata['label']]

In [19]:
labels

['Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Officer',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Mr. Hi',
 'Officer',
 'Officer',
 'Mr. Hi',
 'Mr. Hi',
 'Officer',
 'Mr. Hi',
 'Officer',
 'Mr. Hi',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer',
 'Officer']

In [20]:
g = dataset[0]
g

Graph(num_nodes=34, num_edges=156,
      ndata_schemes={'feat': Scheme(shape=(), dtype=torch.int64), 'label': Scheme(shape=(), dtype=torch.int8), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})

In [28]:
from torch_geometric.data import Data

dataset = Data(
    x=g.ndata['feat'],
    edge_index=torch.stack(g.edges()),
    y=g.ndata['label'],
    train_mask=g.ndata['train_mask'],
    val_mask=g.ndata['val_mask'],
    test_mask=g.ndata['test_mask'],
)

print(dataset)
print(dataset.edge_index)

Data(x=[34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,
          3,  3,  3,  3,  3,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,
          7,  7,  8,  8,  8,  8,  8,  9,  9, 10, 10, 10, 11, 12, 12, 13, 13, 13,
         13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21,
         21, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27,
         27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31,
         31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
         33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33],
        [ 1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 17, 19, 21, 31,  0,  2,
          3,  7, 13, 17, 19, 21, 30,  0,  1,  3,  7,  8,  9, 13, 27, 28, 32,  0,
          1,  2,  7, 12, 13,  0,  6, 10,  

## Prepare Dataset


In [23]:
from dgl.data import DGLDataset


class MyDataset(DGLDataset):
    """Template for customizing graph datasets in DGL.

    Parameters
    ----------
    url : str
        URL to download the raw dataset
    raw_dir : str
        Specifying the directory that will store the
        downloaded data or the directory that
        already stores the input data.
        Default: ~/.dgl/
    save_dir : str
        Directory to save the processed dataset.
        Default: the value of `raw_dir`
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information
    """

    def __init__(
        self, url=None, raw_dir=None, save_dir=None, force_reload=False, verbose=False
    ):
        super(MyDataset, self).__init__(
            name='dataset_name',
            url=url,
            raw_dir=raw_dir,
            save_dir=save_dir,
            force_reload=force_reload,
            verbose=verbose,
        )

    def download(self):
        # download raw data to local disk
        pass

    def process(self):
        # process raw data to graphs, labels, splitting masks
        pass

    def __getitem__(self, idx):
        # get one example by index
        pass

    def __len__(self):
        # number of data examples
        pass

    def save(self):
        # save processed data to directory `self.save_path`
        pass

    def load(self):
        # load processed data from directory `self.save_path`
        pass

    def has_cache(self):
        # check whether there are processed data in `self.save_path`
        pass