In [1]:
import numpy as np
import pandas as pd

## Creating a Dataset for Node Classification or Link Prediction from CSV


#### 数据集只有一个图

In [34]:
members = pd.read_csv("./members.csv")
interations = pd.read_csv("./interactions.csv")

In [8]:
print(interations.head())
print(members.head())

   Src  Dst    Weight
0    0    1  0.043591
1    0    2  0.282119
2    0    3  0.370293
3    0    4  0.730570
4    0    5  0.821187
   Id    Club  Age
0   0  Mr. Hi   44
1   1  Mr. Hi   37
2   2  Mr. Hi   37
3   3  Mr. Hi   40
4   4  Mr. Hi   30


In [35]:
members["Club"].unique()

array(['Mr. Hi', 'Officer'], dtype=object)

In [36]:
# club 映射为0，1
members["Club"] = members["Club"].astype("category").cat.codes.to_numpy()

In [37]:
members["Club"].unique()

array([0, 1], dtype=int8)

Your custom graph dataset should inherit the dgl.data.DGLDataset class and implement the following methods:

\__getitem\__(self, i): retrieve the i-th example of the dataset. An example often contains a single DGL graph, and occasionally its label.

\__len\__(self): the number of examples in the dataset.

process(self): load and process raw data from disk.

In [2]:
import dgl
import torch
from dgl.data import DGLDataset


class KarateClubDataset(DGLDataset):
    def __init__(self):
        super().__init__(name="karate_club")

    def process(self):
        # sample 来打乱
        nodes_data = pd.read_csv("./members.csv").sample(frac=1)
        edges_data = pd.read_csv("./interactions.csv").sample(frac=1)

        # minmax归一化年龄
        age = nodes_data["Age"].to_numpy()
        age = (age - age.min()) / (age.max() - age.min())

        # club化为01
        club = (nodes_data["Club"]).astype("category").cat.codes.to_numpy()

        node_labels = torch.from_numpy(club)
        node_features = torch.from_numpy(age)

        edges_src = torch.from_numpy(edges_data["Src"].to_numpy())
        edges_dst = torch.from_numpy(edges_data["Dst"].to_numpy())

        num_nodes = len(nodes_data)

        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=num_nodes)
        self.graph.ndata["feat"] = node_features
        self.graph.ndata["label"] = node_labels
        self.graph.edata["weight"] = torch.from_numpy(edges_data["Weight"].to_numpy())

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.

        n_train = int(num_nodes * 0.6)
        n_val = int(num_nodes * 0.2)
        n_test = num_nodes - n_train - n_val

        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)

        train_mask[:n_train] = True
        val_mask[n_train : n_train + n_val] = True
        test_mask[n_train + n_val :] = True

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

In [231]:
n_train = int(num_nodes * 0.6)
n_val = int(num_nodes * 0.2)
n_test = num_nodes - n_train - n_val

KarateClubDataset()

Dataset("karate_club", num_graphs=1, save_path=C:\Users\mi\.dgl\karate_club)

In [230]:
karate_dataset = KarateClubDataset()[0]

Dataset("karate_club", num_graphs=1, save_path=C:\Users\mi\.dgl\karate_club)

In [217]:
karate_dataset.ndata.keys()

dict_keys(['feat', 'label'])

## Creating a Dataset for Graph Classification from CSV

### graph_edges.csv: containing three columns:

graph_id: the ID of the graph.

src: the source node of an edge of the given graph.

dst: the destination node of an edge of the given graph.

### graph_properties.csv: containing three columns:

graph_id: the ID of the graph.

label: the label of the graph.

num_nodes: the number of nodes in the graph.

In [220]:
import urllib.request

In [221]:
# urllib.request.urlretrieve(
#     "https://data.dgl.ai/tutorial/dataset/graph_edges.csv", "./graph_edges.csv"
# )
# urllib.request.urlretrieve(
#     "https://data.dgl.ai/tutorial/dataset/graph_properties.csv",
#     "./graph_properties.csv",
# )

('./graph_properties.csv', <http.client.HTTPMessage at 0x22f2b5ef430>)

In [222]:
edges = pd.read_csv("./graph_edges.csv")
properties = pd.read_csv("./graph_properties.csv")

In [226]:
properties.head()

Unnamed: 0,graph_id,label,num_nodes
0,0,0,15
1,1,0,10
2,2,0,13
3,3,0,13
4,4,0,17


In [227]:
edges.head()

Unnamed: 0,graph_id,src,dst
0,0,0,1
1,0,0,14
2,0,1,0
3,0,1,2
4,0,2,1


In [86]:
class SyntheticDataset(DGLDataset):
    def __init__(self):
        super().__init__(name="synthetic")

    def process(self):
        edges = pd.read_csv("./graph_edges.csv")
        properties = pd.read_csv("./graph_properties.csv")
        self.graphs = []
        self.labels = []

        # Create a graph for each graph ID from the edges table.
        # First process the properties table into two dictionaries with graph IDs as keys.
        # The label and number of nodes are values.
        label_dict = {}
        num_nodes_dict = {}

        for i, row in properties.iterrows():
            label_dict[row["graph_id"]] = row["label"]
            num_nodes_dict[row["graph_id"]] = row["num_nodes"]

        groups = edges.groupby("graph_id")
        for graph_id, g in groups:
            u = torch.from_numpy(g["src"].to_numpy())
            v = torch.from_numpy(g["dst"].to_numpy())

            label = label_dict[graph_id]
            num_nodes = num_nodes_dict[graph_id]

            g = dgl.graph((u, v), num_nodes=num_nodes)
            self.graphs.append(g)
            self.labels.append(label)

        self.labels = torch.tensor(self.labels, dtype=torch.int8)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.labels)

In [87]:
synthetic_datset = SyntheticDataset()

In [88]:
graph, label = synthetic_datset[0]

In [89]:
label

tensor(0, dtype=torch.int8)