
Commit

PR #4 update to v0.1.1
Fix bugs and update to v0.1.1
* fix bugs in ogb dataset support
* change data to dataset in solver and trainer
* fix AutoNE for node clf
* fix some typos
Frozenmad committed Dec 23, 2020
2 parents 1633238 + bdad1a3 commit e560ca7
Showing 16 changed files with 217 additions and 149 deletions.
2 changes: 1 addition & 1 deletion autogl/__init__.py
@@ -1 +1 @@
__version__ = "0.1.0"
__version__ = "0.1.1"
50 changes: 24 additions & 26 deletions autogl/datasets/README.md
@@ -1,7 +1,7 @@

Datasets are derived from CogDL
Datasets are derived from PyG, OGB and CogDL.
=================
Autograph now supports the following benchmarks for different tasks:
AutoGL now supports the following benchmarks for different tasks:
- semi-supervised node classification: Cora, Citeseer, Pubmed, Amazon Computers\*, Amazon Photo\*, Coauthor CS\*, Coauthor Physics\*, Reddit (\*: using `utils.random_splits_mask_class` to split these datasets is recommended; a usage sketch follows the table below.)


@@ -16,19 +16,32 @@ Autograph now supports the following benchmarks for different tasks:
| Coauthor Physics | ✓ | | ✓ | ✓ | ✓ | ✓ | | | |
| Reddit | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | |
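The starred datasets ship without a predefined split, so one has to be generated at load time. A minimal sketch of the recommended call, assuming `random_splits_mask_class` accepts per-class train/validation counts (its exact signature lives in `autogl/datasets/utils.py`, outside this diff):

    from autogl.datasets import build_dataset_from_name
    from autogl.datasets.utils import random_splits_mask_class

    # "coauthor_physics" is assumed to be the registered key for Coauthor Physics
    dataset = build_dataset_from_name("coauthor_physics")
    # hypothetical parameters: 20 train / 30 val nodes per class, the rest test
    dataset = random_splits_mask_class(dataset, num_train_per_class=20, num_val_per_class=30)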


- supervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB

| Dataset | PyG | CogDL | x | y | edge_index | edge_attr | train/val/test node | train/val/test mask | adj |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| Mutag | ✓ | | ✓ | ✓ | ✓ | ✓ | | | |
| IMDB-B | ✓ | | | ✓ | ✓ | | | | |
| IMDB-M | ✓ | | | ✓ | ✓ | | | | |
| PROTEINS | ✓ | | ✓ | ✓ | ✓ | | | | |
| COLLAB | ✓ | | | ✓ | ✓ | | | | |

- node classification datasets from OGB: ogbn-products, ogbn-proteins, ogbn-arxiv, ogbn-papers100M and ogbn-mag.

- graph classification datasets from OGB: ogbg-molhiv, ogbg-molpcba, ogbg-ppa and ogbg-code.

---

TODO:
Autograph now supports the following benchmarks for different tasks:
In future versions, AutoGL will support the following benchmarks for different tasks:
- unsupervised node classification: PPI, Blogcatalog, Wikipedia
- semi-supervised node classification: Cora, Citeseer, Pubmed
- heterogeneous node classification: DBLP, ACM, IMDB
- link prediction: PPI, Wikipedia, Blogcatalog
- multiplex link prediction: Amazon, YouTube, Twitter
- unsupervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB
- supervised graph classification: MUTAG, IMDB-B, IMDB-M, PROTEINS, COLLAB

- link prediction datasets from OGB: ogbl-ppa, ogbl-collab, ogbl-ddi, ogbl-citation, ogbl-wikikg and ogbl-biokg.

<!--
| Dataset | PyG | CogDL | x | y | edge_index | edge_attr | train/val/test node | train/val/test mask | adj|
| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| ACM | | ✓ | ✓ | ✓ | ✓ | | ✓ | | ✓ list |
@@ -41,27 +54,12 @@ Autograph now supports the following benchmarks for different tasks:
| Amazon | | ✓ | | | | | ✓ data | | |
| Twitter | | ✓ | | | | | ✓ data | | |
| Youtube | | ✓ | | | | | ✓ data | | |
| Cora | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | |
| Citeseer | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | |
| Pubmed | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | |
| Reddit | ✓ | | ✓ | ✓ | ✓ | ✓ | | ✓ | |
| Mutag | ✓ | | ✓ | ✓ | ✓ | ✓ | | | |
| IMDB-B | ✓ | | | ✓ | ✓ | | | | |
| IMDB-M | ✓ | | | ✓ | ✓ | | | | |
| PROTEINS | ✓ | | ✓ | ✓ | ✓ | | | | |
| COLLAB | ✓ | | | ✓ | ✓ | | | | |
| NCI1 | ✓ | | ✓ | ✓ | ✓ | | | | |
| NCI109 | ✓ | | ✓ | ✓ | ✓ | | | | |
| Enzyme | ✓ | | ✓ | ✓ | ✓ | | | | |
| Reddit-B | ✓ | | | ✓ | ✓ | | | | |
| Reddit-Multi-5k | ✓ | | | ✓ | ✓ | | | | |
| Reddit-Multi-12k | ✓ | | | ✓ | ✓ | | | | |
| PTC-MR | ✓ | | ✓ | ✓ | ✓ | ✓ | | | |
| NCI1 | ✓ | | ✓ | ✓ | ✓ | | | | |
| NCI109 | ✓ | | ✓ | ✓ | ✓ | | | | |
| Enzyme | ✓ | | ✓ | ✓ | ✓ | | | | |
-->

28 changes: 12 additions & 16 deletions autogl/datasets/__init__.py
@@ -1,6 +1,7 @@
import os.path as osp
import importlib
import os
import torch
from ..data.dataset import Dataset


@@ -16,7 +17,7 @@

def register_dataset(name):
"""
New dataset types can be added to autograph with the :func:`register_dataset`
New dataset types can be added to autogl with the :func:`register_dataset`
function decorator.
For example::
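A plausible sketch of such a registration (``MyDataset`` is hypothetical; as the check below requires, it must extend ``autogl.data.Dataset``, and it is instantiated with a cache path)::

    from autogl.datasets import register_dataset
    from autogl.data.dataset import Dataset

    @register_dataset("my_dataset")
    class MyDataset(Dataset):
        def __init__(self, path):
            # load or download the raw files under `path`
            super(MyDataset, self).__init__(path)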
@@ -36,7 +37,7 @@ def register_dataset_cls(cls):
pyg and not issubclass(cls, torch_geometric.data.Dataset)
):
raise ValueError(
"Dataset ({}: {}) must extend autograph.data.Dataset".format(
"Dataset ({}: {}) must extend autogl.data.Dataset".format(
name, cls.__name__
)
)
@@ -105,27 +106,22 @@ def register_dataset_cls(cls):
graph_get_split,
)

"""
# automatically import any Python files in the datasets/ directory
for file in os.listdir(os.path.dirname(__file__)):
if file.endswith(".py") and not file.startswith("_"):
dataset_name = file[: file.find(".py")]
if not pyg and dataset_name.startswith("pyg"):
continue
module = importlib.import_module("autograph.datasets." + dataset_name)
"""


def build_dataset(args, path="~/.cache-autogl/"):
path = osp.join(path, "data", args.dataset)
path = os.path.expanduser(path)
return DATASET_DICT[args.dataset](path)


def build_dataset_from_name(dataset, path="~/.cache-autogl/"):
path = osp.join(path, "data", dataset)
def build_dataset_from_name(dataset_name, path="~/.cache-autogl/"):
path = osp.join(path, "data", dataset_name)
path = os.path.expanduser(path)
return DATASET_DICT[dataset](path)
dataset = DATASET_DICT[dataset_name](path)
if 'ogbn' in dataset_name:
#dataset.data, dataset.slices = dataset.collate([dataset.data])
#dataset.data.num_nodes = dataset.data.num_nodes[0]
if dataset.data.y.shape[-1] == 1:
dataset.data.y = torch.squeeze(dataset.data.y)
return dataset


__all__ = [
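The rewritten `build_dataset_from_name` resolves the registered class from `DATASET_DICT`, expands the cache path, and, for ogbn datasets, squeezes the trailing label dimension so that `y` is one-dimensional for node classification. A usage sketch (assuming "cora" and "ogbn-arxiv" are registered keys, as the rest of this commit suggests):

    from autogl.datasets import build_dataset_from_name

    dataset = build_dataset_from_name("cora")      # cached under ~/.cache-autogl/data/cora
    print(dataset[0].y.shape)                      # 1-D label tensor

    arxiv = build_dataset_from_name("ogbn-arxiv")  # y squeezed from [N, 1] to [N]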
118 changes: 94 additions & 24 deletions autogl/datasets/ogb.py
@@ -3,7 +3,8 @@
from ogb.graphproppred import PygGraphPropPredDataset
from ogb.linkproppred import PygLinkPropPredDataset
from . import register_dataset

from .utils import index_to_mask
from torch_geometric.data import Data
# OGBN


@@ -12,64 +13,133 @@ class OGBNproductsDataset(PygNodePropPredDataset):
def __init__(self, path):
dataset = "ogbn-products"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygNodePropPredDataset(name=dataset, root=path)
super(OGBNproductsDataset, self).__init__(
dataset, path, transform=T.ToSparseTensor()
dataset, path
)
# Pre-compute GCN normalization.
#adj_t = self.data.adj_t.set_diag()
#deg = adj_t.sum(dim=1).to(torch.float)
#deg_inv_sqrt = deg.pow(-0.5)
#deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
#adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
#self.data.adj_t = adj_t

setattr(OGBNproductsDataset, "metric", "Accuracy")
setattr(OGBNproductsDataset, "loss", "nll_loss")

split_idx = self.get_idx_split()
datalist = []
for d in self:
setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0]))
setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0]))
setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0]))
datalist.append(d)
self.data, self.slices = self.collate(datalist)
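Each ogbn wrapper turns OGB's index-based splits into boolean node masks through `index_to_mask`, imported from `.utils` above. That helper is outside this diff; a plausible implementation, following the usual PyG pattern:

    import torch

    def index_to_mask(index, size):
        # boolean mask of length `size`, True at the given node indices
        mask = torch.zeros(size, dtype=torch.bool)
        mask[index] = True
        return mask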

@register_dataset("ogbn-proteins")
class OGBNproteinsDataset(PygNodePropPredDataset):
def __init__(self, path):
dataset = "ogbn-proteins"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygNodePropPredDataset(name=dataset, root=path)
super(OGBNproteinsDataset, self).__init__(
dataset, path, transform=T.ToSparseTensor()
dataset, path
)
dataset_t = PygNodePropPredDataset(name=dataset, root = path, transform=T.ToSparseTensor())

# Move edge features to node features.
self.data.x = dataset_t[0].adj_t.mean(dim=1)
#dataset_t[0].adj_t.set_value_(None)
del dataset_t

setattr(OGBNproteinsDataset, "metric", "ROC-AUC")
setattr(OGBNproteinsDataset, "loss", "BCEWithLogitsLoss")
setattr(OGBNproteinsDataset, "loss", "binary_cross_entropy_with_logits")
split_idx = self.get_idx_split()
datalist = []
for d in self:
setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0]))
setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0]))
setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0]))
datalist.append(d)
self.data, self.slices = self.collate(datalist)


@register_dataset("ogbn-arxiv")
class OGBNarxivDataset(PygNodePropPredDataset):
def __init__(self, path):
dataset = "ogbn-arxiv"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygNodePropPredDataset(name=dataset, root=path)
super(OGBNarxivDataset, self).__init__(
dataset, path, transform=T.ToSparseTensor()
dataset, path
)

#self[0].adj_t = self[0].adj_t.to_symmetric()

setattr(OGBNarxivDataset, "metric", "Accuracy")
setattr(OGBNarxivDataset, "loss", "nll_loss")
split_idx = self.get_idx_split()

datalist = []
for d in self:
setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0]))
setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0]))
setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0]))
datalist.append(d)
self.data, self.slices = self.collate(datalist)

@register_dataset("ogbn-papers100M")
class OGBNpapers100MDataset(PygNodePropPredDataset):
def __init__(self, path):
dataset = "ogbn-papers100M"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygNodePropPredDataset(name=dataset, root=path)
super(OGBNpapers100MDataset, self).__init__(
dataset, path, transform=T.ToSparseTensor()
dataset, path
)
setattr(OGBNpapers100MDataset, "metric", "Accuracy")
setattr(OGBNpapers100MDataset, "loss", "nll_loss")

split_idx = self.get_idx_split()
datalist = []
for d in self:
setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0]))
setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0]))
setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0]))
datalist.append(d)
self.data, self.slices = self.collate(datalist)

@register_dataset("ogbn-mag")
class OGBNmagDataset(PygNodePropPredDataset):
def __init__(self, path):
dataset = "ogbn-mag"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygNodePropPredDataset(name=dataset, root=path)
super(OGBNmagDataset, self).__init__(
dataset, path, transform=T.ToSparseTensor()
dataset, path
)

# Preprocessing
rel_data = self[0]
# We are only interested in paper <-> paper relations.
self.data = Data(
x=rel_data.x_dict['paper'],
edge_index=rel_data.edge_index_dict[('paper', 'cites', 'paper')],
y=rel_data.y_dict['paper'])

#self.data = T.ToSparseTensor()(data)
#self[0].adj_t = self[0].adj_t.to_symmetric()

setattr(OGBNmagDataset, "metric", "Accuracy")
setattr(OGBNmagDataset, "loss", "nll_loss")
split_idx = self.get_idx_split()

datalist = []
for d in self:
setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0]))
setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0]))
setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0]))
datalist.append(d)
self.data, self.slices = self.collate(datalist)


# OGBG
@@ -83,7 +153,7 @@ def __init__(self, path):
PygGraphPropPredDataset(name=dataset, root=path)
super(OGBGmolhivDataset, self).__init__(dataset, path)
setattr(OGBGmolhivDataset, "metric", "ROC-AUC")
setattr(OGBGmolhivDataset, "loss", "BCEWithLogitsLoss")
setattr(OGBGmolhivDataset, "loss", "binary_cross_entropy_with_logits")


@register_dataset("ogbg-molpcba")
Expand All @@ -94,7 +164,7 @@ def __init__(self, path):
PygGraphPropPredDataset(name=dataset, root=path)
super(OGBGmolpcbaDataset, self).__init__(dataset, path)
setattr(OGBGmolpcbaDataset, "metric", "AP")
setattr(OGBGmolpcbaDataset, "loss", "BCEWithLogitsLoss")
setattr(OGBGmolpcbaDataset, "loss", "binary_cross_entropy_with_logits")


@register_dataset("ogbg-ppa")
Expand All @@ -105,7 +175,7 @@ def __init__(self, path):
PygGraphPropPredDataset(name=dataset, root=path)
super(OGBGppaDataset, self).__init__(dataset, path)
setattr(OGBGppaDataset, "metric", "Accuracy")
setattr(OGBGppaDataset, "loss", "CrossEntropyLoss")
setattr(OGBGppaDataset, "loss", "cross_entropy")


@register_dataset("ogbg-code")
Expand All @@ -116,7 +186,7 @@ def __init__(self, path):
PygGraphPropPredDataset(name=dataset, root=path)
super(OGBGcodeDataset, self).__init__(dataset, path)
setattr(OGBGcodeDataset, "metric", "F1 score")
setattr(OGBGcodeDataset, "loss", "CrossEntropyLoss")
setattr(OGBGcodeDataset, "loss", "cross_entropy")
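Throughout this file the loss identifiers change from nn.Module class names (`BCEWithLogitsLoss`, `CrossEntropyLoss`) to their functional counterparts, presumably so the trainer can look the name up directly on `torch.nn.functional`. A sketch of that resolution, stated as an assumption about the trainer rather than code from this commit:

    import torch.nn.functional as F

    def resolve_loss(name):
        # e.g. "cross_entropy" -> F.cross_entropy
        return getattr(F, name)

    loss_fn = resolve_loss("binary_cross_entropy_with_logits")
    # loss = loss_fn(logits, target)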


# OGBL
@@ -127,7 +197,7 @@ class OGBLppaDataset(PygLinkPropPredDataset):
def __init__(self, path):
dataset = "ogbl-ppa"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygLinkPropPredDataset(name=dataset, root=path)
super(OGBLppaDataset, self).__init__(dataset, path)
setattr(OGBLppaDataset, "metric", "Hits@100")
setattr(OGBLppaDataset, "loss", "pos_neg_loss")
@@ -138,7 +208,7 @@ class OGBLcollabDataset(PygLinkPropPredDataset):
def __init__(self, path):
dataset = "ogbl-collab"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygLinkPropPredDataset(name=dataset, root=path)
super(OGBLcollabDataset, self).__init__(dataset, path)
setattr(OGBLcollabDataset, "metric", "Hits@50")
setattr(OGBLcollabDataset, "loss", "pos_neg_loss")
@@ -149,7 +219,7 @@ class OGBLddiDataset(PygLinkPropPredDataset):
def __init__(self, path):
dataset = "ogbl-ddi"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygLinkPropPredDataset(name=dataset, root=path)
super(OGBLddiDataset, self).__init__(dataset, path)
setattr(OGBLddiDataset, "metric", "Hits@20")
setattr(OGBLddiDataset, "loss", "pos_neg_loss")
@@ -160,7 +230,7 @@ class OGBLcitationDataset(PygLinkPropPredDataset):
def __init__(self, path):
dataset = "ogbl-citation"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygLinkPropPredDataset(name=dataset, root=path)
super(OGBLcitationDataset, self).__init__(dataset, path)
setattr(OGBLcitationDataset, "metric", "MRR")
setattr(OGBLcitationDataset, "loss", "pos_neg_loss")
@@ -171,7 +241,7 @@ class OGBLwikikgDataset(PygLinkPropPredDataset):
def __init__(self, path):
dataset = "ogbl-wikikg"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygLinkPropPredDataset(name=dataset, root=path)
super(OGBLwikikgDataset, self).__init__(dataset, path)
setattr(OGBLwikikgDataset, "metric", "MRR")
setattr(OGBLwikikgDataset, "loss", "pos_neg_loss")
@@ -182,7 +252,7 @@ class OGBLbiokgDataset(PygLinkPropPredDataset):
def __init__(self, path):
dataset = "ogbl-biokg"
# path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
PygLinkPropPredDataset(name=dataset, root=path)
super(OGBLbiokgDataset, self).__init__(dataset, path)
setattr(OGBLbiokgDataset, "metric", "MRR")
setattr(OGBLbiokgDataset, "loss", "pos_neg_loss")
