In [1]:
from pathlib import Path
from src.datasets import AddNodeDegree

root = Path.cwd()
project_dir = root.resolve().parent
datasets = project_dir / "datasets"

In [3]:
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import QM9


dataset = QM9(root=datasets / 'test' / 'QM9D',pre_transform=AddNodeDegree())
loader =DataLoader(dataset, batch_size=3, shuffle=False)
n = 1
for batch in loader:
    if n > 0:
        n -= 1
    else:
        break
    # batch.x: [N_total, num_features]
    # batch.batch: [N_total] where batch[i]=g means node i belongs to graph g (0 ≤ g < batch_size)
    print(batch.x.shape, batch.batch.shape)
    # e.g. torch.Size([35, 11]) torch.Size([35])
print(len(dataset))
print(dataset[0])
d = dataset[0]
d2 = dataset[4]
d.smiles()

Processing...
100%|██████████| 133885/133885 [00:33<00:00, 3952.11it/s]
Done!


torch.Size([12, 12]) torch.Size([12])
130831
Data(x=[5, 12], edge_index=[2, 8], edge_attr=[8, 4], y=[1, 19], pos=[5, 3], z=[5], smiles='[H]C([H])([H])[H]', name='gdb_1', idx=[1])


TypeError: 'str' object is not callable

In [None]:
from collections import defaultdict

# Utility to gather unique values per feature index
def gather_unique_values(dataset, attr_name, feature_dim):
    uniques = defaultdict(set)
    for data in dataset:
        feat = getattr(data, attr_name)
        # Handle 1D or 2D feature arrays
        if feat is None:
            continue
        feat = feat.detach()
        if feat.dim() == 2:
            for i in range(feat.size(1)):
                uniques[i].update(feat[:, i].unique().tolist())
        elif feat.dim() == 1 and feature_dim is None:
            uniques[0].update(feat.unique().tolist())
        else:
            # treat entire tensor as one feature (e.g., for y)
            uniques[0].update(feat.view(-1).unique().tolist())
    return uniques

# Node features: data.x has shape [total_nodes, 11]
node_uniques = gather_unique_values(dataset, 'x', feature_dim=dataset.num_node_features)
print("Node feature categories per index:")
for idx, vals in sorted(node_uniques.items()):
    print(f"  - Feature {idx}: {len(vals)} distinct values → {sorted(vals)}")

# Edge features: data.edge_attr has shape [total_edges, 4]
edge_uniques = gather_unique_values(dataset, 'edge_attr', feature_dim=dataset.num_edge_features)
print("\nEdge feature categories per index:")
for idx, vals in sorted(edge_uniques.items()):
    print(f"  - Feature {idx}: {len(vals)} distinct values → {sorted(vals)}")

# Graph‐level targets: data.y has shape [1, 19]
# Here we treat each of the 19 targets as one "feature column":
graph_uniques = defaultdict(set)
for data in dataset:
    y = data.y.squeeze()  # shape [19]
    for i, val in enumerate(y.tolist()):
        graph_uniques[i].add(val)
print("\nGraph‐level target categories per index:")
for idx, vals in sorted(graph_uniques.items()):
    print(f"  - Target {idx}: {len(vals)} distinct values")  # continuous, so this will be size = num_graphs

# Additionally, checking atomic numbers (data.z) and hybridization categories:
z_uniques = gather_unique_values(dataset, 'z', feature_dim=None)
print("\nAtomic number categories:", sorted(z_uniques[0]))