In [None]:
import torch
from torch_geometric.datasets import MoleculeNet

import numpy as np
import pandas as pd

import py3Dmol
from rdkit import Chem

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"

# Load the BBBP dataset from MoleculeNet, using data/MoleculeNet as its root directory.
dataset = MoleculeNet(root='data/MoleculeNet', name='BBBP')

# Dataset-level summary, one entry per printed line (the leading '' is a blank line).
dataset_summary = [
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    f'Number of edge features: {dataset.num_edge_features}',
]
for summary_line in dataset_summary:
    print(summary_line)

# Take the first graph as a representative example.
data = dataset[0]

# Per-graph statistics for that first graph, again one entry per printed line.
graph_summary = [
    '',
    str(data),
    '=============================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
for summary_line in graph_summary:
    print(summary_line)

In [None]:
# Get a random graph from the dataset for inspection
# NOTE: the index is pinned to 366 so the same molecule is shown on every run;
# the commented-out call is the random alternative.
i = 366 # np.random.randint(len(dataset))
# Build an RDKit molecule from the SMILES string stored on the selected graph.
m = Chem.MolFromSmiles(dataset[i].smiles)
# Bare last expression so the notebook displays the molecule.
m

# Node features

In [None]:
# Gather node features and targets from every graph in the dataset.
# Each graph is fetched exactly once and the pieces are stacked in a single
# np.vstack call; growing the arrays with np.vstack inside the loop would
# re-copy everything accumulated so far on every iteration.
node_feature_parts = []
label_parts = []
for i in range(len(dataset)):
    graph = dataset[i]
    node_feature_parts.append(graph.x.cpu().detach().numpy())
    label_parts.append(graph.y.cpu().detach().numpy())

x = np.vstack(node_feature_parts)  # rows of every graph's `x`, stacked vertically
y = np.vstack(label_parts)         # rows of every graph's `y`, stacked vertically

# Name the node-feature columns; the list must match the width of `x`.
df_x = pd.DataFrame(
    x,
    columns=[
        'atomic_num', 'chirality', 'degree', 'formal_charge',
        'numH', 'number_radical_e', 'hybridization',
        'is_aromatic', 'is_in_ring'
    ],
)

print(f'All the node features for the dataset: {df_x.shape}')
print('Target class: ', y.shape)

In [None]:
# Preview the first rows of the node-feature table.
df_x.head()

In [None]:
# Draw one percent-normalised histogram for every node-feature column.
for feature_name in df_x.columns:
    feature_fig = px.histogram(
        df_x,
        x=feature_name,
        histnorm='percent',
        height=300,
        width=500,
        title='Distribution of ' + feature_name,
    )
    feature_fig.show()

# Edge features

In [None]:
# Gather the edge attributes of every graph in the dataset.
# Each graph is fetched once and all pieces are stacked in a single np.vstack
# call, instead of re-indexing the dataset and re-copying the growing array
# on every iteration.
edge_attr_parts = []
for i in range(len(dataset)):
    edge_attr_parts.append(dataset[i].edge_attr.cpu().detach().numpy())

# NOTE: `x` is rebound here and now holds edge attributes, not node features.
x = np.vstack(edge_attr_parts)

# Column names are kept exactly as other cells may reference them.
df_edge = pd.DataFrame(
    x,
    columns=['bond_type', 'sterio_configuration', 'is_conjugated'],
)

In [None]:
# Draw one percent-normalised histogram for every edge-feature column.
for edge_feature_name in df_edge.columns:
    edge_fig = px.histogram(
        df_edge,
        x=edge_feature_name,
        histnorm='percent',
        height=300,
        width=500,
        title='Distribution of ' + edge_feature_name,
    )
    edge_fig.show()

In [None]:
# Collapse the stacked target array into one dimension (flatten returns a copy).
y_flat = y.flatten()
# Wrap the targets in a single-column frame so they can be plotted like the features.
df_y = pd.DataFrame({'Target': y_flat})
# Preview the first rows.
df_y.head()

In [None]:
# Percent-normalised histogram of the target column.
target_fig = px.histogram(
    df_y,
    x='Target',
    histnorm='percent',
    height=300,
    width=500,
    title='Distribution of ' + 'Target',
)
target_fig.show()

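Looks like the dataset is highly imbalanced! The cells below count the samples per class and derive per-class weights as `n_samples / (2 * class_count)`.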

In [None]:
# Count how many rows carry each distinct target value.
# NOTE: this rebinds `x`, which earlier cells used for feature arrays.
x = df_y.value_counts()
# Keep only the counts as a plain list.
x = list(x)
# Size of the most frequent class.
max(x)

In [None]:
# Total number of target values.
len(y_flat)

In [None]:
# Occurrences of each integer label; position k holds the count for label k.
np.bincount(y_flat.astype(np.int32))

In [None]:
# Per-class weight: total sample count divided by (2 * class count).
# The factor 2 is presumably the number of classes — confirm against dataset.num_classes.
len(y_flat) / (2 * np.bincount(y_flat.astype(np.int32)))

In [None]:
# Same weight formula written with literal numbers.
# NOTE(review): presumably 2050 is the sample count and 483 one class's count,
# copied from the outputs above — confirm they still match the loaded dataset.
2050 / (2 * 483)