In [None]:
import torch
from torch_geometric.datasets import MoleculeNet

import numpy as np
import pandas as pd

import py3Dmol
from rdkit import Chem

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"

# Load the HIV dataset from MoleculeNet; files are kept under data/MoleculeNet.
dataset = MoleculeNet(root='data/MoleculeNet', name='HIV')

# Dataset-level summary, printed as "label: value" lines.
dataset_summary = {
    'Number of graphs': len(dataset),
    'Number of features': dataset.num_features,
    'Number of classes': dataset.num_classes,
    'Number of edge features': dataset.num_edge_features,
}

print()
print(f'Dataset: {dataset}:')
print('====================')
for label, value in dataset_summary.items():
    print(f'{label}: {value}')

data = dataset[0]  # The first graph object, used as a worked example.

print()
print(data)
print('=============================================================')

# Statistics describing the first graph only.
first_graph_stats = {
    'Number of nodes': data.num_nodes,
    'Number of edges': data.num_edges,
    'Average node degree': f'{data.num_edges / data.num_nodes:.2f}',
    'Has isolated nodes': data.has_isolated_nodes(),
    'Has self-loops': data.has_self_loops(),
    'Is undirected': data.is_undirected(),
}
for label, value in first_graph_stats.items():
    print(f'{label}: {value}')

# Visualization of the molecules

In [None]:
def draw_molecule(mol):
    """Show a molecule in an interactive py3Dmol viewer.

    Parameters
    ----------
    mol
        Molecule object accepted by ``Chem.MolToMolBlock`` (for example the
        result of ``Chem.MolFromSmiles``).

    Returns
    -------
    The value returned by the viewer's ``show()`` call; the rendering itself
    happens as a side effect of that call.
    """
    # confId=-1 asks RDKit for the molecule's default conformer.
    mol_block = Chem.MolToMolBlock(mol, confId=-1)

    view = py3Dmol.view(width=400, height=400)
    view.addModel(mol_block, 'sdf')
    # 3Dmol recognises 'stick' (not 'stack') as a style key; an unknown key
    # leaves the atoms without any drawable style.
    view.setStyle({'stick': {}})
    view.setBackgroundColor('0xeeeeee')
    view.zoomTo()

    return view.show()

In [None]:
# Inspect one graph from the dataset. The index is pinned so the cell is
# reproducible; swap in the commented expression to sample a random graph.
i = 3666  # np.random.randint(len(dataset))
smiles = dataset[i].smiles
m = Chem.MolFromSmiles(smiles)
m

# Feature Engineering

In [None]:
# Collect the per-graph arrays first and stack them once at the end.
# Calling np.vstack inside the loop would re-copy everything accumulated so
# far on every iteration, which is quadratic in the dataset size.
node_feature_chunks = []
target_chunks = []

for i in range(len(dataset)):
    graph = dataset[i]  # index the dataset once per graph
    node_feature_chunks.append(graph.x.cpu().detach().numpy())
    target_chunks.append(graph.y.cpu().detach().numpy())

# x: node features of every graph stacked row-wise; y: stacked targets.
x = np.vstack(node_feature_chunks)
y = np.vstack(target_chunks)

df_x = pd.DataFrame(x)

In [None]:
# Report the size of the stacked node-feature table, then preview its first rows.
node_table_shape = df_x.shape
print(f'All the node features for the dataset: {node_table_shape}')
df_x.head(n=5)

`df_x` contains the node features of every graph in the dataset, stacked row-wise (one row per node). Each node has 9 features.

In [None]:
# Shape of the target array stacked from every graph's `y` in the loop above.
y.shape

## Node features

In [None]:
# Human-readable names for the node-feature columns, in column order.
node_feature_names = [
    'atomic_num',
    'chirality',
    'degree',
    'formal_charge',
    'numH',
    'number_radical_e',
    'hybridization',
    'is_aromatic',
    'is_in_ring',
]
df_x.columns = node_feature_names

# One percentage-normalised histogram per node feature.
for feature in df_x.columns:
    fig = px.histogram(
        df_x,
        x=feature,
        histnorm='percent',
        height=300,
        width=500,
        title=f'Distribution of {feature}',
    )
    fig.show()

## Edge features

In [None]:
# Gather every graph's edge attributes and stack them once; stacking inside
# the loop would re-copy the growing array on each iteration.
edge_attr_chunks = []

for i in range(len(dataset)):
    edge_attr_chunks.append(dataset[i].edge_attr.cpu().detach().numpy())

# NOTE: `x` is reused here and from now on holds edge attributes, not the
# node features stored in it by the earlier stacking cell.
x = np.vstack(edge_attr_chunks)

df_edge = pd.DataFrame(x)
df_edge.columns = ['bond_type', 'sterio_configuration', 'is_conjugated']

In [None]:
# One percentage-normalised histogram per edge feature.
for feature in df_edge.columns:
    fig = px.histogram(
        df_edge,
        x=feature,
        histnorm='percent',
        height=300,
        width=500,
        title=f'Distribution of {feature}',
    )
    fig.show()

## What are the targets?

In [None]:
# Collapse the stacked target array into a 1-D vector so it can be used as a
# single DataFrame column (presumably one label per graph; verify with y.shape).
y = y.flatten()

In [None]:
# Single-column table holding the flattened targets.
df_y = pd.Series(y, name='Target').to_frame()

In [None]:
# Percentage-normalised histogram of the target labels.
target_fig = px.histogram(
    df_y,
    x='Target',
    histnorm='percent',
    height=300,
    width=500,
    title='Distribution of Target',
)
target_fig.show()

So the dataset is imbalanced: about 97% of the samples belong to class 0 and only about 3% to class 1.

In [None]:
# Preview the node-feature table again, now that its columns carry names.
df_x.head()

In [None]:
# Distinct chirality codes present in the data; this column is treated as
# categorical and one-hot encoded further down.
df_x['chirality'].unique()

In [None]:
# Numeric node features that are rescaled.
cols_to_normalize = [
    'atomic_num', 'degree',
    'formal_charge',
    'numH',
    'number_radical_e'
]

# Categorical node features that are one-hot encoded.
cols_to_encode = [
    'chirality',
    'hybridization'
]

# Rescaling scheme for the numeric columns: 'min-max' or 'normal' (z-score).
METHOD = 'min-max'

# Fail early on a typo instead of silently collecting no statistics.
if METHOD not in ('normal', 'min-max'):
    raise ValueError(f"Unknown METHOD {METHOD!r}; expected 'normal' or 'min-max'")

scalers = {}

for c in cols_to_normalize:
    if METHOD == 'normal':
        scalers[c] = {'mean': df_x[c].mean(), 'std': df_x[c].std()}
    else:
        scalers[c] = {'min': df_x[c].min(), 'max': df_x[c].max()}

# The one-hot step sizes its output from the largest observed code, so the
# categorical columns need min/max whichever METHOD is selected.
for c in cols_to_encode:
    scalers[c] = {'min': df_x[c].min(), 'max': df_x[c].max()}

# Bond types use a fixed range rather than one measured from the data
# (presumably the smallest/largest bond-type codes of interest; TODO confirm).
scalers['bond_type'] = {'min': 1, 'max': 12}

In [None]:
# Inspect the per-column statistics collected in the previous cell.
scalers

In [None]:
def _one_hot(codes, num_classes):
    """Return a ``(codes.size, num_classes)`` float array with a 1 at each row's code."""
    encoded = np.zeros((codes.size, num_classes))
    encoded[np.arange(codes.size), codes] = 1
    return encoded


def _rescale(values, stats, method):
    """Rescale ``values`` with pre-computed column statistics.

    ``method='normal'`` applies ``(v - mean) / std`` and ``method='min-max'``
    applies ``(v - min) / (max - min)``; ``stats`` must hold the matching keys.
    Raises ``ValueError`` for any other method.
    """
    if method == 'normal':
        shift, scale = stats['mean'], stats['std']
    elif method == 'min-max':
        shift, scale = stats['min'], stats['max'] - stats['min']
    else:
        raise ValueError(f"Unknown METHOD {method!r}; expected 'normal' or 'min-max'")
    # A constant column has zero spread; dividing by it would fill the column
    # with NaN/inf, so leave such a column merely shifted.
    if scale == 0:
        scale = 1
    return (values - shift) / scale


# Column positions do not change from graph to graph, so resolve them once.
feature_names = list(df_x.columns)
normalize_targets = [(c, feature_names.index(c)) for c in cols_to_normalize]
encode_targets = [(c, feature_names.index(c)) for c in cols_to_encode]
encoded_positions = {pos for _, pos in encode_targets}
kept_positions = [pos for pos in range(len(feature_names)) if pos not in encoded_positions]
num_bond_classes = scalers['bond_type']['max'] + 1

dataset_new = []

for graph_idx in range(len(dataset)):
    data = dataset[graph_idx]
    x_norm = data.x.detach().cpu().numpy().astype(float)
    # The first edge-attribute column is the bond type.
    bond_type = data.edge_attr[:, 0].detach().cpu().numpy()

    # Rescale the numeric node features in place.
    for c, pos in normalize_targets:
        x_norm[:, pos] = _rescale(x_norm[:, pos], scalers[c], METHOD)

    # One-hot encode the categorical node features. They are not listed in
    # cols_to_normalize, so they still hold their raw integer codes here.
    one_hot_blocks = [
        _one_hot(x_norm[:, pos].astype(int), scalers[c]['max'] + 1)
        for c, pos in encode_targets
    ]

    # Drop the raw categorical columns and append their encodings at the end.
    x_norm = np.hstack([x_norm[:, kept_positions]] + one_hot_blocks)

    # Bond type as a scalar weight, scaled with the fixed bond-type range ...
    edge_w_norm = _rescale(bond_type.astype(float), scalers['bond_type'], 'min-max')
    # ... and as a one-hot vector.
    edge_a_norm = _one_hot(bond_type.astype(int), num_bond_classes)

    # Attach the engineered tensors to the graph object.
    data.x_norm = torch.tensor(x_norm, dtype=torch.float)
    data.edge_w_norm = torch.tensor(edge_w_norm, dtype=torch.float)
    data.edge_a_norm = torch.tensor(edge_a_norm, dtype=torch.float)

    dataset_new.append(data)

In [None]:
# Inspect the first processed graph; it now also carries the x_norm,
# edge_w_norm and edge_a_norm attributes added in the loop above.
dataset_new[0]