Forked from https://www.kaggle.com/fnands/makegraphinput

introducing a validation set for use in MPNN

# **Graph creator**

A graph is a relatively natural way of representing molecules, and many methods make use of structuring the data in this way.

This kernel shows a basic example of how one can structure our data as a graph.   

Here we will create an array for our node values, and an adjacency matrix for our edge values. 


(**Note**: One can argue whether an adjacency matrix is really the best way to go here, as we have an undirected graph and it is therefore a bit inefficient ( n^2 entries as opposed to n(n-1)/2 ), but it is easy to work with.)

This is by no means the fastest way of doing this, but it is straightforward and only has to be run once, and the output can then be used. 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.spatial import distance_matrix
from tqdm import tqdm_notebook
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
# List what is mounted under the input directory as a quick sanity check.
print(os.listdir("../input"))

# Root directory that every dataset path below is built from.
datadir = "../input/"

## Read in train, test and structures files

In [None]:
# Load the three competition tables in one go: the training couplings, the test
# couplings, and the per-atom structures (element and x/y/z coordinates).
train, test, structures = (
    pd.read_csv(datadir + relative_path)
    for relative_path in (
        'champs-scalar-coupling/train.csv',
        'champs-scalar-coupling/test.csv',
        'champs-scalar-coupling/structures.csv',
    )
)

## Read in bonds files
Taken from:   https://www.kaggle.com/asauve/dataset-with-number-of-bonds-between-atoms  
(thanks Alexandre Sauvé!)

In [None]:
# Load the bond tables for the training and the test molecules. The columns used
# further down are 'molecule_name', 'atom_index_0', 'atom_index_1' and 'nbond'.
train_bonds, test_bonds = (
    pd.read_csv(datadir + relative_path)
    for relative_path in (
        'predicting-molecular-properties-bonds/train_bonds.csv',
        'predicting-molecular-properties-bonds/test_bonds.csv',
    )
)

In [None]:
# Preview the first rows of the training bonds table.
train_bonds.head()

## Read in angles file
Taken from: https://www.kaggle.com/soerendip/calculate-angles-and-dihedrals-with-networkx
(thanks Rakete!)

In [None]:
# Per-atom-pair angle table. The columns used further down are 'molecule_name',
# 'atom_index_0', 'atom_index_1', 'shortest_path_n_bonds', 'cosinus' and 'dihedral'.
angs = pd.read_csv(datadir + "angle-and-dihedral-for-the-champs-structures/angles.csv")


In [None]:
# Preview the first rows of the angles table.
angs.head()

## Normalize targets to the range [0, 1] (min-max scaling), and one-hot encode coupling types

In [None]:
# Min-max scale the target to [0, 1]. The bounds are kept in named variables so
# that model outputs can be mapped back to the original units with
#     y = y_scaled * (scale_max - scale_min) + scale_min
# (Open question from the original author: why not a StandardScaler, or a
# mid-range centring, instead?)
scale_min = train['scalar_coupling_constant'].min()
scale_max = train['scalar_coupling_constant'].max()
train['scalar_coupling_constant'] = (
    (train['scalar_coupling_constant'] - scale_min) / (scale_max - scale_min)
)

# One-hot encode the coupling type (label encoding was tried earlier; the
# one-hot form is affordable now that the output is saved as compressed .npz).
#
# pd.get_dummies names its columns after the categories it actually finds, and
# `frame[list_of_columns] = other_frame` assigns them by position. Re-indexing
# onto the full, fixed list of types guarantees that every column holds the
# type its name says, even if a frame happens to lack one of the types.
# The cast keeps the columns numeric regardless of the pandas default dtype.
coupling_types = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
for frame in (train, test):
    one_hot = (
        pd.get_dummies(frame['type'])
        .reindex(columns=coupling_types, fill_value=0)
        .astype(np.uint8)
    )
    frame[coupling_types] = one_hot

In [None]:
# Plot the distribution of the scaled target as a visual check.
train['scalar_coupling_constant'].hist()

## Pre-process the structures by one-hot encoding the atom types, and min-max normalize the coordinates to the range [0, 1]

In [None]:
# One-hot encode the element of every atom. Re-indexing onto the fixed element
# list makes the (positional) column assignment independent of which elements
# pd.get_dummies happens to find; the cast keeps the columns numeric regardless
# of the pandas default dtype.
atom_types = ['C', 'F', 'H', 'N', 'O']
structures[atom_types] = (
    pd.get_dummies(structures['atom'])
    .reindex(columns=atom_types, fill_value=0)
    .astype(np.uint8)
)

# Min-max scale every coordinate axis to [0, 1].
# NOTE(review): each axis is scaled with its own range, and the inter-atomic
# distances in make_arrs are computed from these scaled coordinates. If the
# x/y/z ranges differ, those distances are stretched differently per axis --
# confirm this is intended (the earlier variant divided all axes by 10).
coord_cols = ['x', 'y', 'z']
coord_min = structures[coord_cols].min()
coord_max = structures[coord_cols].max()
structures[coord_cols] = (structures[coord_cols] - coord_min) / (coord_max - coord_min)

# Nuclear charge (atomic number) per element, scaled by the largest value (F = 9)
# so the feature lies in (0, 1]. Oxygen is 8.0; the previous 8.8 was a typo.
nuclear_charge = {'H': 1.0, 'C': 6.0, 'N': 7.0, 'O': 8.0, 'F': 9.0}
structures['nuclear_charge'] = [nuclear_charge[x] for x in structures['atom'].values]
structures['nuclear_charge'] = structures['nuclear_charge'] / 9.0


In [None]:
# Plot the distribution of each scaled coordinate axis as a visual check.
structures[['x', 'y', 'z']].hist()

## Process bonds

In [None]:
# One-hot encode the bond order (an alternative would be the scalar nbond / 3).
#
# pd.get_dummies only creates columns for the bond orders present in a frame,
# and the multi-column assignment is positional. Re-indexing onto the fixed
# list of bond orders keeps 'nbond_1' ... 'nbond_3' correctly labelled even
# when one of the files lacks a bond order. The cast keeps the columns numeric
# regardless of the pandas default dtype.
bond_orders = [1.0, 1.5, 2.0, 3.0]
bond_cols = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']
for bonds_frame in (test_bonds, train_bonds):
    bond_one_hot = (
        pd.get_dummies(bonds_frame['nbond'])
        .reindex(columns=bond_orders, fill_value=0)
        .astype(np.uint8)
    )
    bond_one_hot.columns = bond_cols
    bonds_frame[bond_cols] = bond_one_hot


## Process angles

In [None]:
# Rescale the two unbounded angle features, then replace missing entries by 0.
angs = angs.assign(
    # Dihedral divided by pi (presumably stored in radians -- TODO confirm).
    dihedral=angs['dihedral'] / np.pi,
    # Path length divided by 6. Open question: should this be one-hot instead?
    shortest_path_n_bonds=angs['shortest_path_n_bonds'] / 6.0,
).fillna(0)

## Find training and testing molecules, and split structures into test and train. Then group by molecule


In [None]:
# Split on molecule names rather than on rows, so that all couplings of one
# molecule end up in the same subset.
train_mol_names, valid_mol_names = train_test_split(
    train['molecule_name'].unique(), test_size=0.2, random_state=42
)

# train_test_split shuffles by default, whereas groupby('molecule_name')
# iterates its groups in sorted key order. make_arrs fills row i of the output
# arrays from the i-th group, so the name arrays are sorted here: name i then
# labels row i of the corresponding arrays.
train_mol_names = np.sort(train_mol_names)
valid_mol_names = np.sort(valid_mol_names)
print(train_mol_names.shape)
print(valid_mol_names.shape)

# .copy() turns the subsets into independent frames, so later modifications do
# not trigger SettingWithCopyWarning. `valid` must be taken before `train` is
# overwritten.
valid = train.loc[train['molecule_name'].isin(valid_mol_names)].copy()
train = train.loc[train['molecule_name'].isin(train_mol_names)].copy()

valid_bonds = train_bonds.loc[train_bonds['molecule_name'].isin(valid_mol_names)]
train_bonds = train_bonds.loc[train_bonds['molecule_name'].isin(train_mol_names)]

print(train.shape)
print(valid.shape)

test_mol_names = np.sort(test['molecule_name'].unique())

# Atom tables per subset
train_structures = structures.loc[structures['molecule_name'].isin(train_mol_names)]
valid_structures = structures.loc[structures['molecule_name'].isin(valid_mol_names)]
test_structures = structures.loc[structures['molecule_name'].isin(test_mol_names)]

train_struct_group = train_structures.groupby('molecule_name')
valid_struct_group = valid_structures.groupby('molecule_name')
test_struct_group  = test_structures.groupby('molecule_name')

# Coupling tables per subset
train_group = train.groupby('molecule_name')
valid_group = valid.groupby('molecule_name')
test_group  = test.groupby('molecule_name')

# Bond tables per subset
train_bond_group = train_bonds.groupby('molecule_name')
valid_bond_group = valid_bonds.groupby('molecule_name')
test_bond_group  = test_bonds.groupby('molecule_name')

# Angle tables per subset
train_angs = angs.loc[angs['molecule_name'].isin(train_mol_names)]
valid_angs = angs.loc[angs['molecule_name'].isin(valid_mol_names)]
test_angs = angs.loc[angs['molecule_name'].isin(test_mol_names)]

train_angs_group = train_angs.groupby('molecule_name')
valid_angs_group = valid_angs.groupby('molecule_name')
test_angs_group  = test_angs.groupby('molecule_name')

# Find max nodes in graph. All three subsets are padded to this size, so the
# maximum is taken over all of them, not only over the training molecules.
max_size = max(
    group.size().max()
    for group in (train_struct_group, valid_struct_group, test_struct_group)
)

## Define node and edge values

In [None]:
# Feature columns stored on every node (atom).
node_vals = ['C', 'F' ,'H', 'N', 'O','nuclear_charge']

# Feature columns stored on every edge (atom pair); the distance channel is
# appended separately and counted in add_edge_dim below.
j_coup_vals = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
bond_vals = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']
ang_vals = ['shortest_path_n_bonds','cosinus','dihedral']
edge_vals = [*j_coup_vals, *bond_vals, *ang_vals]

# Number of molecules in each subset.
n_train_mols, n_valid_mols, n_test_mols = map(
    len, (train_mol_names, valid_mol_names, test_mol_names)
)

# Feature counts for nodes and for each family of edge features.
node_dim, edge_dim = len(node_vals), len(edge_vals)
j_coup_dim, bond_dim, ang_dim = len(j_coup_vals), len(bond_vals), len(ang_vals)

# Extra edge channels that hold the inter-atomic distance.
add_edge_dim = 1

# Report the dimensions, one value per line.
print(node_dim, bond_dim, ang_dim, edge_dim, j_coup_dim, j_coup_vals, sep='\n')

## Pre-allocate arrays that we will fill later


In [None]:
def _zeros32(*shape):
    """Return a zero-filled float32 array of the given shape."""
    return np.zeros(shape, dtype=np.float32)


# Per subset: node features, input edge features (edge features plus the
# distance channel) and, where targets exist, the output edge values.
train_nodes_array = _zeros32(n_train_mols, max_size, node_dim)
train_in_edges_array = _zeros32(n_train_mols, max_size, max_size, edge_dim + add_edge_dim)
train_out_edges_array = _zeros32(n_train_mols, max_size, max_size, 1)

valid_nodes_array = _zeros32(n_valid_mols, max_size, node_dim)
valid_in_edges_array = _zeros32(n_valid_mols, max_size, max_size, edge_dim + add_edge_dim)
valid_out_edges_array = _zeros32(n_valid_mols, max_size, max_size, 1)

# The test subset gets no output-edge array.
test_nodes_array = _zeros32(n_test_mols, max_size, node_dim)
test_in_edges_array = _zeros32(n_test_mols, max_size, max_size, edge_dim + add_edge_dim)

# Number of molecule groups in the validation subset.
print(len(valid_group))

In [None]:
def _symmetric_pair_features(pairs, features):
    """Scatter per-pair feature rows into a symmetric (max_size, max_size, k) array.

    pairs    -- integer array of shape (n_pairs, 2) with the two atom indices.
    features -- array-like of shape (n_pairs, k) with the values for each pair.
    """
    features = np.asarray(features, dtype=np.float64)
    first, second = pairs[:, 0], pairs[:, 1]
    out = np.zeros((max_size, max_size, features.shape[1]))
    out[first, second] = features
    # Mirror the entries so that (a, b) and (b, a) carry the same values.
    out[second, first] = out[first, second]
    return out


# How can we parallelize this?
def make_arrs(val_group, struct_group, bond_group, ang_group, mode='train', debug=False):
    """Fill the pre-allocated node/edge arrays of one subset, one molecule per row.

    The four arguments are groupby('molecule_name') objects over the coupling,
    structure, bond and angle tables of the same subset. They are walked in
    lockstep, and the i-th molecule is written to row i of the module-level
    arrays selected by `mode`:

        'train' -> train_nodes_array, train_in_edges_array, train_out_edges_array
        'valid' -> valid_nodes_array, valid_in_edges_array, valid_out_edges_array
        'test'  -> test_nodes_array,  test_in_edges_array   (no targets)

    Input edge channels are laid out as
        [coupling-type one-hot | bond one-hot | angle features | distance].

    Reads the module-level settings max_size, add_edge_dim, node_vals, node_dim,
    j_coup_vals, bond_vals and ang_vals.

    debug -- when True, print intermediate shapes and stop after three molecules.

    Raises ValueError for an unknown mode, or when the four groupbys do not
    describe the same molecules in the same order (which would otherwise mix
    features of different molecules without any error).
    """
    if mode not in ('train', 'valid', 'test'):
        raise ValueError("mode must be 'train', 'valid' or 'test', got %r" % (mode,))

    groups = (val_group, struct_group, bond_group, ang_group)
    group_counts = [len(group) for group in groups]
    if len(set(group_counts)) != 1:
        # zip() would silently stop at the shortest group.
        raise ValueError(
            "groupbys have different numbers of molecules "
            "(couplings, structures, bonds, angles): %r" % (group_counts,)
        )

    max_debug_mols = 3
    progress = tqdm_notebook(zip(*groups), total=len(val_group))
    for i, (values, structs, bonds, angles) in enumerate(progress):
        if debug and i >= max_debug_mols:
            break

        # Each item is a (molecule_name, sub-frame) pair; all four must agree.
        names = (values[0], structs[0], bonds[0], angles[0])
        if len(set(names)) != 1:
            raise ValueError("groupbys are not aligned at position %d: %r" % (i, names))
        couplings, atoms, bond_rows, angle_rows = values[1], structs[1], bonds[1], angles[1]

        # Pairwise distances, zero-padded up to max_size atoms.
        coords = atoms[['x', 'y', 'z']].values
        n_atoms = coords.shape[0]
        distances = np.zeros((max_size, max_size, add_edge_dim))
        distances[:n_atoms, :n_atoms, 0] = distance_matrix(coords, coords)
        # Can we bin distances here?

        # Node features, zero-padded up to max_size atoms.
        nodes = np.zeros((max_size, node_dim))
        nodes[:n_atoms] = atoms[node_vals].values

        # Coupling-type one-hot per atom pair (usable to filter on type later on).
        ind = couplings[['atom_index_0', 'atom_index_1']].values
        in_feats = _symmetric_pair_features(ind, couplings[j_coup_vals].values)

        # Bond one-hot per bonded atom pair.
        ind_bonds = bond_rows[['atom_index_0', 'atom_index_1']].values
        in_bonds = _symmetric_pair_features(ind_bonds, bond_rows[bond_vals].values)

        # Angle features per atom pair.
        ind_angs = angle_rows[['atom_index_0', 'atom_index_1']].values
        ang_mat = _symmetric_pair_features(ind_angs, angle_rows[ang_vals].values)

        if debug:
            print(atoms)
            print("edges:")
            print(in_feats.shape)
            print(in_bonds.shape)
            print(ang_mat.shape)
            print(distances.shape)

        # Concatenate all edge channels.
        in_edges = np.concatenate((in_feats, in_bonds, ang_mat, distances), axis=2)

        if mode == 'test':
            test_nodes_array[i] = nodes
            test_in_edges_array[i] = in_edges
            continue

        # Targets: couplings whose type is not listed in j_coup_vals are set to
        # zero. This is computed on a fresh array instead of writing into the
        # groupby chunk, which is a chained assignment that pandas warns about
        # and may not apply.
        wanted = couplings['type'].isin(j_coup_vals).values
        targets = np.where(wanted, couplings['scalar_coupling_constant'].values, 0.0)
        out_edges = _symmetric_pair_features(ind, targets[:, np.newaxis])

        if debug:
            print(wanted)
            print(couplings)
            print(out_edges.shape)
            print(out_edges)

        if mode == 'train':
            train_nodes_array[i] = nodes
            train_in_edges_array[i] = in_edges
            train_out_edges_array[i] = out_edges
        else:  # mode == 'valid'
            valid_nodes_array[i] = nodes
            valid_in_edges_array[i] = in_edges
            valid_out_edges_array[i] = out_edges


In [None]:
# Fill the training arrays.
make_arrs(
    val_group=train_group,
    struct_group=train_struct_group,
    bond_group=train_bond_group,
    ang_group=train_angs_group,
    mode='train',
)

In [None]:
# Check distances
print(train_in_edges_array.shape)
# Distance matrix of the first molecule. The distance channel comes directly
# after the edge_dim coupling/bond/angle channels, so it is addressed through
# edge_dim rather than a hard-coded 15; the index stays correct if the edge
# feature lists change.
print(train_in_edges_array[0, :, :, edge_dim])

In [None]:
# Fill the validation arrays.
make_arrs(
    val_group=valid_group,
    struct_group=valid_struct_group,
    bond_group=valid_bond_group,
    ang_group=valid_angs_group,
    mode='valid',
)

In [None]:
# Fill the test arrays (node and input-edge features only).
make_arrs(
    val_group=test_group,
    struct_group=test_struct_group,
    bond_group=test_bond_group,
    ang_group=test_angs_group,
    mode='test',
)

## Save as numpy arrays

In [None]:
# Output file -> array written to it. Each array is passed positionally to
# np.savez_compressed, so inside every archive it is stored under 'arr_0'.
#
# NOTE(review): row i of the node/edge arrays follows the groupby iteration
# order used in make_arrs. Confirm that the saved name arrays are in that same
# order before pairing names with rows downstream.
outputs = (
    ("train_mol_names.npz", train_mol_names),
    ("valid_mol_names.npz", valid_mol_names),
    ("nodes_train.npz", train_nodes_array),
    ("in_edges_train.npz", train_in_edges_array),
    ("out_edges_train.npz", train_out_edges_array),
    ("nodes_valid.npz", valid_nodes_array),
    ("in_edges_valid.npz", valid_in_edges_array),
    ("out_edges_valid.npz", valid_out_edges_array),
    ("nodes_test.npz", test_nodes_array),
    ("in_edges_test.npz", test_in_edges_array),
)
for filename, array in outputs:
    np.savez_compressed(filename, array)