# **Graph creator**

A graph is a relatively natural way of representing molecules, and many methods make use of structuring the data in this way.

This kernel shows a basic example of how one can structure our data as a graph.   

Here we will create an array for our node values, and an adjacency matrix for our edge values. 


(**Note**: One can argue whether an adjacency matrix is really the best way to go here, as we have an undirected graph and it is therefore a bit inefficient ( n^2 entries as opposed to n(n-1)/2 ), but it is easy to work with.)

This is by no means the fastest way of doing this, but it is straightforward and only has to be run once, and the output can then be used. 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.spatial import distance_matrix

from sklearn import preprocessing
import os
# Root folder under which every input dataset used below is mounted
datadir = "../input/"

# Show which datasets are available
print(os.listdir(datadir))

## Read in train, test and structures files

In [None]:
# Load the three competition tables in one go: the coupling pairs of the
# train and test sets, and the per-atom structures table.
train, test, structures = (
    pd.read_csv(datadir + csv_path)
    for csv_path in (
        'champs-scalar-coupling/train.csv',
        'champs-scalar-coupling/test.csv',
        'champs-scalar-coupling/structures.csv',
    )
)

In [None]:
# Peek at the first rows of the training table
train.head()

In [None]:
# Predictions of an earlier blend, used as targets (pseudo-labels) for the test set
pseudolabels = pd.read_csv(datadir + "pseudolabels/simple_blend_nn_lgb_hill.csv")

# NOTE(review): the column is aligned on the row index, not on 'id' --
# presumably the pseudo-label file has the same row order as test.csv; confirm.
test = test.assign(scalar_coupling_constant=pseudolabels['scalar_coupling_constant'])

## Read in bonds files
Taken from:   https://www.kaggle.com/asauve/dataset-with-number-of-bonds-between-atoms  
(thanks Alexandre Sauvé!)

In [None]:
# Bond tables (one row per bonded atom pair) for the train and test molecules
train_bonds, test_bonds = (
    pd.read_csv(datadir + csv_path)
    for csv_path in (
        'predicting-molecular-properties-bonds/train_bonds.csv',
        'predicting-molecular-properties-bonds/test_bonds.csv',
    )
)

## Read in angles file
Taken from: https://www.kaggle.com/soerendip/calculate-angles-and-dihedrals-with-networkx
(thanks Rakete!)

In [None]:
# Per-pair angle table; the columns used later are 'molecule_name',
# 'atom_index_0', 'atom_index_1', 'shortest_path_n_bonds', 'cosinus' and 'dihedral'.
angs = pd.read_csv(datadir + "angle-and-dihedral-for-the-champs-structures/angles.csv")

## Read angles and torsions

In [None]:
# Angle / torsion features per coupling pair (keyed by 'id'), test set first
ang_tor_test, ang_tor_train = (
    pd.read_hdf(datadir + h5_path)
    for h5_path in (
        "ang-tor/test_ang_tor.h5",
        "ang-tor/train_ang_tor.h5",
    )
)

## Read neighbours struct file

In [None]:
# Neighbour table; the columns 1, 6, 7, 8 and 9 are used further down.
# NOTE(review): presumably one row per atom in the same order as structures.csv -- confirm.
neig_struct = pd.read_hdf(datadir + "ang-tor/struct_neighbours.h5")


## Read in the mulliken charges from qm9

In [None]:
# Mulliken charges, later joined onto the structures table
# via ('molecule_name', 'atom_index')
mulliken = pd.read_csv(datadir + 'qm9-mulliken/mulliken_charges_fom_qm9.csv')

## Normalize targets so they are centered around 0 and have a max of 1, and one-hot encode coupling types

In [None]:
# Normalise the targets separately for every coupling type so that each type
# is centred on 0 and spans [-1, 1] on the training set.
coups_to_isolate = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
for coup in coups_to_isolate:
    # Row selectors for this coupling type, computed once per table
    train_mask = train['type'] == coup
    test_mask = test['type'] == coup

    # The scaling statistics always come from the TRAIN targets
    train_targets = train.loc[train_mask, 'scalar_coupling_constant']
    scale_min = train_targets.min()
    scale_max = train_targets.max()
    scale_mid = (scale_max + scale_min)/2
    scale_norm = scale_max - scale_mid

    # Printed so the values can be noted down (presumably needed to undo the
    # scaling on predictions -- they are not stored anywhere else here)
    print(scale_norm, scale_mid)

    # Write back with a single .loc[rows, column] call. The chained form
    # df[column].loc[rows] = ... assigns into an intermediate Series, which
    # triggers SettingWithCopyWarning and leaves the frame unchanged when
    # pandas Copy-on-Write is active.
    train.loc[train_mask, 'scalar_coupling_constant'] = (train_targets - scale_mid)/scale_norm
    test.loc[test_mask, 'scalar_coupling_constant'] = (
        test.loc[test_mask, 'scalar_coupling_constant'] - scale_mid
    )/scale_norm

In [None]:
#scale_min  = train['scalar_coupling_constant'].min()
#scale_max  = train['scalar_coupling_constant'].max()
#scale_mid = (scale_max + scale_min)/2
#scale_norm = scale_max - scale_mid

#train['scalar_coupling_constant'] = (train['scalar_coupling_constant'] - scale_mid)/scale_norm

# One-hot encoding used to get too big for Kaggle (label encoding was tried
# instead); the output is saved as compressed .npz now, so one-hot is back.
coupling_types = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']

# Assigning a DataFrame to a list of columns maps the columns by POSITION.
# Selecting the dummy columns by name first guarantees that every indicator
# lands under its own label, and gives an all-zero column (instead of an
# error or a silently shifted mapping) if a type is absent from a table.
train[coupling_types] = pd.get_dummies(train['type']).reindex(columns=coupling_types, fill_value=0)
test[coupling_types] = pd.get_dummies(test['type']).reindex(columns=coupling_types, fill_value=0)

#le = preprocessing.LabelEncoder()
#le.fit(['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'])
#train['l_type'] = (le.transform(train['type']) + 1)/8.
#test['l_type'] = (le.transform(test['type']) + 1)/8.

## Pre-process the structures by one-hot encoding the atom types, and scale the coordinates so that distances have a maximum of roughly 1

In [None]:
# One-hot encode the element of every atom. The dummy columns are selected by
# name before the assignment, because assigning a DataFrame to a list of
# columns pairs them up by position; an element that does not occur simply
# yields an all-zero column.
atom_types = ['C', 'F', 'H', 'N', 'O']
structures[atom_types] = pd.get_dummies(structures['atom']).reindex(columns=atom_types, fill_value=0)

# Shrink the coordinates by a factor of 10 so that the inter-atomic distances
# computed from them later are scaled down accordingly
structures[['x', 'y', 'z']] = structures[['x', 'y', 'z']]/10.

In [None]:
# Attach the Mulliken charge of every atom. Left join: atoms without a charge
# entry keep their row and get NaN in 'mulliken_charge'.
structures = structures.merge(
    mulliken[['molecule_name', 'atom_index', 'mulliken_charge']],
    how='left',
    on=['molecule_name', 'atom_index'],
)

## Process bonds

In [None]:
# One-hot encode the bond order of every bond, in both bond tables
# (a plain scaling, nbond/3, was the earlier alternative).
for bond_table in (test_bonds, train_bonds):
    bond_table[['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']] = pd.get_dummies(bond_table['nbond'])
del bond_table  # do not leave the loop variable behind in the notebook namespace


## Process angles

In [None]:
# Rescale the angle features and replace missing entries by 0:
#   dihedral              -> divided by pi
#   shortest_path_n_bonds -> divided by 6 (should this rather be one-hot encoded?)
angs = angs.assign(
    dihedral=angs['dihedral'].div(np.pi),
    shortest_path_n_bonds=angs['shortest_path_n_bonds'].div(6.0),
).fillna(0)

## Process angles and torsions

In [None]:
# Translate the numeric codes stored in 'ThirdAtom' / 'SecondAtom' into element symbols
MapAtoms = {6.0 : 'C', 7.0 : 'N', 8.0 : 'O'}
for ang_tor_table in (ang_tor_test, ang_tor_train):
    for atom_col in ('ThirdAtom', 'SecondAtom'):
        ang_tor_table[atom_col] = ang_tor_table[atom_col].replace(MapAtoms)
del ang_tor_table, atom_col  # keep the notebook namespace as it was

In [None]:
# Finer-grained atom typings that were considered earlier:
#ThirdAtomNames = ['No', 'C1', 'C2', 'C3', 'C+', 'Car', 'O2', 'O3', 'Nam', 'Nar', 'N2', 'N3', 'N3+', 'Npl',  
#        'Ng+', 'Nox']
#SecondAtomNames = ['No', 'C3', 'N3', 'O3', 'C1', 'C2', 'Nam', 'O2', 'Nar', 'Car',
#       'N2', 'Npl', 'N3+', 'Ng+']

# Elements that get their own indicator column
ThirdAtomNames = ['C','N','O']
SecondAtomNames = ['C','N','O']

# Output column names: element symbol + "T" (third atom) / "S" (second atom)
T = [symbol + "T" for symbol in ThirdAtomNames]
S = [symbol + "S" for symbol in SecondAtomNames]

# One-hot encode both columns in both tables; the dummy columns are picked
# by name, so any other value in the column gets no indicator of its own.
for ang_tor_table in (ang_tor_test, ang_tor_train):
    ang_tor_table[T] = pd.get_dummies(ang_tor_table['ThirdAtom'])[ThirdAtomNames]
    ang_tor_table[S] = pd.get_dummies(ang_tor_table['SecondAtom'])[SecondAtomNames]
del ang_tor_table  # keep the notebook namespace as it was

In [None]:
#MapAtoms = dict(enumerate(ThirdAtomNames))
#MapAtoms = {val:key for (key, val) in MapAtoms.items()}

In [None]:
#ang_tor_train['ThirdAtom_l'] = ang_tor_train['ThirdAtom'].replace(MapAtoms)
#ang_tor_test['ThirdAtom_l'] =  ang_tor_test['ThirdAtom'].replace(MapAtoms)

#ang_tor_train['SecondAtom_l'] = ang_tor_train['SecondAtom'].replace(MapAtoms)
#ang_tor_test['SecondAtom_l'] =  ang_tor_test['SecondAtom'].replace(MapAtoms)

#ang_tor_test['ThirdAtom_l'] = ang_tor_test['ThirdAtom_l']/15.
#ang_tor_train['ThirdAtom_l'] = ang_tor_train['ThirdAtom_l']/15.

#ang_tor_test['SecondAtom_l'] = ang_tor_test['SecondAtom_l']/15.
#ang_tor_train['SecondAtom_l'] = ang_tor_train['SecondAtom_l']/15.


In [None]:
# Divide 'Angle' by 180 in both tables
# NOTE(review): presumably the angles are in degrees, giving a [0, 1] range -- confirm
ang_tor_test["Angle"] = ang_tor_test["Angle"].div(180)
ang_tor_train["Angle"] = ang_tor_train["Angle"].div(180)

In [None]:
# Divide 'Torsion' by 180 in both tables
# NOTE(review): presumably the torsions are in degrees -- confirm
ang_tor_test['Torsion'] = ang_tor_test['Torsion'].div(180)
ang_tor_train['Torsion'] = ang_tor_train['Torsion'].div(180)

In [None]:
# Angle/torsion columns that become edge features: the four numeric columns
# followed by the second-atom and third-atom indicators
# (the raw 'SecondAtom' / 'ThirdAtom' columns are not included).
ang_tor_vals = ['Angle', 'Torsion', 'cosT', 'cos2T', *S, *T]

In [None]:
# Attach the angle/torsion features to every test pair via its 'id'.
# Left join: pairs without an entry keep their row and get NaN features.
test = test.merge(ang_tor_test[['id'] + ang_tor_vals], how='left', on='id')

In [None]:
# Attach the angle/torsion features to every training pair via its 'id'.
# Left join: pairs without an entry keep their row and get NaN features.
train = train.merge(ang_tor_train[['id'] + ang_tor_vals], how='left', on='id')

## Process neighbour struct values

In [None]:
# Divide the neighbour columns (1, 6, 7, 8, 9) by 4
neig_struct[[1,6,7,8,9]] = neig_struct[[1,6,7,8,9]].div(4.0)

In [None]:
# Copy the scaled neighbour columns into the structures table as
# n_H, n_C, n_N, n_O, n_F (columns 1, 6, 7, 8, 9 in that order).
# NOTE(review): the values are matched on the row index, so this assumes
# neig_struct and structures share the same index / row order -- confirm.
structures[['n_H', 'n_C', 'n_N', 'n_O', 'n_F']] = neig_struct[[1,6,7,8,9]]

## Find training and testing molecules, and split structures into test and train. Then group by molecule


In [None]:
# Names of the molecules that occur in each split
train_mol_names = train['molecule_name'].unique()
test_mol_names  = test['molecule_name'].unique()

# Atom rows belonging to each split
train_structures = structures.loc[structures['molecule_name'].isin(train_mol_names)]
test_structures = structures.loc[structures['molecule_name'].isin(test_mol_names)]

# Group every table by molecule. The groups are later iterated side by side,
# so for each split the four groupings (pairs, structures, bonds, angles)
# have to contain the same molecules.
train_struct_group = train_structures.groupby('molecule_name')
test_struct_group  = test_structures.groupby('molecule_name')

train_group = train.groupby('molecule_name')
test_group  = test.groupby('molecule_name')

# NOTE(review): the bond tables are not filtered by molecule name --
# presumably each file already holds exactly the molecules of its split; confirm.
train_bond_group = train_bonds.groupby('molecule_name')
test_bond_group  = test_bonds.groupby('molecule_name')

train_angs = angs.loc[angs['molecule_name'].isin(train_mol_names)]
test_angs = angs.loc[angs['molecule_name'].isin(test_mol_names)]

train_angs_group = train_angs.groupby('molecule_name')
test_angs_group  = test_angs.groupby('molecule_name')

#train_angs_group = ang_tor_train.groupby('molecule_name')
#test_angs_group = ang_tor_test.groupby('molecule_name')


# Find max nodes in graph. Both splits are padded to this size, so take the
# largest molecule of EITHER split: sizing on the training set alone would
# make the padded arrays too small if a test molecule had more atoms.
max_size = max(train_struct_group.size().max(), test_struct_group.size().max())

## Define node and edge values

In [None]:
# --- Node features: element one-hot, Mulliken charge, scaled neighbour counts
node_vals = ['C', 'F' ,'H', 'N', 'O', 'mulliken_charge', 'n_H', 'n_C', 'n_N', 'n_O', 'n_F']

# --- Edge features (the distance is added separately, see add_edge_dim)
bond_vals = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']  # alternative: ['nbond']
j_coup_vals = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']  # alternative: 'l_type'
ang_vals = ['shortest_path_n_bonds','cosinus','dihedral']
#ang_vals = ['sp',  'Angle', 'Torsion', 'cosT', 'cos2T'] + ['ThirdAtom_l', 'SecondAtom_l']#+ T + S

# Only the LENGTH of edge_vals is used below. The channel order actually
# produced by make_arrs is j_coup_vals, ang_tor_vals, bond_vals, ang_vals,
# distance -- which differs from the order of this list.
edge_vals = [*j_coup_vals, *bond_vals, *ang_vals, *ang_tor_vals]

# --- Number of molecules per split
n_train_mols, n_test_mols = len(train_mol_names), len(test_mol_names)

# --- Feature dimensions
node_dim = len(node_vals)
bond_dim = len(bond_vals)
j_coup_dim = len(j_coup_vals)
ang_dim = len(ang_vals)
ang_tor_dim = len(ang_tor_vals)
edge_dim = len(edge_vals)

# One extra edge channel for the inter-atomic distance
add_edge_dim = 1

## Pre-allocate arrays that we will fill later


In [None]:
def _symmetric_fill(pairs, feats, n_feats):
    """Scatter per-pair feature rows into a zero-padded, symmetric edge tensor.

    pairs   : (n_pairs, 2) integer array of atom indices.
    feats   : (n_pairs, n_feats) array of values, one row per pair.
    n_feats : size of the last axis of the result.

    Returns a (max_size, max_size, n_feats) array in which both ``[a, b]``
    and ``[b, a]`` hold the row of the pair ``(a, b)``; everything else is 0.
    """
    filled = np.zeros((max_size, max_size, n_feats))
    first, second = pairs[:, 0], pairs[:, 1]
    filled[first, second] = feats
    # Mirror the entries: the graph is undirected
    filled[second, first] = filled[first, second]
    return filled


def make_arrs(val_group, struct_group, bond_group, ang_group, test):
    """Fill the pre-allocated node / edge arrays with one padded graph per molecule.

    The four group arguments are iterated in lockstep. Each must yield
    ``(molecule_name, DataFrame)`` pairs (e.g. a ``groupby('molecule_name')``)
    for the same molecules in the same order.

    Parameters
    ----------
    val_group    : coupling pairs; needs 'atom_index_0', 'atom_index_1',
                   the ``j_coup_vals`` and ``ang_tor_vals`` columns and
                   'scalar_coupling_constant'.
    struct_group : atoms; needs 'x', 'y', 'z' and the ``node_vals`` columns.
    bond_group   : bonds; needs 'atom_index_0', 'atom_index_1' and the
                   ``bond_vals`` columns.
    ang_group    : angles; needs 'atom_index_0', 'atom_index_1' and the
                   ``ang_vals`` columns.
    test         : if true the ``test_*_array`` buffers are filled,
                   otherwise the ``train_*_array`` buffers.

    Nothing is returned; row ``i`` of the selected module-level buffers
    receives the i-th molecule. The input-edge channels are laid out as
    ``j_coup_vals``, ``ang_tor_vals``, ``bond_vals``, ``ang_vals``, distance.
    The target (``scalar_coupling_constant``) is written for both splits.

    Raises
    ------
    ValueError
        If the four groups do not refer to the same molecule at some
        position. Without this check, features of different molecules
        would silently be combined into one graph.
    """
    # Pick the output buffers once instead of duplicating the fill code per branch
    if test:
        nodes_out = test_nodes_array
        in_edges_out = test_in_edges_array
        out_edges_out = test_out_edges_array
    else:
        nodes_out = train_nodes_array
        in_edges_out = train_in_edges_array
        out_edges_out = train_out_edges_array

    lockstep = zip(val_group, struct_group, bond_group, ang_group)
    for i, ((mol, values), (struct_mol, structs),
            (bond_mol, bonds), (ang_mol, angles)) in enumerate(lockstep):
        # Progress indicator
        if (not i%1000):
            print(i)

        # Guard against misaligned groupings before anything is written
        if not (mol == struct_mol == bond_mol == ang_mol):
            raise ValueError(
                "Molecule mismatch at position %d: values=%r, structures=%r, "
                "bonds=%r, angles=%r" % (i, mol, struct_mol, bond_mol, ang_mol)
            )

        # Pairwise distances between all atoms, zero-padded to max_size
        coords = structs[['x','y','z']].values
        dists = distance_matrix(coords, coords)
        distances = np.zeros((max_size, max_size, 1))
        distances[:dists.shape[0], :dists.shape[1], 0] = dists

        # Node features, zero-padded to max_size atoms
        mol_info = structs[node_vals].values
        nodes = np.zeros((max_size, node_dim))
        nodes[:mol_info.shape[0], :mol_info.shape[1]] = mol_info

        # Coupling-type and angle/torsion features of the coupled pairs
        ind = values[['atom_index_0', 'atom_index_1']].values
        in_feats = _symmetric_fill(
            ind, values[j_coup_vals + ang_tor_vals].values, j_coup_dim + ang_tor_dim)

        # Bond-order features of the bonded pairs
        ind_bonds = bonds[['atom_index_0', 'atom_index_1']].values
        in_bonds = _symmetric_fill(ind_bonds, bonds[bond_vals].values, bond_dim)

        # Angle features
        ind_angs = angles[['atom_index_0', 'atom_index_1']].values
        ang_mat = _symmetric_fill(ind_angs, angles[ang_vals].values, ang_dim)

        # Concatenate all edge channels; the distance goes last
        in_edges = np.concatenate((in_feats, in_bonds, ang_mat, distances), axis=2)

        # Targets; the double brackets keep a trailing axis of length 1
        out_edges = _symmetric_fill(ind, values[['scalar_coupling_constant']].values, 1)

        nodes_out[i] = nodes
        in_edges_out[i] = in_edges
        out_edges_out[i] = out_edges


In [None]:
# Zero-initialised float32 buffers for the training set, one slot per molecule:
#   nodes     : (molecule, atom, node feature)
#   in edges  : (molecule, atom, atom, edge feature + distance)
#   out edges : (molecule, atom, atom, 1) -- the target
train_nodes_array = np.zeros(
    shape=(n_train_mols, max_size, node_dim), dtype=np.float32)
train_in_edges_array = np.zeros(
    shape=(n_train_mols, max_size, max_size, edge_dim + add_edge_dim), dtype=np.float32)
train_out_edges_array = np.zeros(
    shape=(n_train_mols, max_size, max_size, 1), dtype=np.float32)



In [None]:
# Fill the training buffers
make_arrs(
    val_group=train_group,
    struct_group=train_struct_group,
    bond_group=train_bond_group,
    ang_group=train_angs_group,
    test=False,
)

In [None]:
# Write each training array to its own compressed .npz file
for npz_name, payload in (
    ("nodes_train.npz", train_nodes_array),
    ("in_edges_train.npz", train_in_edges_array),
    ("out_edges_train.npz", train_out_edges_array),
):
    np.savez_compressed(npz_name, payload)
# Drop the loop variables so that no extra reference keeps the last array alive
del npz_name, payload

In [None]:
# The training arrays are on disk now; drop the names before the test arrays are allocated
del train_nodes_array, train_in_edges_array, train_out_edges_array

In [None]:
# Zero-initialised float32 buffers for the test set, one slot per molecule:
#   nodes     : (molecule, atom, node feature)
#   in edges  : (molecule, atom, atom, edge feature + distance)
#   out edges : (molecule, atom, atom, 1) -- the (pseudo-label) target
test_nodes_array = np.zeros(
    shape=(n_test_mols, max_size, node_dim), dtype=np.float32)
test_in_edges_array = np.zeros(
    shape=(n_test_mols, max_size, max_size, edge_dim + add_edge_dim), dtype=np.float32)
test_out_edges_array = np.zeros(
    shape=(n_test_mols, max_size, max_size, 1), dtype=np.float32)


In [None]:
# Fill the test buffers
make_arrs(
    val_group=test_group,
    struct_group=test_struct_group,
    bond_group=test_bond_group,
    ang_group=test_angs_group,
    test=True,
)

## Save as numpy arrays

In [None]:
# Write each test array to its own compressed .npz file
for npz_name, payload in (
    ("nodes_test.npz", test_nodes_array),
    ("in_edges_test.npz", test_in_edges_array),
    ("out_edges_test.npz", test_out_edges_array),
):
    np.savez_compressed(npz_name, payload)
# Drop the loop variables so the notebook namespace stays as it was
del npz_name, payload