# Fingerprints

In [2]:
#Loading modules

import deepchem as dc
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Features
from rdkit.Chem import Fragments as fr
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs


import tensorflow as tf

import pandas as pd

from matplotlib import pyplot as plt

import numpy as np

from sklearn import metrics
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

## Load the data

In [3]:
# Lets load the data, i have preprocessed it, pslit it randomly and exported it in another notebook
#To make it easier to manipulate here

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('val.csv')

train

Unnamed: 0.1,Unnamed: 0,ids,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,...,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,pain,brenk,nih,tox_bin
0,56,CC1=C(CC(=O)O)c2cc(F)ccc2/C1=C\c1ccc(S(C)(=O)=...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1972,NCCOCCN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0
2,3411,OC[C@H](O)[C@H](O)CO,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1764,CCOC(=O)COc1ccc2c(c1)CC(NCC(O)c1cccc(Cl)c1)CC2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2383,COc1nc(C)nc(NC(=O)NS(=O)(=O)c2ccccc2Cl)n1,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,2928,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7928,2598,Clc1ccc(C(Cl)(Cl)Cl)cc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7929,832,O=C(NN=Cc1ccc([N+](=O)[O-])o1)c1ccc(O)cc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7930,3160,C[C@@H]1O[C@@H]1P(=O)([O-])[O-],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [24]:
train = train[['ids', 'tox_bin']]
test = test[['ids', 'tox_bin']]
val = val[['ids', 'tox_bin']]

In [25]:
from rdkit.Chem import AllChem

fp_gen = AllChem.GetRDKitFPGenerator()

In [26]:
train['fp'] = [fp_gen.GetFingerprint(Chem.MolFromSmiles(i)) for i in train['ids']]

In [27]:
test['fp'] = [fp_gen.GetFingerprint(Chem.MolFromSmiles(i)) for i in test['ids']]
val['fp'] = [fp_gen.GetFingerprint(Chem.MolFromSmiles(i)) for i in val['ids']]



In [28]:
train

Unnamed: 0,ids,tox_bin,fp
0,CC1=C(CC(=O)O)c2cc(F)ccc2/C1=C\c1ccc(S(C)(=O)=...,0,"[1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, ..."
1,NCCOCCN,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,OC[C@H](O)[C@H](O)CO,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CCOC(=O)COc1ccc2c(c1)CC(NCC(O)c1cccc(Cl)c1)CC2,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, ..."
4,COc1nc(C)nc(NC(=O)NS(=O)(=O)c2ccccc2Cl)n1,0,"[1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
7927,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
7928,Clc1ccc(C(Cl)(Cl)Cl)cc1,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7929,O=C(NN=Cc1ccc([N+](=O)[O-])o1)c1ccc(O)cc1,1,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
7930,C[C@@H]1O[C@@H]1P(=O)([O-])[O-],1,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [36]:
from IPython.display import clear_output

similarity_matrix = pd.DataFrame()

for i in range(len(train)):
    progress = i / len(train)
    print(f'{progress * 100:.2f}%')
    
    fingerprint = train['fp'][i]

    similarities = [DataStructs.FingerprintSimilarity(fingerprint, train['fp'][j]) for j in range(len(train))]
    similarity_matrix[i] = similarities
    clear_output(wait=False)

similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7922,7923,7924,7925,7926,7927,7928,7929,7930,7931
0,1.000000,0.014925,0.023715,0.309431,0.291921,0.002000,0.081652,0.122115,0.263325,0.026973,...,0.355454,0.187294,0.308834,0.137809,0.468119,0.272361,0.082061,0.213615,0.084191,0.019960
1,0.014925,1.000000,0.103448,0.029070,0.014025,0.000000,0.047619,0.020942,0.018182,0.191489,...,0.017699,0.026374,0.014103,0.032680,0.013582,0.018456,0.006211,0.017575,0.062500,0.250000
2,0.023715,0.103448,1.000000,0.034286,0.016506,0.000000,0.055901,0.014423,0.024311,0.107692,...,0.018610,0.029851,0.024112,0.031056,0.020904,0.033167,0.011364,0.020583,0.074074,0.178571
3,0.309431,0.029070,0.034286,1.000000,0.215290,0.007299,0.085333,0.135458,0.379459,0.028736,...,0.266206,0.158273,0.248709,0.166069,0.331342,0.346440,0.118367,0.212329,0.070796,0.031977
4,0.291921,0.014025,0.016506,0.215290,1.000000,0.002845,0.064185,0.084788,0.207671,0.016690,...,0.235392,0.184647,0.225753,0.135166,0.327948,0.187037,0.096986,0.188626,0.078849,0.015406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,0.272361,0.018456,0.033167,0.346440,0.187037,0.008562,0.048529,0.138973,0.295480,0.023295,...,0.249084,0.164586,0.234004,0.155878,0.280654,1.000000,0.097412,0.202960,0.064194,0.028716
7928,0.082061,0.006211,0.011364,0.118367,0.096986,0.006993,0.026820,0.152985,0.094453,0.011905,...,0.094048,0.062157,0.094317,0.080402,0.069024,0.097412,1.000000,0.096519,0.045139,0.018634
7929,0.213615,0.017575,0.020583,0.212329,0.188626,0.001786,0.058733,0.123839,0.222814,0.024433,...,0.210145,0.167251,0.205105,0.169890,0.268852,0.202960,0.096519,1.000000,0.054332,0.015734
7930,0.084191,0.062500,0.074074,0.070796,0.078849,0.000000,0.035336,0.050314,0.057423,0.042781,...,0.071510,0.050260,0.080139,0.048387,0.078815,0.064194,0.045139,0.054332,1.000000,0.086207


In [37]:
similarity_matrix['tox_bin'] = train['tox_bin']

## Dim reduction


In [40]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3)
similarity_matrix_reduced = tsne.fit_transform(similarity_matrix.drop('tox_bin', axis=1))
similarity_matrix_reduced


array([[ 10.919109 ,  12.239971 ,  -9.892627 ],
       [  1.3834331, -25.75448  ,   5.659273 ],
       [  5.422688 , -23.00139  ,   5.7479596],
       ...,
       [ -6.146349 ,  13.5853815,  -5.2129507],
       [ -3.3144705, -14.333077 ,   6.0430536],
       [  2.016398 , -27.9619   ,   4.837406 ]], dtype=float32)

In [44]:
similarity_matrix_reduced_df = pd.DataFrame(similarity_matrix_reduced)
similarity_matrix_reduced_df

Unnamed: 0,0,1,2
0,10.919109,12.239971,-9.892627
1,1.383433,-25.754480,5.659273
2,5.422688,-23.001390,5.747960
3,1.061262,18.470831,-16.010397
4,1.081329,12.266150,8.533067
...,...,...,...
7927,2.357128,8.091626,-21.637220
7928,-19.165728,-4.589986,-1.561778
7929,-6.146349,13.585382,-5.212951
7930,-3.314471,-14.333077,6.043054


## NN with normal similarity

In [61]:
adjacency_matrix =similarity_matrix.drop('tox_bin', axis=1).astype('float')

adjacency_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7922,7923,7924,7925,7926,7927,7928,7929,7930,7931
0,1.000000,0.014925,0.023715,0.309431,0.291921,0.002000,0.081652,0.122115,0.263325,0.026973,...,0.355454,0.187294,0.308834,0.137809,0.468119,0.272361,0.082061,0.213615,0.084191,0.019960
1,0.014925,1.000000,0.103448,0.029070,0.014025,0.000000,0.047619,0.020942,0.018182,0.191489,...,0.017699,0.026374,0.014103,0.032680,0.013582,0.018456,0.006211,0.017575,0.062500,0.250000
2,0.023715,0.103448,1.000000,0.034286,0.016506,0.000000,0.055901,0.014423,0.024311,0.107692,...,0.018610,0.029851,0.024112,0.031056,0.020904,0.033167,0.011364,0.020583,0.074074,0.178571
3,0.309431,0.029070,0.034286,1.000000,0.215290,0.007299,0.085333,0.135458,0.379459,0.028736,...,0.266206,0.158273,0.248709,0.166069,0.331342,0.346440,0.118367,0.212329,0.070796,0.031977
4,0.291921,0.014025,0.016506,0.215290,1.000000,0.002845,0.064185,0.084788,0.207671,0.016690,...,0.235392,0.184647,0.225753,0.135166,0.327948,0.187037,0.096986,0.188626,0.078849,0.015406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,0.272361,0.018456,0.033167,0.346440,0.187037,0.008562,0.048529,0.138973,0.295480,0.023295,...,0.249084,0.164586,0.234004,0.155878,0.280654,1.000000,0.097412,0.202960,0.064194,0.028716
7928,0.082061,0.006211,0.011364,0.118367,0.096986,0.006993,0.026820,0.152985,0.094453,0.011905,...,0.094048,0.062157,0.094317,0.080402,0.069024,0.097412,1.000000,0.096519,0.045139,0.018634
7929,0.213615,0.017575,0.020583,0.212329,0.188626,0.001786,0.058733,0.123839,0.222814,0.024433,...,0.210145,0.167251,0.205105,0.169890,0.268852,0.202960,0.096519,1.000000,0.054332,0.015734
7930,0.084191,0.062500,0.074074,0.070796,0.078849,0.000000,0.035336,0.050314,0.057423,0.042781,...,0.071510,0.050260,0.080139,0.048387,0.078815,0.064194,0.045139,0.054332,1.000000,0.086207


In [62]:
adjacency_matrix.shape

(7932, 7932)

In [85]:
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split


labels = similarity_matrix['tox_bin']



In [86]:
train_adj, test_adj, train_labels, test_labels = train_test_split(adjacency_matrix, labels, test_size=0.2, random_state=42)
train_adj, val_adj, train_labels, val_labels = train_test_split(train_adj, train_labels, test_size=0.2, random_state=42)

In [87]:
# Convert the arrays back to TensorFlow tensors
train_adjacency = tf.convert_to_tensor(train_adj, dtype=tf.float32)
val_adjacency = tf.convert_to_tensor(val_adj, dtype=tf.float32)
test_adjacency = tf.convert_to_tensor(test_adj, dtype=tf.float32)
train_labels = tf.convert_to_tensor(train_labels, dtype=tf.float32)
val_labels = tf.convert_to_tensor(val_labels, dtype=tf.float32)
test_labels = tf.convert_to_tensor(test_labels, dtype=tf.float32)

In [88]:
train_adjacency


<tf.Tensor: shape=(5076, 7932), dtype=float32, numpy=
array([[0.03996194, 0.11009175, 0.14166667, ..., 0.04153354, 0.07438017,
        0.10810811],
       [0.13869257, 0.02258065, 0.03738318, ..., 0.09702458, 0.05800464,
        0.03571429],
       [0.3400749 , 0.01743462, 0.02083333, ..., 0.20896861, 0.05869325,
        0.01612903],
       ...,
       [0.26838234, 0.03150685, 0.02670227, ..., 0.24418604, 0.07729468,
        0.02303523],
       [0.09763033, 0.05084746, 0.07446808, ..., 0.10477658, 0.08333334,
        0.07428572],
       [0.1122172 , 0.04897959, 0.07905138, ..., 0.08689655, 0.06738544,
        0.11158799]], dtype=float32)>

In [90]:
# Step 3: Define the GCNN architecture
class GCNN(tf.keras.Model):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNN, self).__init__()
        self.gc1 = tf.keras.layers.Dense(hidden_dim, activation='relu')
        self.gc2 = tf.keras.layers.Dense(output_dim)
        
    def call(self, inputs):
        x = self.gc1(inputs)
        x = self.gc2(x)
        return x


In [91]:
input_dim = adjacency_matrix.shape[1]  # Number of features in the similarity matrix
hidden_dim = 1000  # Number of hidden units in the GCNN
output_dim = 1  # Number of output units (toxic or not)


model = GCNN(input_dim, hidden_dim, output_dim)

In [92]:
# Step 4: Train the GCNN model
num_epochs = 100
batch_size = 32
learning_rate = 0.001

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.BinaryCrossentropy()

train_dataset = tf.data.Dataset.from_tensor_slices((train_adjacency, train_labels)).batch(batch_size)

for epoch in range(num_epochs):
    for batch_adjacency, batch_labels in train_dataset:
        with tf.GradientTape() as tape:
            logits = model(batch_adjacency)
            loss_value = loss_fn(batch_labels, logits)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [95]:
# Step 5: Evaluate the GCNN model
val_logits = model(val_adjacency)
val_predictions = tf.sigmoid(val_logits)
val_predictions_binary = tf.cast(val_predictions > 0.6, dtype=tf.float32)

accuracy = tf.reduce_mean(tf.cast(tf.equal(val_predictions_binary, val_labels), dtype=tf.float32))
print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.4838
