Open this notebook in Google Colab: [link](https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/main/Course_Notes/Week8/convNet.ipynb)

In [None]:
# Install RDKit (PyPI package `rdkit-pypi`) into the runtime; the `rdkit.Chem` imports below need it.
!pip install rdkit-pypi

# CHEMINFORMATICS in the era of ML

[paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2523-5)

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
# rdkit stuff
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import PandasTools

import matplotlib
import matplotlib.pyplot as plt

In [None]:
# data from https://github.com/aspuru-guzik-group/chemical_vae
data_url = "https://github.com/aspuru-guzik-group/chemical_vae/raw/main/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv"
data_total = pd.read_csv(data_url)

# The full data set is large, so let's work with a random subset of N molecules.
# A fixed random_state makes the subset (and everything derived from it) the
# same on every "Restart & Run All".
N = 10000
data = data_total.sample(N, random_state=42)
# `data.head` without parentheses only prints the bound-method repr;
# call it to actually show the first rows.
print(data.head())

# ----------------------------------------------
# Extra dataset for Classification for Toxicity 
# be careful as the data set is UNBALANCED
# data_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz"
# df = pd.read_csv(data_url, compression='gzip', header=0,
                #  sep=',', quotechar='"', error_bad_lines=False)


In [None]:
# Compare the logP distribution of the full data set with the random subsample.
plt.figure(figsize=(8,8))
plt.hist(data_total.logP.to_numpy(),bins=1000,density=True,label='N = %s'%data_total.shape[0])
plt.hist(data.logP.to_numpy(),bins=200,density=True,label='N = %s'%data.shape[0])
plt.xlabel('logP')
# density=True normalises each histogram to unit area, so the y-axis shows a
# probability density rather than raw counts.
plt.ylabel('Density')
plt.legend()

In [None]:
def get_canonical_smiles(molec_smiles):
    """Return the canonical SMILES string for ``molec_smiles``.

    Args:
        molec_smiles: SMILES string to canonicalise.

    Returns:
        The string produced by ``MolToSmiles(..., canonical=True)``.

    Raises:
        ValueError: if ``MolFromSmiles`` returns ``None`` for the input,
            i.e. the string could not be turned into a molecule.
    """
    # The notebook imports the module as `from rdkit.Chem import AllChem as Chem`,
    # so the name in scope is `Chem`; the bare name `AllChem` is undefined.
    molecule = Chem.MolFromSmiles(molec_smiles)
    if molecule is None:
        raise ValueError('Could not parse SMILES string: %r' % (molec_smiles,))
    return Chem.MolToSmiles(molecule, canonical=True)

In [None]:
# Character length of every SMILES string in the sampled data
smiles_all = data.smiles.to_list()
length_smiles = np.array([len(s) for s in smiles_all])

# positions of the shortest and the longest string
i_min = length_smiles.argmin()
i_max = length_smiles.argmax()
print(i_min, i_max)
print('Smallest molecule (%s), %s' % (length_smiles[i_min], smiles_all[i_min]))
print('Largest molecule (%s), %s' % (length_smiles[i_max], smiles_all[i_max]))

# distribution of the string lengths
plt.figure(figsize=(10, 8))
plt.hist(length_smiles, bins=100)
plt.xlabel('Length of SMILES', fontsize=18)
plt.ylabel('Count')

In [None]:
# ALPHABET: define SMILES characters

max_len = 100
SMILES_CHARS = ["7", "6", "o", "]", "3", "s", "(", "-", "S", "/", "B", "4", "[", ")", "#", "I",
                "l", "O", "H", "c", "1", "@", "=", "n", "P", "8", "C", "2", "F", "5", "r", "N", "+", "\\", " "]
# index: character -> column of the one-hot matrix
smi2index = dict((c, i) for i, c in enumerate(SMILES_CHARS))


def smiles_to_one_hot(smiles, maxlen=max_len):
    """One-hot encode a SMILES string character by character.

    Newline characters are removed first. Row ``i`` of the result has a single
    1 in the column ``smi2index[smiles[i]]``; rows past the end of the string
    stay all-zero.

    Args:
        smiles: SMILES string to encode.
        maxlen: number of rows of the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(maxlen, len(SMILES_CHARS))``.

    Raises:
        ValueError: if the string (without newlines) is longer than ``maxlen``.
        KeyError: if the string contains a character not in ``SMILES_CHARS``.
    """
    smiles = smiles.replace('\n', '')
    # Fail with an explicit error instead of an IndexError from the array
    # write: an IndexError raised inside a Dataset's __getitem__ can be taken
    # as "end of sequence" by plain `for` iteration and silently drop data.
    if len(smiles) > maxlen:
        raise ValueError(
            'SMILES of length %d does not fit in maxlen=%d: %r'
            % (len(smiles), maxlen, smiles))

    X = np.zeros((maxlen, len(SMILES_CHARS)))  # (maxlen, dictionary)
    for i, c in enumerate(smiles):
        try:
            column = smi2index[c]
        except KeyError:
            raise KeyError(
                'character %r at position %d of %r is not in SMILES_CHARS'
                % (c, i, smiles)) from None
        X[i, column] = 1
    return X


# Worked example: one-hot encode caffeine and draw the resulting matrix
caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'
print(caffeine_smiles.split())

caffeine_one_hot = smiles_to_one_hot(caffeine_smiles)
print(caffeine_one_hot.shape)  # (max_len, len(SMILES_CHARS))

# copy of the string padded with spaces up to max_len characters
caffeine_smiles_pad = caffeine_smiles + " " * (max_len - len(caffeine_smiles))

# The matrix is transposed for plotting: each column is a position in the
# string, each row is a character of the alphabet.
plt.figure(figsize=(20, 20))
plt.imshow(caffeine_one_hot.T, cmap='binary')
plt.title('One-hot encoding for %s' % caffeine_smiles)

# label the first len(caffeine_smiles) columns with the characters themselves
plt.xticks(np.arange(len(caffeine_smiles)), list(caffeine_smiles), fontsize=8)
# label every row with its alphabet character
plt.yticks(np.arange(len(SMILES_CHARS)), list(SMILES_CHARS), fontsize=8)


## Data preprocessing



In [None]:
import torch
from torch import nn
import torch.functional as F
from torch.utils.data import Dataset, DataLoader


In [None]:
class SMILESDataset(Dataset):
    """Map-style dataset pairing SMILES strings with one label per molecule.

    Each item is a tuple ``(one_hot, label, smiles)`` where ``one_hot`` is a
    float32 tensor of shape ``(1, max_len, n_chars)`` built from the array
    returned by ``smiles_to_one_hot``.

    Args:
        SMILES_all: indexable collection of SMILES strings.
        SMILES_labels: indexable collection of labels, same length as
            ``SMILES_all``.
        flatten: stored on the instance; ``__getitem__`` does not currently
            use it.
        max_len: number of rows requested from ``smiles_to_one_hot``
            (defaults to 100, the value that used to be hard-coded).

    Raises:
        ValueError: if ``SMILES_all`` and ``SMILES_labels`` differ in length.
    """

    def __init__(self, SMILES_all, SMILES_labels, flatten=False, max_len=100):
        # A length mismatch would otherwise surface later as an IndexError or
        # as molecules that are silently never visited.
        if len(SMILES_all) != len(SMILES_labels):
            raise ValueError(
                'Got %d SMILES but %d labels'
                % (len(SMILES_all), len(SMILES_labels)))
        self.molecules_labels = SMILES_labels
        self.molecules_all = SMILES_all
        self.max_len = max_len
        self.flatten = flatten

    def __len__(self):
        """Number of (molecule, label) pairs."""
        return len(self.molecules_labels)

    def __getitem__(self, idx):
        """Return ``(one_hot, label, smiles)`` for position ``idx``."""
        molec = self.molecules_all[idx]
        label = self.molecules_labels[idx]

        # Convert straight to float32 (previously cast to double, then float)
        # and add a leading axis of size 1.
        one_hot = smiles_to_one_hot(molec, self.max_len)
        molec_one_hot = torch.tensor(one_hot, dtype=torch.float32).unsqueeze(0)

        return molec_one_hot, label, molec

In [None]:
from sklearn.model_selection import train_test_split
# load the data

# Pull the SMILES strings and their logP targets out of the sampled frame
smiles_all = data["smiles"].tolist()
logP_all = data["logP"].to_numpy()

# 75/25 split with shuffle=False, so rows are not reordered before splitting
X_train, X_test, y_train, y_test = train_test_split(smiles_all, logP_all, test_size=0.25, shuffle=False)


In [None]:
# Wrap the training split in a Dataset and iterate over it one molecule at a time.
molec_data_train = SMILESDataset(X_train, torch.tensor(y_train))
train_dataloader = DataLoader(molec_data_train, batch_size=1, shuffle=True)

# Unpack each batch directly instead of binding it to `data`: `data` is the
# sampled DataFrame that earlier cells read, and overwriting it here breaks
# those cells when they are re-run.
for xi, logP_i, smiles_i in train_dataloader:
    print(xi.shape, logP_i, smiles_i)