In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from pathlib import Path
import os

def change_directory_to_repo():
    """Changes working directory to the repository root folder."""
    current_dir = Path.cwd()
    for parent in current_dir.parents:
        # Repository is the first folder with the .git folder
        files = list(parent.glob(".git"))
        if files:
            os.chdir(str(parent))

change_directory_to_repo()

In [16]:
smiles = pd.read_csv('data/raw/smiles.tsv.gz', compression='gzip', sep='\t')
smiles.head()

Unnamed: 0,molregno,canonical_smiles
0,1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,2,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,3,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,4,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,5,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


In [6]:
print('Max molregno', smiles['molregno'].max())
print('Total rows', smiles.shape[0])

Max molregno 2341634
Total rows 1941411


In [7]:
activity = pd.read_csv('data/raw/activities.tsv.gz', compression='gzip', sep='\t')
activity.head()

Unnamed: 0,activity_id,assay_id,molregno,pchembl_value
0,928556,292,9988,7.19
1,975432,1048,133063,5.36
2,1258200,2257,238731,
3,687341,2259,220767,8.44
4,696607,2259,2214,8.09


In [8]:
id_with_affinity = set(activity['molregno'].to_list())

In [9]:
all_id = set(smiles['molregno'].to_list())

In [10]:
print(f'Known affinity for {len(id_with_affinity)} molecules')
print(f'Known smiles for {len(all_id)} molecules')

smiles_with_affinity = all_id.intersection(id_with_affinity)
print(f'Known both affinity and smiles for {len(smiles_with_affinity)}')

Known affinity for 6524 molecules
Known smiles for 1941411 molecules
Known both affinity and smiles for 6492


In [11]:
(activity['molregno'].value_counts() > 1).sum()

2126

In [12]:
activity['molregno'].value_counts()

2214      46
2261      37
3683      34
7714      21
34197     20
          ..
296630     1
296837     1
296817     1
296660     1
9988       1
Name: molregno, Length: 6524, dtype: int64

In [28]:
activity.iloc[1382]['pchembl_value']

nan

In [31]:
filtered_smiles = []
filtered_affinity = []

for molregno in smiles_with_affinity:
    new_smiles = smiles[smiles['molregno'] == molregno]['canonical_smiles'].iloc[0]
    filtered_smiles.append(new_smiles)

    all_affinities = activity[activity['molregno'] == molregno]['pchembl_value']
    if all_affinities.shape[0] > 1:
        new_affinity = all_affinities.mean(skipna=True) > 8.0
    else:
        new_affinity = all_affinities.iloc[0]
        if np.isnan(new_affinity):
            new_affinity = False
        else:
            new_affinity = new_affinity > 8.0
    filtered_affinity.append(int(new_affinity))

In [35]:
filtered_data = pd.DataFrame({
    'smiles': filtered_smiles,
    'filtered_affinity': filtered_affinity
})
filtered_data

Unnamed: 0,smiles,filtered_affinity
0,CC(=O)OC12C=C(C)C(C(C)C1)C1C(=O)N(CCCN3CCN(c4c...,0
1,COc1ccccc1N1CCN(CCCN2C(=O)C3C4C(=O)CC(C)(CC4C)...,0
2,CC1CC2(C)CC(=O)C1C1C(=O)N(CCCN3CCN(c4ccccc4)CC...,0
3,COc1ccccc1N1CCN(CCCON2C(=O)C3C4C(C)=CC(OC(C)=O...,0
4,COc1ccc(Cl)cc1S(=O)(=O)n1cc(/C=N/N=C/N(C)C)c2c...,0
...,...,...
6487,CC1CC2(C)CC(=O)C1C1C(=O)N(CCCCN3CCN(c4ccc(F)cc...,0
6488,CC1CC2(C)CC(=O)C1C1C(=O)N(CCCCN3CCN(Cc4ccccc4)...,0
6489,COc1ccccc1N1CCN(CCCN2C(=O)C3C4C(C)=CC(OC(C)=O)...,0
6490,CC(=O)c1c(OCCCN2CCN(c3ccccc3C#N)CC2)ccc2c(C)cc...,0


In [36]:
filtered_data.to_csv('data/preprocessed/filtered_data.csv')

In [45]:
data = pd.read_csv('data/preprocessed/filtered_data.csv', index_col=0)

from utils import utils
rng = utils.set_deafult_seed()

from sklearn.model_selection import train_test_split
train_and_val, test = train_test_split(data, test_size=0.2, random_state=rng)

train, val = train_test_split(train_and_val, test_size=0.1, random_state=rng)




Global seed set to 228


In [46]:
train

Unnamed: 0,smiles,filtered_affinity
1976,COc1cc(OC)c2c(c1CO)C[C@H]1c3c(cc(OC)c(OC)c3OC)...,0
1034,CN(C)CCSC(C)(C)C,0
6303,COc1cccc(NC(=O)NCCCN2CCC(Cc3ccccc3)CC2)c1,0
1932,CC(C)(O)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3nccc4c3C...,1
3707,Cc1cccc(N2CCN(CCCCNC(=O)c3cc4ccccc4o3)CC2)n1,0
...,...,...
3234,CCCc1nn(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)...,0
4910,Cc1ccc(Sc2ccccc2N2CCN(CCc3c(C)nc4n(c3=O)CCCC4O...,0
6099,COc1cccc(NC(=O)NCCCN2CCC(Cc3ccc(F)cc3)CC2)c1,0
1403,CCNC(=O)Nc1cccc(CCN2CCN(c3cccc4nc(C)ccc34)CC2)c1,0


In [47]:
test

Unnamed: 0,smiles,filtered_affinity
1217,CCN1CCN(c2cc(-c3ccc(F)cc3)c3c(n2)CC[C@H](O)CCC...,1
553,CCCCNC(=O)N1OCC2CSc3ccc(F)cc3C21,0
2777,C[C@@H]1CNC[C@@H]2Cc3ccc(Cl)nc3N21,0
2724,CCn1oc2c(c1=O)CCN(C)CC2,0
856,OC12C3C4CC5C6C4C1C6C(C53)N2Cc1ccccn1,0
...,...,...
3017,O=CN[C@H]1CC[C@H](CCN2CCC(c3cccc4c3CCO4)CC2)CC1,0
4558,CN(C1CCN(CCOc2ccccc2C(C)(C)C)CC1)S(=O)(=O)c1cc...,0
237,CC(C)n1nc(C(=O)NCC2CCN(CCc3ccc(N)cc3)CC2)c2ccc...,0
4745,CN1CCC(N(C)S(=O)(=O)c2cccc3ccccc23)CC1,0


In [48]:
val

Unnamed: 0,smiles,filtered_affinity
6223,O=C(N[C@H]1CC[C@H](CCN2CCC(c3cccc4c3OCO4)CC2)C...,0
5823,CC(=O)c1c(OCCCN2CCN(c3cc(C)cc(C)c3)CC2)ccc2c(C...,0
591,CC(C)NCCCc1cccc2[nH]cc(S(=O)(=O)c3ccccc3)c12,0
462,COc1ccccc1OCC(O)CO,0
3673,CCc1cc(Cl)c2c(c1)[C@H]1CNC[C@@H]1NC2=O.Cl,0
...,...,...
5414,CCc1cccc2c1C(=O)N1CCNC[C@H]1C2,0
409,COc1ccc(N2Cc3cccc(C(F)(F)F)c3C2=O)cc1OCCN1CCC(...,0
1108,CCCN1CCC(c2c[nH]c3cc(F)ccc23)CC1,0
4418,O=C1CCc2ccc(OCCCCN3CCN(c4cccc(Cl)c4Cl)CC3)cc2N1,0


In [49]:
train.to_csv('data/preprocessed/train.csv')
test.to_csv('data/preprocessed/test.csv')
val.to_csv('data/preprocessed/val.csv')

In [63]:
from rdkit import Chem

# molecule = Chem.MolFromSmiles(smiles)

def get_topological_fingerprint(molecule: Chem.rdchem.Mol):
    return Chem.RDKFingerprint(molecule).ToList()

result = get_topological_fingerprint(Chem.MolFromSmiles('CCc1cccc2c1C(=O)N1CCNC[C@H]1C2	'))
    



In [86]:
def convert_dataset(dataset: pd.DataFrame):
    features = []
    affinity = []

    bad_smiles = []

    for row in dataset.itertuples():
        try:
            molecule = Chem.MolFromSmiles(row[1])
            print(row[1])
            fingerprint = get_topological_fingerprint(molecule)
            features.append(fingerprint)
            affinity.append(row[2])
        except IndexError:
            bad_smiles.append(row[1])
    
    return features, affinity, bad_smiles

In [87]:
features, affinity, bad = convert_dataset(train)

COc1cc(OC)c2c(c1CO)C[C@H]1c3c(cc(OC)c(OC)c3OC)CCN1C2
CN(C)CCSC(C)(C)C
COc1cccc(NC(=O)NCCCN2CCC(Cc3ccccc3)CC2)c1
CC(C)(O)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3nccc4c3CCO4)CC2)CC1
Cc1cccc(N2CCN(CCCCNC(=O)c3cc4ccccc4o3)CC2)n1
C[C@@H]1SC(c2ccccc2)=N[C@@H]1Cc1c[nH]cn1
COc1cc(C[C@@H](C)N)c(OC)cc1Br
CO[C@H](C)C(=O)N[C@H]1CC[C@H](CCN2CCN(c3nccc4c3CCO4)CC2)CC1
Nc1cccc(-c2ccc(CCN3CCN(c4cccc5cccnc45)CC3)cc2)n1
Nc1c(Br)cc(Br)cc1CN[C@H]1CC[C@H](O)CC1
CCCc1cc2c(cc1Cl)N(C(=O)Nc1cccnc1)CC2
O=S(=O)(NCCN1CCC(c2noc3cc(F)ccc23)CC1)c1ccc2ccccc2c1
CCCc1cc2c(c(C(F)(F)F)c1)C(=O)N1CCNC[C@@H]21
Cl.NC[C@@H]1C[C@H]1c1ccccc1
O=C(NCCCCN1CCN(c2ccccc2F)CC1)c1cc2ccccc2o1
COc1ccc(N2CCN(c3ccccc3Cl)C2=O)cc1OCCN1CCCCC1
CCc1cc(Cl)c2c(c1)[C@@H]1CNCCN1C2=O
CN(C1CCN(CCOc2ccccc2-c2ccccc2)CC1)S(=O)(=O)c1ccccc1
O=[N+]([O-])c1ccc2c(c1)C(O)C(CCN1CCC(c3noc4cc(F)ccc34)CC1)CC2
O=Cc1csc(-c2ccccc2O)n1
COc1ccc(CCN(CCCc2ccccc2Br)CCc2ccc3c(c2)OCO3)cc1OC
c1ccc(OCc2cc(OC[C@@H]3CCCN3)no2)cc1
c1ccc(-c2[nH]c3ccccc3c2CCN2CCCCC2)cc1
COc1cccc(CCCN(CCc2cc

In [91]:
all_features = np.row_stack(features)

In [92]:
all_features.shape

(4672, 2048)

In [97]:
np.array(affinity)

array([0, 0, 0, ..., 0, 0, 0])

In [98]:
train = np.load('data/preprocessed/train.npz')

In [99]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(subsample=0.1)

In [100]:
gb.fit(train['X'], train['y'])

GradientBoostingClassifier(subsample=0.1)

In [101]:
gb.score(train['X'], train['y'])

0.9304366438356164

In [102]:
val = np.load('data/preprocessed/val.npz')

In [103]:
gb.score(val['X'], val['y'])

0.9115384615384615