## Import libraries

In [1]:
%load_ext autoreload

In [2]:
from rdkit.Chem import Descriptors
import pandas as pd
from utils_mmp import get_mms
%autoreload 2

In [3]:
import logging

# Install the default handler on the root logger (basicConfig does nothing if the
# root logger already has handlers), then restrict output to warnings and above.
root_logger = logging.getLogger()
logging.basicConfig()
root_logger.setLevel(logging.WARNING)

In [4]:
# NOTE(review): star imports hide where names come from. Later cells use `Chem`,
# `run_mmp`, `verify_series` and `verify_mmps` without importing them explicitly, so
# they are presumably provided by one of these two modules -- confirm and consider
# replacing the wildcards with explicit imports.
from ccrlib import *
from ccrlib.fragmentation import *
# `logger` is not defined in this notebook; presumably it is exported by the star
# imports above (TODO confirm). Only warnings and above are emitted from it.
logger.setLevel(logging.WARN)

## Load data

In [5]:
# Read the ChEMBL 33 pIC50 table and expose the SMILES column under the shorter
# name 'smiles', which is the name the later cells select on.
chembl_high_conf_tids = (
    pd.read_csv("./../dataset/chembl_33_pIC50.csv")
    .rename(columns={'nonstereo_aromatic_smiles': 'smiles'})
)
display(chembl_high_conf_tids)

Unnamed: 0,smiles,standard_type,pPot,cid,tid
0,Brc1cc2c(NCc3ccccc3)ncnc2s1,IC50,6.617983,CHEMBL3416599,203
1,Brc1cc2c(NCc3ccccn3)ncnc2s1,IC50,5.102153,CHEMBL3416616,203
2,Brc1cc2c(NCc3cccs3)ncnc2s1,IC50,5.862013,CHEMBL3416619,203
3,Brc1cc2c(NCc3ccncc3)ncnc2s1,IC50,5.410833,CHEMBL3416614,203
4,Brc1cc2c(Nc3ccccc3)ncnc2s1,IC50,7.096910,CHEMBL3416621,203
...,...,...,...,...,...
16524,c1csc(-c2n[nH]c3c2Cc2ccccc2-3)c1,IC50,6.031517,CHEMBL212899,279
16525,c1ncc(-c2cc3c(cn2)[nH]c2ncc(-c4ccc(CN5CCCCC5)c...,IC50,6.575118,CHEMBL3582232,220
16526,c1ncc(-c2cc3c(cn2)[nH]c2ncc(-c4ccc(CN5CCCCC5)c...,IC50,6.490797,CHEMBL3582223,220
16527,c1ncc(-c2cc3c(cn2)[nH]c2ncc(-c4ccc(CN5CCCCC5)c...,IC50,6.304518,CHEMBL3582224,220


In [7]:
# Number of rows per target id (`tid`), most frequent first.
chembl_high_conf_tids['tid'].value_counts()

tid
279     2475
325     1990
220     1898
203     1586
4005    1534
260     1495
1865    1494
2409    1410
284     1359
3717    1288
Name: count, dtype: int64

#### Relevant parameters

In [9]:
# Settings forwarded to `run_mmp` in the MMP generation cell.
# NOTE(review): the meaning of each value is defined by `run_mmp` (ccrlib), which is
# not visible in this notebook -- check its documentation before changing them.
cut_type = "synthesizable"
max_cuts = 1
min_rel_core_size = 0.666
max_frag_size = 13
max_time = 100


def mol_filter(x):
    """Return True when the molecule's weight (RDKit `Descriptors.MolWt`) is at most 1000."""
    return Descriptors.MolWt(x) <= 1000

## MMP Generation

In [10]:
# Forwarded to `run_mmp` in the generation cell below.
# Set to -1 to ignore
max_xch_difference = 8

### Generate MMPs and save

In [11]:
from pathlib import Path

# Run the MMP analysis once per target id (`tid`).
#
# For each target the compounds are read from a tab-separated file
# ./mmp_results/chembl_33_IC50_{tid}.tsv (SMILES in column 0, compound id in column 1).
# The original cell assumed those files already existed on disk (the line that wrote
# them was commented out), so it failed on a fresh checkout. The file is now written
# when it is missing; an existing file is left untouched and used as-is. Delete the
# file to force regeneration from `chembl_high_conf_tids`.
for tid in chembl_high_conf_tids.tid.unique():

    print(tid)
    # Rows for this target, restricted to the columns stored in the TSV.
    df_tid = chembl_high_conf_tids.loc[chembl_high_conf_tids.tid == tid, ['smiles', 'cid', 'tid']]

    tsv_file = f"./mmp_results/chembl_33_IC50_{tid}.tsv"
    if not Path(tsv_file).exists():
        Path(tsv_file).parent.mkdir(parents=True, exist_ok=True)
        # A header row is written; SmilesMolSupplier skips the first line by default
        # (titleLine=True), so the two stay consistent.
        df_tid.to_csv(tsv_file, sep='\t', columns=['smiles', 'cid', 'tid'], index=False)

    suppl = Chem.SmilesMolSupplier(tsv_file, delimiter='\t', smilesColumn=0, nameColumn=1)
    mmp_result = run_mmp(suppl, cut_type, max_cuts, min_rel_core_size,
                         max_frag_size, max_xch_difference, max_time, mol_filter, basename=None)

    # Both checks print the same label: the first line refers to the series (mms),
    # the second to the pairs (mmps).
    print("Verification " + ("okay" if verify_series(mmp_result.mms, mmp_result.smiles) else "failed"))
    print("Verification " + ("okay" if verify_mmps(mmp_result.mmps, mmp_result.smiles) else "failed"))

    # NOTE(review): `df_mms` is overwritten on every iteration and the save below is
    # disabled, so only the last target's table survives the loop.
    df_mms = get_mms(mmp_result)
    #df_mms.to_csv(f"./mmp_results/df_mmp_{tid}.csv", index=False)

203
# smiles: 1586
# duplicates: 0
# discarded molecules: 0
Cuts
# frames: 3488
# cuts:  4722
Time:  1.4952476024627686
FPS:  3158.00539805084
Raw MMS
# frames: 536
# cpds:  1770
MMPs
# MMPs: 3100
MMS
# frames: 355
# cpds:  1294
Verification okay
Verification okay
