## Sampling positive SMILES

In [8]:
import pandas as pd
import numpy as np

In [3]:
# Load the positive-ligand table; the file is tab-separated.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a saved index — confirm whether it should be read with index_col=0.
pos_tsv_path = 'MCHR1_patent_clean.tsv'
pos_csv = pd.read_csv(pos_tsv_path, delimiter='\t')

In [5]:
# Print the row count, then show the first rows as the cell's rich output.
n_rows = pos_csv.shape[0]
print(n_rows)
pos_csv.head(5)

753


Unnamed: 0,Structure,acname,activity
0,O=C(O)c1ccc(NCc2cccc(Br)c2)cc1,IC50,3.127962
1,Clc1ccc(CN(Cc2ccc(Cl)cc2)c2nn[nH]n2)cc1,IC50,4.226597
2,Clc1ccc(-n2c(SC/C=C/c3ccccc3)nnc2-c2cccnc2)cc1,IC50,4.775484
3,O=C1N/C(=C\c2ccc(CNCc3ccccc3)cc2)C(=O)N1c1ccc(...,Ki,5.298317
4,O=C(c1nnc(-c2ccccc2)o1)N1CC(Oc2ccc(CN3CC4(COC4...,IC50,11.417615


In [12]:
# Seed NumPy's global RNG immediately before sampling so the draw is repeatable,
# then take 50 rows and keep only the contents of their 'Structure' column.
np.random.seed(0)
sampled_rows = pos_csv.sample(n=50)
pos_smiles = sampled_rows["Structure"].tolist()

In [13]:
# Display the sampled 'Structure' values as this cell's output.
pos_smiles

['CN[C@H]1CCN(c2ccc(N3Cc4cn(-c5ccc(Cl)cc5)nc4C3=O)cn2)C1',
 'COc1cc(NC(=O)c2ccc(-c3ccccc3)cc2)ccc1OCCN1CCCC1',
 'Cc1c(C2CC2)nc2ccc(-n3ccc(OCc4ccc(Cl)cn4)cc3=O)cn12',
 'O=C1N/C(=C\\c2ccc(CNC3CCCC3)cc2)C(=O)N1c1ccc(Oc2ccccc2)cc1',
 'Cc1c2cc(-n3ccc(OCc4nc(C(F)(F)F)cs4)cc3=O)ccc2nn1C',
 'N#Cc1ccc(-c2ccc(C3(NCC(=O)Nc4ccc(F)c(Cl)c4)CCN(C4CCCC4)CC3)cc2)cc1',
 'Cc1ccc2c(c1)nc(C)n2C1CCC(NC2Cc3ccc(Cl)cc3C2)CC1',
 'N#Cc1cccc(-c2ccc3c(c2)CCN(CC(=O)Nc2ccc(Cl)cc2)C32CCN(CC3CC3)CC2)c1',
 'COc1cc(N2Cc3ccc(-c4ccc(Cl)cc4)nc3C2=O)ccc1OCCN1CCCC1',
 'O=C(NCCc1ccccn1)C1CCN(Cc2ccn(-c3ccc(C(F)(F)F)cc3)c2)CC1',
 'Cc1ccc(NC(=O)C(C)C)cc1C1CCN(CCCNC(=O)C(c2ccc(F)cc2)c2ccc(F)cc2)CC1',
 'O=C(COc1cccc(Cl)c1)NC1CCN(Cc2ccn(-c3ccc(C(F)(F)F)cn3)c2)CC1',
 'COc1cc(-n2cnc3cc(-c4ccc(Cl)cc4)sc3c2=O)ccc1OCC1(O)CC(F)(F)C1',
 'N#Cc1cccc(-c2ccc(C(=C3CCN(CC4CC4)CC3)c3nc4cc(Cl)c(Cl)cc4[nH]3)cc2)c1',
 'N#Cc1cccc(-c2ccc3c(c2)CCN(CC(=O)Nc2cc(Cl)cc(Cl)c2)C32CCNCC2)c1',
 'O=C1N/C(=C\\c2ccc(CN3CCN(c4ccccc4)CC3)cc2)C(=O)N1c1ccc(Oc2ccccc2

In [14]:
# Write the sampled structures to 'positives.smi', one entry per line.
with open('positives.smi', 'w') as smi_file:
    smi_file.writelines(smiles + '\n' for smiles in pos_smiles)

## Combine receptor and ligands to make the DiffDock input file

In [2]:
import pandas as pd

receptor_file = "mchr1/mchr1_data/mchr1_active/mchr1_active_2_0.79.pdb"
pos_ligands_file = "positives.smi"
neg_ligands_file = "negatives.smi"


def _read_ligand_lines(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace removed.

    Blank or whitespace-only lines are skipped so they do not end up as rows
    with an empty ``ligand_description`` in the output CSV.
    """
    with open(path) as f:
        stripped = (line.strip() for line in f.read().splitlines())
        return [line for line in stripped if line]


# Positives come first, then negatives; a ligand's position in this combined
# list is what becomes the numeric suffix of its complex name.
ligands = _read_ligand_lines(pos_ligands_file) + _read_ligand_lines(neg_ligands_file)

# One row per ligand, all paired with the same receptor file. The
# protein_sequence column is left empty (None).
rows = [
    (f"ligand_{lig_id}", receptor_file, ligand, None)
    for lig_id, ligand in enumerate(ligands)
]

data = pd.DataFrame(
    rows,
    columns=["complex_name", "protein_path", "ligand_description", "protein_sequence"],
)
data.to_csv("diffdock_input.csv", index=False)