## Filter top merges from Tim's three-hop enumeration

In [32]:
from rdkit import Chem
from rdkit.Chem import Crippen
import pandas as pd

In [33]:
# Open the scored top-10 poses SDF; records are read lazily on iteration.
poses_sdf_path = 'in_data/score_3_hop/ScoreGalaxyGen3Ligands-top10-xcos.sdf'
top_10_poses = Chem.SDMolSupplier(poses_sdf_path)

In [34]:
from datetime import datetime

# ISO-style (YYYY-MM-DD) stamp of the day this cell is run; read by getBlankMol.
date = format(datetime.today(), '%Y-%m-%d')

def getSDFprops(compound_mol, index):
    """Relabel a scored pose in place and strip it down to the output fields.

    The molecule's current ``_Name`` is copied to ``'original SMILES'``
    (elsewhere in this notebook ``_Name`` is compared against a SMILES
    string), the molecule is renamed ``Exp_<index>``, the receptor and
    XCos properties are copied to their output names, and every other
    property reported by ``GetPropsAsDict()`` is cleared.

    Args:
        compound_mol: Molecule carrying the ``_Name``, ``Receptor``,
            ``XCos_RefMols`` and ``XCos_Score1``..``XCos_Score3`` properties.
        index: Number used to build the new ``Exp_<index>`` name.

    Returns:
        The same ``compound_mol`` object, modified in place.
    """
    # Preserve the incoming title before it is overwritten below.
    compound_mol.SetProp('original SMILES', compound_mol.GetProp('_Name'))
    compound_mol.SetProp('_Name', 'Exp_{}'.format(index))

    # Reference structure path is derived from the receptor tag.
    receptor_path = 'receptor_pdbs/{}.pdb'.format(compound_mol.GetProp('Receptor'))
    compound_mol.SetProp('ref_pdb', receptor_path)

    # (output tag, source tag) pairs, copied in this order.
    renamed_tags = (
        ('ref_mols', 'XCos_RefMols'),
        ('Score_1', 'XCos_Score1'),
        ('Score_2', 'XCos_Score2'),
        ('Score_3', 'XCos_Score3'),
    )
    for new_tag, source_tag in renamed_tags:
        compound_mol.SetProp(new_tag, compound_mol.GetProp(source_tag))

    retained_tags = ('ref_pdb', 'ref_mols', 'original SMILES',
                     'Score_1', 'Score_2', 'Score_3')

    # Snapshot the tag names first, then drop everything not retained.
    for tag in list(compound_mol.GetPropsAsDict()):
        if tag not in retained_tags:
            compound_mol.ClearProp(tag)

    return compound_mol


def getBlankMol(blank_mol, COS_threshold, rad_threshold, generation_date=None):
    """Fill a placeholder molecule with the header properties for the upload file.

    The molecule is named ``ver_1.2`` and given submitter/method metadata
    plus free-text descriptions of what ``ref_mols`` and ``Score_1`` ..
    ``Score_3`` mean for the molecules written after it.

    Args:
        blank_mol: Molecule object to annotate; modified in place.
        COS_threshold: Score threshold quoted in the ``ref_mols`` description.
        rad_threshold: Distance threshold (angstrom) quoted in the
            ``Score_2`` and ``Score_3`` descriptions.
        generation_date: Optional string stored as ``generation_date``.
            When omitted, the notebook-level ``date`` string is used, which
            matches the previous behaviour.

    Returns:
        The same ``blank_mol`` object, modified in place.
    """
    if generation_date is None:
        # Backward-compatible default: the stamp computed at notebook level.
        generation_date = date

    header_props = (
        # Compulsory props
        ('_Name', 'ver_1.2'),
        ('ref_mols', 'Fragments that bits overlap with above a score threshold of {}.'.format(COS_threshold)),
        ('ref_url', 'https://github.com/Waztom/xchem-xCOS'),
        ('submitter_name', 'WT'),
        ('submitter_email', 'warren.thompson@diamond.ac.uk'),
        ('submitter_institution', 'Diamond Light Source'),
        ('generation_date', generation_date),
        ('method', 'xCOS'),
        # Scoring descriptors
        ('Score_1', 'The score is scaled by the number of heavy bit atoms'),
        ('Score_2', 'The score is scaled by the number of heavy bit atoms penalised by the fraction of feats matched the to total number feats clustered within a {} angstrom threshold'.format(rad_threshold)),
        ('Score_3', 'The score is determined by the fraction of matching features to the clustered features within a {} angstrom threshold.'.format(rad_threshold)),
    )

    # Properties are set in the order listed above.
    for prop_name, prop_value in header_props:
        blank_mol.SetProp(prop_name, prop_value)

    return blank_mol

In [35]:
# Keep only the poses whose XCos_Score2 (heavy-atom/feature score) reaches the
# cut-off below. Per the original note, the cut-off was read off in the ICM
# browser to keep roughly the top 100 compounds.
SCORE_2_CUTOFF = 1.642

# Get writer set up for writing final mols to file
w = Chem.SDWriter('out_data/score_3_hop/Top_10_poses_XCOS_heavyfeat_compounds.sdf')

# try/finally so the output file is closed even if a record fails mid-loop.
try:
    # Write in blank mol for RHS upload, carrying the props required by the
    # ver 1.2 spec.
    blank_mol = Chem.MolFromSmiles('C')
    blank_mol = getBlankMol(blank_mol, COS_threshold=0.5, rad_threshold=1.0)
    w.write(blank_mol)

    # Running number for the Exp_<n> names; only advances for poses that pass.
    index = 1

    for mol in top_10_poses:

        # The supplier yields None for records it could not parse; skip them.
        if mol is None:
            continue

        XCOS_heavyfeat = float(mol.GetProp('XCos_Score2'))

        if XCOS_heavyfeat >= SCORE_2_CUTOFF:

            # Sort out props
            mol = getSDFprops(mol, index)
            index += 1

            # Write to file
            w.write(mol)
finally:
    w.close()


# Stop here

In [None]:
# NOTE(review): this cell originally wrote to `w`, which the cell above had
# already closed, so it could not produce any output. It now opens its own
# writer. The output path is a placeholder - confirm the intended file name.
w = Chem.SDWriter('out_data/score_3_hop/Top_poses_XCOS_heavy_logP_filtered.sdf')
try:
    for mol in top_10_poses:
        # The supplier yields None for records it could not parse; skip them.
        if mol is None:
            continue
        XCOS_heavy_score = float(mol.GetProp('XCos_Score1'))
        crippen_log_P = Crippen.MolLogP(mol)
        # Keep poses with XCos_Score1 above 33.24 and Crippen logP below 5.
        if XCOS_heavy_score > 33.24 and crippen_log_P < 5:
            # Write to file
            w.write(mol)
finally:
    w.close()

In [None]:
# Write every pose of one selected compound (matched on its _Name) to its own
# SDF, so the different scores and conformers can be compared side by side.
w = Chem.SDWriter('out_data/score_3_hop/Top_pose_XCOS_heavy.sdf')
try:
    for mol in top_10_poses:
        # The supplier yields None for records it could not parse; skip them.
        if mol is None:
            continue
        name = mol.GetProp('_Name')
        if name == 'CCCN(CCNC(=O)[C@@H]1C[C@H]1C1CCC1)C(=O)c1cc(C)on1':
            # Write to file
            w.write(mol)
finally:
    # Close the file even if reading a record raises.
    w.close()

In [6]:
# Top-scoring compounds also have poses that score low, so rank compounds by
# the average of XCos_Score1 over their poses instead. Collect one
# (IsoSmiles, XCos_Score1) pair per pose; records the supplier could not
# parse come back as None and are skipped.
data = [
    (mol.GetProp('IsoSmiles'), float(mol.GetProp('XCos_Score1')))
    for mol in top_10_poses
    if mol is not None
]

In [7]:
# One row per pose: compound SMILES under 'Name', its score under 'XCOS_Score1'.
df = pd.DataFrame(
    data=data,
    columns=['Name', 'XCOS_Score1'],
)

In [8]:
# Preview the first rows of the per-pose table.
df.head()

Unnamed: 0,mol,Name,XCOS_Score1
0,<rdkit.Chem.rdchem.Mol object at 0x7f92c9e06c60>,C#C[C@@H](N)c1ccc(Cl)cc1Br,8.2849
1,<rdkit.Chem.rdchem.Mol object at 0x7f92c9e06b70>,C#C[C@H](NC(=O)[C@H]1[C@@H]2C[C@H]1CN2)C(C)C,12.1325
2,<rdkit.Chem.rdchem.Mol object at 0x7f92c9e06940>,C#C[C@H](NC(=O)[C@H]1[C@@H]2C[C@H]1CN2)C(C)C,12.3752
3,<rdkit.Chem.rdchem.Mol object at 0x7f92c9e062b0>,C#C[C@H](NC(=O)[C@@H]1[C@H]2CN[C@@H]1C2)C(C)C,10.6628
4,<rdkit.Chem.rdchem.Mol object at 0x7f92c9e06800>,C#C[C@H](NC(=O)[C@@H]1[C@H]2CN[C@@H]1C2)C(C)C,10.7523


In [14]:
# Collapse the per-pose table to one record per compound:
# (SMILES, number of poses, mean XCOS Score 1).
grouped_df = df.groupby('Name')

data_to_add = [
    (poses.Name.unique()[0], len(poses), poses.XCOS_Score1.mean())
    for _, poses in grouped_df
]

In [15]:
# Per-compound summary table built from the (name, pose count, mean score) tuples.
avg_df = pd.DataFrame(
    data=data_to_add,
    columns=['Name', 'No_poses', 'Avg_XCOS'],
)

In [16]:
# Preview the first rows of the per-compound averages.
avg_df.head()

Unnamed: 0,Name,No_poses,Avg_XCOS
0,BrCC(CBr)(CCCOCc1ccccc1)C1CCC1,5,22.33826
1,BrCC(CBr)(CCOCc1ccccc1)C1CCC1,5,23.03464
2,BrCC(CBr)(COCc1ccccc1)C1CCC1,5,22.04556
3,BrCCCCCN1[C@H]2CC[C@@H]1CC2,10,13.67159
4,BrCCCCN1[C@H]2CC[C@@H]1CC2,10,10.40736


In [18]:
# Order compounds from highest to lowest average score.
avg_df = avg_df.sort_values(by='Avg_XCOS', ascending=False)

In [22]:
# Get the first 500 rows of avg_df as the top scorers.
# NOTE(review): this comment previously said "100", but the code takes 500
# rows - confirm which number is intended.
top_scorers = avg_df.head(500)

In [23]:
# Display the selected top scorers.
top_scorers

Unnamed: 0,Name,No_poses,Avg_XCOS
13466,CCCC[C@@H](CNC(=O)c1cc(C)on1)NC(=O)[C@H]1C[C@@...,1,34.058200
13021,CCCCCCCCCCCC(=O)N/N=C(/CC)c1ccccc1O,1,33.248900
71765,Cc1cc(CCC(=O)N[C@H](CCNC(=O)[C@H]2C[C@H]2C)C(C...,1,33.071700
12622,CCC1(C(=O)N[C@@H](C)C[C@H](C)NCC(=O)Nc2cc(C)on...,1,32.801300
13458,CCCC[C@@H](CNC(=O)c1cc(C)on1)NC(=O)[C@@H]1C[C@...,1,32.796800
...,...,...,...
70035,Cc1cc(C(=O)NC[C@@](C)(NC(=O)CCc2cn[nH]c2)C2CC2...,4,28.603450
71571,Cc1cc(CCC(=O)N(C)CC[C@H](NC(=O)[C@@H]2C[C@H]2C...,5,28.602840
54771,C[C@@H]1CCCC[C@@H]1OCCN(C)CCCN(C)C(=O)c1ccccn1,1,28.599300
53166,C[C@@H](NCCCCCNC(=O)OCc1ccccc1)C1CCC1,3,28.598600


In [24]:
# Save the top scorers as CSV; the DataFrame index is written as the first column.
top_scorers.to_csv('out_data/score_3_hop/top_avg_scorers.csv', index=True)