# Docking

Analysis of flexible docking calculations performed with GNINA using the Vina scoring function.

In [61]:
import pandas as pd
import numpy as np

## Data

In [29]:
# Relative path to the CSV file holding the RMSD results analysed in this notebook
rmsd_file = "../carlos_cd/rmsds.csv"

In [35]:
# Load the RMSD table from disk and display it
df = pd.read_csv(filepath_or_buffer=rmsd_file)
df

Unnamed: 0,pocket,protein,ligand,rank,rmsd,obrmsd,flexrmsd,flexobrmsd,fmaxrmsd,score
0,AA2AR,3EML,3QAK,0,2.22358,2.223580,0.64089,0.640886,1.09735,-9.13664
1,AA2AR,3EML,3QAK,10,3.25554,3.255540,1.92520,1.925200,3.30490,-12.32127
2,AA2AR,3EML,3QAK,11,2.51549,2.515490,2.09211,2.092110,3.74144,-12.21410
3,AA2AR,3EML,3QAK,12,6.50215,6.502150,1.49297,1.492970,3.03461,-12.17950
4,AA2AR,3EML,3QAK,13,10.29577,10.295800,1.20995,1.209950,2.57418,-12.05803
...,...,...,...,...,...,...,...,...,...,...
107254,XIAP,5C83,5C3K,0,0.64921,0.649212,0.87489,0.874893,1.61106,-6.30846
107255,XIAP,5C83,5C7B,0,0.65275,0.652747,0.30845,0.308446,0.44257,-6.27960
107256,XIAP,5C84,1TFQ,0,1.62995,1.629950,0.89805,0.898053,1.63567,-7.18336
107257,XIAP,5C84,4HY0,0,1.28057,1.280570,0.40234,0.402339,0.59178,-6.82557


In [36]:
# Build one (pocket, ligand, protein) tuple per row and keep the unique ones.
# Sanity check: the number of unique tuples must equal the total number of
# protein-ligand pairs.
allsystems = (
    df[["pocket", "ligand", "protein"]]
    .apply(tuple, axis="columns")
    .unique()
)
assert len(allsystems) == 7970

In [37]:
# Discard every row containing at least one NaN, then list the systems that survive
df_clean = df.dropna()
systems = (
    df_clean[["pocket", "ligand", "protein"]]
    .apply(tuple, axis="columns")
    .unique()
)
len(systems)

7921

In [41]:
# Systems present in the raw table but absent after removing rows with NaN
diff = set(allsystems).difference(systems)
diff

{('ACES', '1ACJ', '1JJB'),
 ('ACES', '1ZGB', '1JJB'),
 ('ACES', '2CMF', '1JJB'),
 ('CDK2', '3IG7', '3QQJ'),
 ('FA10', '2FZZ', '2XBV'),
 ('FA10', '2RA0', '1IQE'),
 ('FA10', '2Y82', '2XBV'),
 ('FA10', '3KQB', '1IQE'),
 ('HIVPR', '3O9H', '1W5V'),
 ('IGF1R', '1JQH', '5FXR'),
 ('IGF1R', '2OJ9', '5FXR'),
 ('IGF1R', '2ZM3', '5FXR'),
 ('IGF1R', '3LVP', '5FXR'),
 ('IGF1R', '3NW6', '5FXR'),
 ('IGF1R', '3NW7', '5FXR'),
 ('JAK2', '4D0W', '4F08'),
 ('JAK2', '4E4M', '4F08'),
 ('JAK2', '5CF6', '4F08'),
 ('KIF11', '1X88', '4BXN'),
 ('KIF11', '2IEH', '4BXN'),
 ('KIF11', '2X7D', '4BXN'),
 ('KIF11', '3K3B', '4BXN'),
 ('MK01', '4FV2', '4GSB'),
 ('MK01', '4ZZM', '4GSB'),
 ('MK01', '5LCJ', '4GSB'),
 ('MK01', '5NHV', '4GSB'),
 ('MK10', '2G01', '1UKI'),
 ('MK10', '3ELJ', '1UKI'),
 ('MK10', '3RTP', '1UKI'),
 ('MK10', '4L7F', '4HYS'),
 ('NRAM', '1A4G', '1B9V'),
 ('NRAM', '1A4G', '1VCJ'),
 ('NRAM', '1A4Q', '1B9V'),
 ('NRAM', '1A4Q', '1VCJ'),
 ('NRAM', '1B9S', '1B9V'),
 ('NRAM', '1B9T', '1B9V'),
 ('NRAM', '1INF',

In [55]:
def show_system(t):
    """Return the rows of the global ``df`` belonging to a single system.

    Parameters
    ----------
    t : sequence
        System identifier ordered as ``(pocket, ligand, protein)``; only the
        first three items are read.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose ``pocket``, ``ligand`` and ``protein`` columns
        match ``t[0]``, ``t[1]`` and ``t[2]`` respectively.
    """
    same_pocket = df["pocket"] == t[0]
    same_ligand = df["ligand"] == t[1]
    same_protein = df["protein"] == t[2]
    return df[same_pocket & same_ligand & same_protein]

In [56]:
# Inspect one of the systems that disappears when rows containing NaN are dropped
show_system(('CDK2', '3IG7', '3QQJ'))

Unnamed: 0,pocket,protein,ligand,rank,rmsd,obrmsd,flexrmsd,flexobrmsd,fmaxrmsd,score
29482,CDK2,3QQJ,3IG7,0,2.48215,2.48215,,,-1.0,-8.0795


In [57]:
# Here flexobrmsd is non-null while flexrmsd is NaN.
# For some (large) systems, the spyrmsd process is killed after a while,
# so use flexobrmsd only.
show_system(('NRAM', '1A4G', '1B9V'))

Unnamed: 0,pocket,protein,ligand,rank,rmsd,obrmsd,flexrmsd,flexobrmsd,fmaxrmsd,score
104493,NRAM,1B9V,1A4G,0,0.79267,0.792673,,0.681253,1.25573,-7.91753


In [54]:
# Clean up further by removing the flexrmsd column.
# It contains some spurious NaNs since spyrmsd sometimes gets killed (for very large systems).
# Using only the obrms result for the flexible side chain RMSD we get the same systems as
# in the GNINA 1.0 paper, as it should be.
df_clean = df.drop(columns=["flexrmsd"]).dropna()
# Recompute the surviving systems from the frame cleaned on the line above
systems = pd.unique(df_clean[["pocket", "ligand", "protein"]].apply(tuple, axis="columns"))
# Systems still missing after the cleanup
diff = set(allsystems) - set(systems)
diff

{('ACES', '1ACJ', '1JJB'),
 ('ACES', '1ZGB', '1JJB'),
 ('ACES', '2CMF', '1JJB'),
 ('CDK2', '3IG7', '3QQJ'),
 ('IGF1R', '1JQH', '5FXR'),
 ('IGF1R', '2OJ9', '5FXR'),
 ('IGF1R', '2ZM3', '5FXR'),
 ('IGF1R', '3LVP', '5FXR'),
 ('IGF1R', '3NW6', '5FXR'),
 ('IGF1R', '3NW7', '5FXR'),
 ('JAK2', '4D0W', '4F08'),
 ('JAK2', '4E4M', '4F08'),
 ('JAK2', '5CF6', '4F08'),
 ('MK01', '4FV2', '4GSB'),
 ('MK01', '4ZZM', '4GSB'),
 ('MK01', '5LCJ', '4GSB'),
 ('MK01', '5NHV', '4GSB'),
 ('MK10', '2G01', '1UKI'),
 ('MK10', '3ELJ', '1UKI'),
 ('MK10', '3RTP', '1UKI'),
 ('MK10', '4L7F', '4HYS'),
 ('SRC', '3DQX', '3UQG'),
 ('SRC', '5D10', '3UQG'),
 ('SRC', '5J5S', '3UQG')}

In [59]:
# System discarded from the GNINA 1.0 paper systems because of broken bonds:
# flexobrmsd is inf (while flexrmsd is NaN)
show_system(("KIF11","3K3B","4BXN"))

Unnamed: 0,pocket,protein,ligand,rank,rmsd,obrmsd,flexrmsd,flexobrmsd,fmaxrmsd,score
87499,KIF11,4BXN,3K3B,0,2.37879,2.37879,,inf,1.82213,-10.8373
87500,KIF11,4BXN,3K3B,10,2.60657,2.60657,,inf,3.69682,-11.24199
87501,KIF11,4BXN,3K3B,11,6.76467,6.76467,,inf,4.5841,-11.13477
87502,KIF11,4BXN,3K3B,12,4.63203,4.63203,,inf,4.06148,-11.12297
87503,KIF11,4BXN,3K3B,13,8.18852,8.18852,,inf,4.54174,-11.09078
87504,KIF11,4BXN,3K3B,14,8.14728,8.14728,,inf,4.17728,-11.08464
87505,KIF11,4BXN,3K3B,15,4.39169,4.39169,,inf,2.10791,-11.02752
87506,KIF11,4BXN,3K3B,16,3.43046,3.43046,,inf,3.2535,-10.98312
87507,KIF11,4BXN,3K3B,17,2.59429,2.59429,,inf,4.53968,-10.97631
87508,KIF11,4BXN,3K3B,18,1.88421,1.88421,,inf,3.83731,-10.90196


In [63]:
# Unique (pocket, ligand, protein) systems with at least one pose whose flexobrmsd is +inf
set(
    df_clean.loc[
        df_clean["flexobrmsd"].eq(np.inf),
        ["pocket", "ligand", "protein"],
    ]
    .apply(tuple, axis="columns")
    .unique()
)

{('FA10', '2FZZ', '2XBV'),
 ('FA10', '2RA0', '1IQE'),
 ('FA10', '2Y82', '2XBV'),
 ('FA10', '3KQB', '1IQE'),
 ('GCR', '3CLD', '3MNP'),
 ('KIF11', '1X88', '4BXN'),
 ('KIF11', '2IEH', '4BXN'),
 ('KIF11', '2X7D', '4BXN'),
 ('KIF11', '3K3B', '4BXN'),
 ('LKHA4', '3FTZ', '4RSY'),
 ('LKHA4', '3FUJ', '3FHE')}