# Big Data London Demo 2020-06-09
---

This notebook contains the code used in the big data london demo 2020

#### Imports

In [44]:
from py2neo import Graph
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import rdDepictor
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from IPython.core.display import display, HTML
from rdkit.Chem import MACCSkeys
rdDepictor.SetPreferCoordGen(True)
IPythonConsole.ipython_useSVG = True

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

### 1) Connect to the graph

In [2]:
graph_1 = Graph(host="localhost", password='mmpkg')

### 2) Create your in-memory graphs

In [48]:
# This one is needed for compound similarity and fragment->compound degree
graph_1.run("""
            CALL gds.graph.create('fragsimgraph', ['Compound', 'Fragment'], 'IS_FRAGMENT_OF')
            """).data()

ClientError: ProcedureCallFailed: Failed to invoke procedure `gds.graph.create`: Caused by: java.lang.IllegalArgumentException: A graph with name 'fragsimgraph' already exists.

In [8]:
# This one we'll use for compound->fragment degree
graph_1.run("""
            CALL gds.graph.create('compound_degree_graph', ['Fragment', 'Compound'], 
            {IS_FRAGMENT_OF: {
                    type: 'IS_FRAGMENT_OF',
                    orientation: 'REVERSE'
            }})
            """).data()

[{'graphName': 'fragsimgraph_c',
  'nodeProjection': {'Fragment': {'properties': {}, 'label': 'Fragment'},
   'Compound': {'properties': {}, 'label': 'Compound'}},
  'relationshipProjection': {'IS_FRAGMENT_OF': {'orientation': 'REVERSE',
    'aggregation': 'DEFAULT',
    'type': 'IS_FRAGMENT_OF',
    'properties': {}}},
  'nodeCount': 3433,
  'relationshipCount': 5047,
  'createMillis': 23}]

In [47]:
# Fragment only graph with all edges - we'll use this for pagerank
graph_1.run("""
            CALL gds.graph.create('fraggraph', ['Fragment'],['IS_FRAGMENT_OF','MMP_RULE_ENVIRONMENT'],
            {relationshipProperties:['avg']})
            """).data()

ClientError: ProcedureCallFailed: Failed to invoke procedure `gds.graph.create`: Caused by: java.lang.IllegalArgumentException: A graph with name 'fraggraph' already exists.

In [65]:
#Fragment only graph with fragment-fragment edges only - we'll use this to calculate fragment-fragment degree
graph_1.run("""
            CALL gds.graph.create('fraggraph_2', ['Fragment'],
            {MMP_RULE_ENVIRONMENT: {
                        type: 'MMP_RULE_ENVIRONMENT',
                        orientation: 'UNDIRECTED',
                        relationshipProperties:['avg']}}
                                 )
            """).data()

[{'graphName': 'fraggraph_2',
  'nodeProjection': {'Fragment': {'properties': {}, 'label': 'Fragment'}},
  'relationshipProjection': {'MMP_RULE_ENVIRONMENT': {'orientation': 'UNDIRECTED',
    'aggregation': 'DEFAULT',
    'type': 'MMP_RULE_ENVIRONMENT',
    'properties': {}}},
  'nodeCount': 2468,
  'relationshipCount': 63358,
  'createMillis': 14}]

In [56]:
#Clean up
pd.DataFrame(graph_1.run("""
            CALL gds.graph.list()
            """).data())

Unnamed: 0,degreeDistribution,graphName,nodeCount,nodeProjection,nodeQuery,relationshipCount,relationshipProjection,relationshipQuery
0,"{'p99': 16, 'min': 0, 'max': 36, 'mean': 1.470...",fragsimgraph,3433,"{'Fragment': {'properties': {}, 'label': 'Frag...",,5047,"{'IS_FRAGMENT_OF': {'orientation': 'NATURAL', ...",
1,"{'p99': 114, 'min': 0, 'max': 677, 'mean': 12....",fraggraph_2,2468,"{'Fragment': {'properties': {}, 'label': 'Frag...",,31679,{'MMP_RULE_ENVIRONMENT': {'orientation': 'NATU...,
2,"{'p99': 114, 'min': 0, 'max': 677, 'mean': 12....",fraggraph,2468,"{'Fragment': {'properties': {}, 'label': 'Frag...",,31679,"{'IS_FRAGMENT_OF': {'orientation': 'NATURAL', ...",
3,"{'p99': 12, 'min': 0, 'max': 142, 'mean': 1.47...",fragsimgraph_c,3433,"{'Fragment': {'properties': {}, 'label': 'Frag...",,5047,"{'IS_FRAGMENT_OF': {'orientation': 'REVERSE', ...",


In [58]:
#Clean up shit
graph_1.run("""
            CALL gds.graph.drop('fraggraph_2')
            """).data()

[{'graphName': 'fraggraph_2',
  'nodeProjection': {'Fragment': {'properties': {}, 'label': 'Fragment'}},
  'relationshipProjection': {'MMP_RULE_ENVIRONMENT': {'orientation': 'NATURAL',
    'aggregation': 'DEFAULT',
    'type': 'MMP_RULE_ENVIRONMENT',
    'properties': {'avg': {'property': 'avg',
      'defaultValue': nan,
      'aggregation': 'DEFAULT'}}}},
  'nodeQuery': None,
  'relationshipQuery': None,
  'nodeCount': 2468,
  'relationshipCount': 31679,
  'degreeDistribution': {'p99': 114,
   'min': 0,
   'max': 677,
   'mean': 12.835899513776337,
   'p90': 35,
   'p50': 6,
   'p999': 201,
   'p95': 48,
   'p75': 15}}]

### 3a) Node Similarity
##### Find me compounds that are similar to each other by fragments

In [4]:
#Fetch data
node_sim_data = graph_1.run("""
            CALL gds.nodeSimilarity.stream('fragsimgraph')
            YIELD node1, node2, similarity
            RETURN gds.util.asNode(node1).smiles AS Compound1, 
                   gds.util.asNode(node2).smiles AS Compound2, 
                   gds.util.asNode(node1).compoundid as Compound_id1,
                   gds.util.asNode(node2).compoundid as Compound_id2,
                   similarity
            ORDER BY similarity DESCENDING, Compound1, Compound2
            """).data()
#Convert to pandas
node_sim_df = pd.DataFrame(node_sim_data)
#Add RDKIT molecules
PandasTools.AddMoleculeColumnToFrame(node_sim_df, 'Compound1','Compound1')
PandasTools.AddMoleculeColumnToFrame(node_sim_df, 'Compound2','Compound2')

In [5]:
node_sim_df.head()

Unnamed: 0,Compound1,Compound2,Compound_id1,Compound_id2,similarity
0,,,929,950,1.0
1,,,950,929,1.0
2,,,871,856,1.0
3,,,871,869,1.0
4,,,578,580,1.0


In [43]:
#Write
graph_1.run("""
            CALL gds.nodeSimilarity.write('fragsimgraph',
                {
                writeRelationshipType: 'SIMILAR_BY_FRAGMENT',
                writeProperty: 'similarity_score'
                })
                YIELD nodesCompared, relationshipsWritten
                """).data()

[{'nodesCompared': 965, 'relationshipsWritten': 6325}]

### 3b) Degree - Number of fragments per compound node

In [32]:
# Number of nodes per fragment node
data = graph_1.run("""
                CALL gds.alpha.degree.write('fragsimgraph',{writeProperty: 'fragment_degree'})
                YIELD nodes, writeProperty
                """).data()

In [31]:
# Number of fragments per compound node
data = graph_1.run("""
                CALL gds.alpha.degree.write('compound_degree_graph',{writeProperty: 'compound_degree'})
                YIELD nodes, writeProperty
                """).data()

In [66]:
# Number of fragments per fragment node
data = graph_1.run("""
                CALL gds.alpha.degree.write('fraggraph_2',{writeProperty: 'frag_frag_degree'})
                YIELD nodes, writeProperty
                """).data()

### 3c) Let's compare to other similarity metrics

In [6]:
#Fetch data
node_sim_data = graph_1.run("""MATCH (n:Compound)-[m]-(p)
                               WHERE n.fragment_degree>=15
                               AND m.similarity_score IS NOT null
                               AND startnode(m) = n
                               RETURN
                               DISTINCT n.compoundid AS cid1, 
                                        n.smiles AS Compound1, 
                                        n.fragment_degree,
                                        p.compoundid AS cid2,
                                        p.smiles AS Compound2,
                                        p.fragment_degree,
                                        m.similarity_score
                              ORDER BY m.similarity_score DESC""").data()
#Convert to pandas
node_sim_df = pd.DataFrame(node_sim_data)

In [7]:
#Add RDKIT molecules
PandasTools.AddMoleculeColumnToFrame(node_sim_df, 'Compound1','Compound1')
PandasTools.AddMoleculeColumnToFrame(node_sim_df, 'Compound2','Compound2')
#Calculate Morgan fingerprints for comparison
fps_c1 = [AllChem.GetMorganFingerprintAsBitVect(x,radius=2, nBits=1024) for x in node_sim_df['Compound1']]
fps_c2 = [AllChem.GetMorganFingerprintAsBitVect(x,radius=2, nBits=1024) for x in node_sim_df['Compound2']]
node_sim_df['morgan_1024_r2_sim'] = [DataStructs.FingerprintSimilarity(x, y, metric=DataStructs.TanimotoSimilarity) for x, y in zip(fps_c1, fps_c2)]
#Calculate MACCS fingerprint similarity for comparison
maccs1 = [MACCSkeys.GenMACCSKeys(x) for x in node_sim_df['Compound1'].tolist()]
maccs2 = [MACCSkeys.GenMACCSKeys(x) for x in node_sim_df['Compound2'].tolist()]
node_sim_df['maccs_sim'] = [DataStructs.FingerprintSimilarity(x, y, metric=DataStructs.TanimotoSimilarity) for x, y in zip(maccs1, maccs2)]

In [None]:
node_sim_df.sort_values(by=['m.similarity_score'], ascending=False).head(10)

### 4a) Pagerank
##### Find me fragments which are important, based on their place in the fragment network, weighted by avg MMP change - maybe these are the ones that get changed the most frequently?

In [19]:
pagerank_w = graph_1.run("""
                CALL gds.pageRank.write('fraggraph', {relationshipWeightProperty:'avg', writeProperty:'fragment_pageRank'})
                """).data()

In [20]:
pagerank_uw = graph_1.run("""
                CALL gds.pageRank.write('fraggraph', {writeProperty:'fragment_pageRank_unweighted'})
                """).data()

In [67]:
page_data = graph_1.run("""
                MATCH (n:Fragment)
                WHERE EXISTS(n.fragment_pageRank)
                RETURN n.smiles as smiles, n.fragment_pageRank as fragment_pageRank, n.fragment_pageRank_unweighted, n.compound_degree, n.frag_frag_degree,
                       n.unique_fragment_count
                ORDER by n.fragment_pageRank DESC
                """).data()

In [68]:
#Convert to pandas
page_df = pd.DataFrame(page_data)
#Add RDKIT molecules
PandasTools.AddMoleculeColumnToFrame(page_df, 'smiles','Fragment')

In [69]:
page_df.head()

Unnamed: 0,fragment_pageRank,n.compound_degree,n.frag_frag_degree,n.fragment_pageRank_unweighted,n.unique_fragment_count,smiles,Fragment
0,4.114433,37.0,458.0,3.780301,,[*:1]c1ccccc1,
1,3.499296,142.0,815.0,7.118977,,[*:1][H],
2,2.378116,9.0,254.0,2.102223,,[*:1]NC(C)=O,
3,2.195371,1.0,48.0,0.635886,,[*:1]c1ccccc1N(C)C,
4,2.032635,3.0,36.0,0.329875,,[*:1]NS(C)(=O)=O,


### 5) Link prediction

* It's not useful to predict links between compounds here... i.e. which compounds will be MMP of each other... compounds are either MMP or they are not...

* Instead what is more useful is to predict whether FRAGMENTS will be related (i.e. will changing one fragment for another change a property I am interested in?)

##### We can use Adamic Agar for this
* Adamic Agar is a measure used to compute the closeness of nodes based on their shared neighbors.
* This will tell us the likelyhood of a relationship between two fragments - "if you have this in your molecule have you tried this?"
* What it does not tell us is the size of that property or even what property... to do this we'll need more advanced algos that can account for more features of nodes and relationships.

In [49]:
#Fetch data
aa_data = graph_1.run("""MATCH (f1:Fragment), (f2:Fragment)
                               WHERE NOT (f1)-[:MMP_RULE_ENVIRONMENT]-(f2)
                               AND f1.compound_degree > 10
                               AND f2.compound_degree > 10
                               RETURN gds.alpha.linkprediction.adamicAdar(f1, f2) AS score, 
                                      f1.smiles, 
                                      f2.smiles, 
                                      f1.compound_degree
                               ORDER BY score DESC""").data()
#Convert to pandas
aa_df = pd.DataFrame(aa_data)
#Add RDKIT molecules
PandasTools.AddMoleculeColumnToFrame(aa_df, 'f1.smiles','Fragment1')
PandasTools.AddMoleculeColumnToFrame(aa_df, 'f2.smiles','Fragment2')

In [50]:
aa_df.head(20)

Unnamed: 0,f1.compound_degree,f1.smiles,f2.smiles,score,Fragment1,Fragment2
0,20.0,[*:1]C([*:2])C,[*:1]C,4.857417,,
1,136.0,[*:1]C,[*:1]C([*:2])C,4.857417,,
2,23.0,[*:1]O[*:2],[*:1]OC,4.496131,,
3,50.0,[*:1]OC,[*:1]O[*:2],4.496131,,
4,37.0,[*:1]CC,[*:1]OCC,4.065172,,
5,14.0,[*:1]OCC,[*:1]CC,4.065172,,
6,14.0,[*:1]Nc1ccc(F)cc1F,[*:1]c1ccc(F)cc1F,3.886339,,
7,15.0,[*:1]c1ccc(F)cc1F,[*:1]Nc1ccc(F)cc1F,3.886339,,
8,64.0,[*:1]F,[*:1]c1ccc(F)cc1,3.360184,,
9,11.0,[*:1]c1ccc(F)cc1,[*:1]F,3.360184,,


### 6) Louvain

### x) Plotting

In [130]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from sklearn.linear_model import LinearRegression
from bokeh.models import Slope
output_notebook()

In [131]:
node_sim_df2 = node_sim_df[['m.similarity_score','maccs_sim','cid1','cid2']]

In [133]:
p = figure()
p.circle(x='m.similarity_score', y='maccs_sim',
         source=node_sim_df2,
         size=5, color='blue')
p.xaxis.axis_label = "node similarity"
p.yaxis.axis_label = "maccs similarity"
hover = HoverTool()
hover.tooltips=[
    ('molecule1', '@cid1'),
    ('molecule2', '@cid2'),
    ('node_sim','@m.similarity_score'),
    ('maccs_sim','@maccs_sim'),
#    ('fragment_pageRank', '@fragment_pageRank')
]

p.add_tools(hover)

show(p)