In [6]:
import os

import pandas as pd
from neo4j import GraphDatabase
from py2neo import Graph, GraphService

In [9]:
# Database connections.
# Passwords are read from the environment so that no secret is stored in the
# notebook. Export NEO4J_STAGING_PASSWORD before starting the kernel;
# NEO4J_STAGING_URI and NEO4J_STAGING_USER are optional overrides of the defaults.
# NOTE(review): earlier revisions of this cell contained plaintext passwords;
# those credentials should be rotated.
graph_staging = Graph(
    os.environ.get('NEO4J_STAGING_URI', 'bolt://34.67.212.125:7687'),
    auth=(
        os.environ.get('NEO4J_STAGING_USER', 'neo4j'),
        # Raises KeyError when unset: fail fast instead of connecting with a wrong password.
        os.environ['NEO4J_STAGING_PASSWORD'],
    ),
)
# Alternative targets (uncomment as needed); their passwords also come from the environment.
# graph_dtu = Graph('bolt+s://kg.biosustain.dtu.dk:7687', auth=('robin', os.environ['NEO4J_DTU_PASSWORD']))
# graph = Graph('bolt://localhost:7687', auth=('neo4j', os.environ['NEO4J_LOCAL_PASSWORD']))

# Delete Mesh Synonyms with comma, excluding chemicals (LL-2974)

In [None]:
# For every TopicalDescriptor that has a tree number starting with A, C, F or G,
# collect the names of its synonyms that do NOT contain a comma, and export the
# result (id, name, list of comma-free synonyms) to an Excel sheet.
# `optional match` keeps descriptors that have no comma-free synonym at all;
# they show up with an empty `syns` list.
query = """
match(n:TopicalDescriptor)-[:HAS_TREENUMBER]-(t) where left(t.id, 1) in ['A', 'C', 'F', 'G'] 
with distinct n optional match (n)-[:HAS_SYNONYM]-(s) where not s.name contains ',' 
with n, collect(s.name) as syns
return n.id, n.name, syns
"""
# Use `graph_staging`: the bare `graph` name is only assigned in a commented-out
# line, so referencing it raised NameError on a fresh kernel. The query is read-only.
# NOTE(review): confirm staging is the intended database for this export.
df = graph_staging.run(query).to_data_frame()
df.to_excel('/Users/rcai/data/notebook/mesh/meshACFG_synons.xlsx', index=False)

There are 30 terms in the above categories (A, C, F, G) for which every synonym contains ','.  None of them were involved in the LMDB search.

For diseases, 1679 of 4970 have no terms without ','.  Of those 1679, 1562 are mapped to a TopicalDescriptor whose tree number starts with 'C' (disease category).  

To keep the synonyms in MeSH and LMDB consistent, remove all synonyms containing ',' in the following categories:
1. Tree number starts with a letter in ['A', 'C', 'F', 'G']: all ','-containing synonyms were linked only to MeSH nodes
2. Is a MeSH disease: there are 18 synonyms with ',' that are linked to a gene full name.  Since gene full-name search is never used, it is OK to remove these synonyms


### Run the following query
```
// Pick the distinct TopicalDescriptor nodes that have a tree number whose id starts
// with A, C, F or G, then delete every HAS_SYNONYM neighbour whose name contains a
// comma. `detach delete` also removes all relationships attached to the deleted node.
match(n:TopicalDescriptor)-[:HAS_TREENUMBER]-(t) where left(t.id, 1) in ['A', 'C', 'F', 'G'] 
with distinct n match (n)-[:HAS_SYNONYM]-(s) where s.name contains ',' 
detach delete s; 
```
Result: Deleted 43678 nodes, deleted 43995 relationships
    
    
```
// Delete every HAS_SYNONYM neighbour of a Disease node whose name contains a comma,
// together with all relationships attached to that neighbour.
match (n:Disease)-[:HAS_SYNONYM]-(s) where s.name contains ',' 
detach delete s
```
Deleted 6054 nodes, deleted 6150 relationships

# Remove non-chemical single letter synonyms and Gene full names from Synonym (LL-3031)

#### Delete single-letter synonym relationships to non-chemicals
Total 498211 relationships removed
```
// Batched delete: the first statement selects HAS_SYNONYM relationships between a
// one-character Synonym and any node that is not labelled Chemical; the second
// statement deletes them, 5000 per batch. Only relationships are removed here,
// the Synonym nodes themselves are kept.
call apoc.periodic.iterate(
"match(n:Synonym)-[r:HAS_SYNONYM]-(x) where size(n.name) = 1 and not 'Chemical' in labels(x) return r",
"delete r",
{batchSize:5000}
);
```

#### Delete gene synonyms that are the gene full name
Total 23783668 relationships removed
```
// Batched delete: the first statement selects HAS_SYNONYM relationships of NCBI Gene
// nodes where the synonym name equals the gene's full_name; the second statement
// deletes them, 5000 per batch. The synonym nodes are left in place.
call apoc.periodic.iterate(
"match (n:Gene:db_NCBI)-[r:HAS_SYNONYM]-(s) where n.full_name=s.name return r",
"delete r",
{batchSize:5000}
)
```

#### Remove orphan synonyms
Removed 4239521 synonym nodes
```
// Batched delete: the first statement selects Synonym nodes that have no relationship
// of any type or direction; the second statement deletes them, 5000 per batch.
// Plain `delete` is sufficient because the selected nodes have no relationships.
call apoc.periodic.iterate(
"match (n:Synonym) where not (n)-[]-() return n",
"delete n",
{batchSize:5000}
)
```

In [10]:
# verification
# Fetch every (gene name, gene full name, synonym name) row for Gene nodes linked to
# Taxonomy id '9606' and export them for manual inspection after the clean-up.
query = """
match(:Taxonomy {id:'9606'})-[:HAS_TAXONOMY]-(n:Gene)-[:HAS_SYNONYM]-(s) return n.name, n.full_name, s.name
"""
df = graph_staging.run(query).to_data_frame()
df.to_excel("/Users/rcai/data/notebook/gene/human_gene_syns.xlsx", index=False)
# Row count goes last: Jupyter displays only the final expression of a cell, and
# `to_excel` returns None, so a mid-cell `len(df)` was silently discarded.
len(df)

131759