# Import taxonomy dataset into Neo4j

In [1]:
import os

import pandas as pd
from graphdatascience import GraphDataScience

# Connection settings are read from the environment so that no secret is
# stored in the notebook. Export NEO4J_PASSWORD (and optionally NEO4J_URI /
# NEO4J_USER) before starting the kernel.
host = os.environ.get("NEO4J_URI", "bolt://44.193.28.203:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
# No default on purpose: a missing password should fail here with a KeyError
# rather than fall back to a value committed alongside the code.
password = os.environ["NEO4J_PASSWORD"]
gds = GraphDataScience(host, auth=(user, password))

def batch_import(query, params_df, batch_size=25_000):
    """Run ``query`` once per chunk of rows from ``params_df``.

    Each chunk is converted to a list of row dictionaries (keyed by column
    name) and handed to the query as the ``data`` parameter.

    Args:
        query: Cypher statement that reads the ``$data`` list parameter.
        params_df: pandas DataFrame whose rows are sent to the database.
        batch_size: maximum number of rows per ``gds.run_cypher`` call.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (a negative value
            would otherwise import nothing without any error).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(params_df), batch_size):
        # Convert one slice at a time so only a single batch of Python dicts
        # is alive at once, instead of materialising the whole frame up front.
        batch_data = params_df.iloc[start:start + batch_size].to_dict('records')
        gds.run_cypher(query, {'data': batch_data})

Download the [new taxonomy dump files](https://ncbiinsights.ncbi.nlm.nih.gov/2018/02/22/new-taxonomy-files-available-with-lineage-type-and-host-information/) and place them in the same folder as this Jupyter notebook.

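*Make sure the APOC and Graph Data Science plugins are installed in Neo4j: the import queries below call both `apoc.*` functions and `gds.util.NaN()`.*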

# Import nodes

	tax_id					          -- node id in GenBank taxonomy database
 	parent tax_id				      -- parent node id in GenBank taxonomy database
 	rank					          -- rank of this node (superkingdom, kingdom, ...) 
 	embl code				          -- locus-name prefix; not unique
 	division id				          -- see division.dmp file
 	inherited div flag  (1 or 0)      -- 1 if node inherits division from parent
 	genetic code id				      -- see gencode.dmp file
 	inherited GC  flag  (1 or 0)      -- 1 if node inherits genetic code from parent
 	mitochondrial genetic code id     -- see gencode.dmp file
 	inherited MGC flag  (1 or 0)      -- 1 if node inherits mitochondrial gencode from parent
 	GenBank hidden flag (1 or 0)      -- 1 if name is suppressed in GenBank entry lineage
 	hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
 	comments				          -- free-text comments and citations
    plastid genetic code id           -- see gencode.dmp file
    inherited PGC flag  (1 or 0)      -- 1 if node inherits plastid gencode from parent
	specified_species			      -- 1 if species in the node's lineage has formal name
    hydrogenosome genetic code id     -- see gencode.dmp file
    inherited HGC flag  (1 or 0)      -- 1 if node inherits hydrogenosome gencode from parent

In [2]:
# Fields are split on "|" with the tab declared as quote character
# (presumably so the tabs surrounding each field are dropped - confirm
# against the dump format).
# low_memory=False makes pandas infer every column's dtype from all of its
# values in one pass instead of chunk by chunk, which avoids mixed-type
# columns and the accompanying DtypeWarning.
nodes = pd.read_csv('nodes.dmp', delimiter="|", header=None, quotechar='\t', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Column labels for nodes.dmp, in file order. The final "None" entry labels
# the extra column produced by the delimiter that ends every line.
node_columns = [
    "tax_id",
    "parent tax_id",
    "rank",
    "embl code",
    "division id",
    "inherited div flag",
    "genetic code id",
    "inherited GC flag",
    "mitochondrial genetic code id",
    "inherited MGC flag",
    "GenBank hidden flag",
    "hidden subtree root flag",
    "comments",
    "plastid genetic code id",
    "inherited PGC flag",
    "specified_species",
    "hydrogenosome genetic code id",
    "inherited HGC flag",
    "None",
]
nodes.columns = node_columns

In [4]:
# Preview the first five rows to check the column assignment.
nodes.head(n=5)

Unnamed: 0,tax_id,parent tax_id,rank,embl code,division id,inherited div flag,genetic code id,inherited GC flag,mitochondrial genetic code id,inherited MGC flag,GenBank hidden flag,hidden subtree root flag,comments,plastid genetic code id,inherited PGC flag,specified_species,hydrogenosome genetic code id,inherited HGC flag,None
0,1,1,no rank,,8,0,1,0,0,0,0,0,,,,0,0.0,0,
1,2,131567,superkingdom,,0,0,11,0,0,0,0,0,,,,0,0.0,1,
2,6,335928,genus,,0,1,11,1,0,1,0,0,,,,0,0.0,1,
3,7,6,species,AC,0,1,11,1,0,1,1,0,,,,1,0.0,1,
4,9,32199,species,BA,0,1,11,1,0,1,1,0,,,,1,0.0,1,


In [5]:
# Number of taxonomy nodes read from the dump.
nodes.shape[0]

2425415

In [6]:
# Uniqueness constraint on :Node(id); created before the import so the
# id lookups in the import queries are backed by it.
node_id_constraint = """
CREATE CONSTRAINT IF NOT EXISTS FOR (n:Node) REQUIRE n.id IS UNIQUE;
"""
gds.run_cypher(node_id_constraint)

In [7]:
# One :Node per row of nodes.dmp.
# MERGE (rather than CREATE) keeps this cell re-runnable: with the unique
# constraint on :Node(id) a second CREATE of the same id aborts the batch,
# e.g. when resuming after a partially failed import.
# apoc.map.clean drops the listed keys plus empty-string / NaN values before
# the remaining columns are stored as properties; the rank becomes an extra
# label on the node.
create_nodes_query = """
UNWIND $data AS row
MERGE (n:Node {id: row.tax_id})
SET n += apoc.map.clean(row, ['tax_id', 'parent tax_id', 'rank'], ["", gds.util.NaN()])
WITH n, row.rank AS rank
CALL apoc.create.addLabels(n, [apoc.text.capitalize(rank)]) YIELD node
RETURN distinct 'done'
"""

batch_import(create_nodes_query, nodes)

In [8]:
# Link every node to its parent.
# The root row lists itself as its own parent (tax_id 1 -> parent tax_id 1),
# so rows where both ids are equal are skipped to avoid a PARENT self-loop.
parent_relation_import_query = """
UNWIND $data AS row
WITH row
WHERE row.tax_id <> row.`parent tax_id`
MATCH (child:Node {id:row.tax_id})
MATCH (parent:Node {id:row.`parent tax_id`})
MERGE (child)-[:PARENT]->(parent)
"""

batch_import(parent_relation_import_query, nodes[['tax_id', 'parent tax_id']])

# Import names

names.dmp
---------
Taxonomy names file has these fields:

	tax_id					-- the id of node associated with this name
	name_txt				-- name itself
	unique name				-- the unique variant of this name if name not unique
	name class				-- (synonym, common name, ...)

In [9]:
# Load names.dmp: no header row, "|" between fields, tab as quote character.
names = pd.read_csv('names.dmp', sep="|", quotechar='\t', header=None)

In [10]:
# Column labels for names.dmp; 'none' labels the trailing extra column.
name_columns = [
    'tax_id',
    'name_txt',
    'unique_name',
    'name_class',
    'none',
]
names.columns = name_columns
names.head(n=5)

Unnamed: 0,tax_id,name_txt,unique_name,name_class,none
0,1,all,,synonym,
1,1,root,,scientific name,
2,2,Bacteria,Bacteria <bacteria>,scientific name,
3,2,bacteria,,blast name,
4,2,eubacteria,,genbank common name,


In [11]:
import_names_query = """
UNWIND $data AS row
MATCH (n:Node {id: row.tax_id})
SET n.name = row.name_txt
"""

# names.dmp has several rows per tax_id (synonym, scientific name, blast
# name, ...). Writing all of them into the single `name` property would leave
# whichever row happened to come last, so only the scientific name is kept.
# NOTE(review): any later lookup by `name` (e.g. the host import) will see
# scientific names only - confirm that is the intended matching key.
scientific_names = names.loc[names['name_class'] == 'scientific name', ['tax_id', 'name_txt']]
batch_import(import_names_query, scientific_names)

# Import hosts 

host.dmp
--------
Theoretical host for organism file fields:

	tax_id					-- node id
	potential_hosts				-- theoretical host list separated by comma ','

In [12]:
# Load host.dmp: no header row, "|" between fields, tab as quote character.
hosts = pd.read_csv('host.dmp', sep="|", quotechar='\t', header=None)

In [13]:
# Column labels for host.dmp; 'none' labels the trailing extra column.
host_columns = ['tax_id', 'potential_hosts', 'none']
hosts.columns = host_columns
hosts.head(n=5)

Unnamed: 0,tax_id,potential_hosts,none
0,562,"bacteria,vertebrates",
1,666,bacteria,
2,686,bacteria,
3,1280,human,
4,1307,bacteria,


In [14]:
# Index on :Node(name), used by the host import which looks nodes up by name.
node_name_index = """
CREATE INDEX IF NOT EXISTS FOR (n:Node) ON (n.name)
"""
gds.run_cypher(node_name_index)

In [None]:
# For every row, split the comma-separated host list and connect the node to
# each node whose `name` equals a host entry exactly. Host entries without a
# node of that name are skipped, because MATCH yields no row for them.
import_hosts_query = """
UNWIND $data AS row
MATCH (n:Node {id: row.tax_id})
UNWIND split(row.potential_hosts, ',') AS host
MATCH (h:Node {name: host})
MERGE (n)-[:POTENTIAL_HOST]->(h)
"""
batch_import(query=import_hosts_query, params_df=hosts)

# Import citations

citations.dmp
-------------
Citations file fields:

	cit_id				-- the unique id of citation
	cit_key				-- citation key
    medline_id          -- unique id in MedLine database (0 if not in MedLine)
	pubmed_id		    -- unique id in PubMed database (0 if not in PubMed)
	url					-- URL associated with citation
	text				-- any text (usually article name and authors)
						-- The following characters are escaped in this text by a backslash:
						-- newline (appear as "\n"),
						-- tab character ("\t"),
						-- double quotes ('\"'),
						-- backslash character ("\\").
	taxid_list			-- list of node ids separated by a single space

In [None]:
# Load citations.dmp: no header row, "|" between fields, tab as quote character.
citations = pd.read_csv('citations.dmp', sep="|", quotechar='\t', header=None)

In [None]:
# Column labels for citations.dmp; 'none' labels the trailing extra column.
citation_columns = [
    'cit_id',
    'cit_key',
    'medline_id',
    'pubmed_id',
    'url',
    'text',
    'taxid_list',
    'none',
]
citations.columns = citation_columns

In [None]:
# Preview only the first rows, like the other tables in this notebook,
# instead of rendering the whole frame.
citations.head()

In [None]:
# IF NOT EXISTS keeps the cell re-runnable (a plain CREATE CONSTRAINT fails
# when the constraint is already present) and matches the :Node constraint.
gds.run_cypher('CREATE CONSTRAINT IF NOT EXISTS FOR (c:Citation) REQUIRE c.id IS UNIQUE;')

In [None]:
# One :Citation per row; the remaining columns become properties after
# apoc.map.clean removes the listed keys and the '0' / NaN values.
# Rows whose taxid_list is NaN are dropped before the split; every remaining
# space-separated id is converted to an integer and linked via MENTIONS to
# the matching :Node.
import_citations_query = """
UNWIND $data AS row
MERGE (c:Citation {id: row.cit_id})
SET c += apoc.map.clean(row, ['cit_id', 'none', 'taxid_list'], ['0', gds.util.NaN()])
WITH c, row.taxid_list AS taxid_list
WHERE toString(taxid_list) <> "NaN"
UNWIND split(toString(taxid_list), ' ') AS tax_id
WITH c, tax_id
MATCH (n:Node {id: toInteger(tax_id)})
MERGE (c)-[:MENTIONS]->(n);
"""

# Smaller batches than the default are used for this import.
batch_import(query=import_citations_query, params_df=citations, batch_size=5000)