In [10]:
import pandas as pd
import xml.etree.ElementTree as et
from itertools import chain
from collections import defaultdict
from pathlib import Path
import plotly.express as px
from uuid import uuid4
import sqlite3
data_dir = Path('../pathways/raw')
db_path = Path('../db/taxonomy.db')

In [35]:
x_range = 600
y_range = 1000
node_cols = ['id', 'name', 'type', 'x', 'y', 'pathway']
edge_cols = ['id', 'source', 'target', 'pathway']


def get_node_data(entry, pathway):
    try:
        tmp = [x for x in entry if x.tag == 'graphics'][0].attrib
        return (
            entry.attrib['id'], tmp['name'], tmp['type'], int(tmp['x']), y_range - int(tmp['y']), pathway
        )
    except KeyError:
        print(f"{entry.attrib} node error")
        return None


def get_edge_data(element, id_map, pathway):
    try:
        subtype = [x for x in element if x.tag == 'subtype'][0]
        source_1 = element.attrib['entry1']
        target_1 = subtype.attrib['value']
        source_2 = target_1
        target_2 = element.attrib['entry2']
        return [
            (str(uuid4()), id_map[source_1], id_map[target_1], pathway), (str(uuid4()), id_map[source_2], id_map[target_2], pathway)
        ]
    except (KeyError, IndexError):
        print(f"{element.attrib} edge error")


def create_table(name, schema, cursor, connection):
    cursor.execute(f"DROP TABLE IF EXISTS {name};")
    connection.commit()
    cursor.execute(schema)
    connection.commit()

In [37]:
node_table_name = 'pathway_nodes'
node_schema = f"""
    CREATE TABLE IF NOT EXISTS {node_table_name} (
        id TEXT PRIMARY KEY,
        name TEXT,
        type TEXT,
        x INT,
        y INT,
        pathway INT
    )
"""
node_q = f"""
    INSERT INTO {node_table_name} ({",".join(node_cols)}) VALUES (?, ?, ?, ?, ?, ?)
"""

edge_table_name = 'pathway_edges'
edge_schema = f"""
    CREATE TABLE IF NOT EXISTS {edge_table_name} (
        id TEXT PRIMARY KEY,
        source TEXT,
        target TEXT,
        pathway INT,
        FOREIGN KEY (source) REFERENCES pathway_nodes(id),
        FOREIGN KEY (target) REFERENCES pathway_nodes(id)
    )
"""
edge_q = f"""
    INSERT INTO {edge_table_name} ({",".join(edge_cols)}) VALUES (?, ?, ?, ?)
"""

In [38]:
with sqlite3.connect(db_path, autocommit=False) as connection:
    cursor = connection.cursor()
    create_table(node_table_name, node_schema, cursor, connection)
    create_table(edge_table_name, edge_schema, cursor, connection)
    for f_path in [ x for x in data_dir.iterdir() if x.suffix.endswith('kgml')]:
        pathway = int(f_path.stem)
        print(pathway)
        tree = et.parse(f_path)
        root = tree.getroot()
        nodes = [
            x for x in 
            [ get_node_data(x, pathway) for x in root if x.tag == 'entry' ]
            if x is not None
        ]
        id_map = {
            x[0]: str(uuid4())
            for x in nodes
        }
        # change id to uuid
        edges = list(chain.from_iterable([
            x for x in 
            [ get_edge_data(x, id_map, pathway) for x in root if x.tag == 'relation' ]
            if x is not None
        ]))
        nodes = [
            (id_map[n[0]],) + n[1:]
            for n in nodes
        ]
        cursor.executemany(node_q, nodes)
        connection.commit()
        cursor.executemany(edge_q, edges)
        connection.commit()



630
260
364
333
626
61
523
470
20
562
515
{'id': '39', 'name': 'ec:2.4.3.6', 'type': 'enzyme', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.3.6'} node error
{'id': '42', 'name': 'ec:2.4.1.109', 'type': 'enzyme', 'reaction': 'rn:R07620', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.1.109'} node error
{'id': '44', 'name': 'ec:2.4.1.135', 'type': 'enzyme', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.1.135'} node error
{'id': '46', 'name': 'ec:2.4.1.109', 'type': 'enzyme', 'reaction': 'rn:R07620', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.1.109'} node error
{'id': '49', 'name': 'ec:2.4.1.152', 'type': 'enzyme', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.1.152'} node error
{'id': '51', 'name': 'ec:2.4.1.109', 'type': 'enzyme', 'reaction': 'rn:R07620', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.1.109'} node error
{'id': '68', 'name': 'ec:2.4.3.6', 'type': 'enzyme', 'link': 'https://www.kegg.jp/dbget-bin/www_bget?2.4.3.6'} node error
{'id': '7

In [29]:

with sqlite3.connect(db_path, autocommit=False) as connection:
    cursor = connection.cursor()
    cursor.executemany(edge_q, edges)
    connection.commit()