In [8]:
import requests
import pandas as pd
from io import StringIO
import re
from collections import defaultdict
import sqlite3
from pathlib import Path
from uuid import uuid4

In [3]:
# SQLite database file that the write cell at the end of this notebook
# connects to. The path is relative, so it resolves against the current
# working directory of the kernel.
db_path = Path('../db/taxonomy.db')

def create_table(name, schema, cursor, connection):
    """Recreate the table ``name`` from scratch.

    Any existing table called ``name`` is dropped first, then ``schema`` is
    executed as-is (the callers in this notebook pass a ``CREATE TABLE``
    statement). A commit is issued after each of the two statements.

    Args:
        name: Table name. It is interpolated directly into the ``DROP TABLE``
            text, so it must be a trusted identifier.
        schema: SQL statement that creates the table.
        cursor: Cursor used to execute both statements.
        connection: Connection on which the commits are issued.
    """
    statements = (f"DROP TABLE IF EXISTS {name};", schema)
    for statement in statements:
        cursor.execute(statement)
        connection.commit()

In [6]:
# Fetch the superpathway list (KEGG BRITE 08901). The parsing code that follows
# keeps only the first section of it (i.e. metabolism).
url = 'https://rest.kegg.jp/get/br:br08901'
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
resp = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of handing an error page to the parser.
resp.raise_for_status()
resp_str = resp.text

# # # cached
# with open('superpathway_cached.txt') as f:
#     resp_str = f.read()

# Capture the body of the first "A" section: everything between the first line
# that starts with "A" and the next line that starts with "A".
p = "\nA.+?\n(.+?)\nA"
matches = re.findall(p, resp_str, re.DOTALL)
if not matches:
    # Without this check the failure would be a bare IndexError on matches[0].
    raise ValueError("no 'A' section found in the BRITE response text")
match_section = matches[0].split('\n')

# Result rows: (pathway number, pathway name, name of the enclosing B group).
pathway_mapping = []
curr_b = ''
# "<level letter> <payload>" for any row of the section.
p2 = re.compile("^([ABC])\\s+?(\\S.+?)$")
# "<digits> <name>" inside the payload of a C row.
p3 = re.compile("^([0-9]+?)\\s+?(\\S.+?)$")
for row in match_section:
    row_match = p2.match(row)
    if row_match is None:
        # Rows with no payload (blank, or a bare level letter) carry no data.
        if row.strip() in ('', 'A', 'B', 'C'):
            continue
        # Anything else is unexpected; report the row instead of failing with
        # an AttributeError on None.
        raise ValueError(f"unrecognised BRITE row: {row!r}")
    k, v = row_match.groups()
    if k == 'B':
        # Remember the current B group; the C rows that follow belong to it.
        curr_b = v
    elif k == 'C':
        c_match = p3.match(v)
        if c_match is None:
            raise ValueError(f"C row is not '<number> <name>': {row!r}")
        n, m = c_match.groups()
        pathway_mapping.append((int(n), m, curr_b))


In [14]:
# Distinct superpathway names in first-seen order. dict.fromkeys de-duplicates
# like set() but keeps insertion order, so the row order of `sp` is the same on
# every run (the iteration order of a set of strings can change between
# interpreter sessions).
sp_names = list(dict.fromkeys(x[2] for x in pathway_mapping))
# One random (uuid4) id per superpathway name; regenerated on every run.
sp_mapping = {x: str(uuid4()) for x in sp_names}

# Rows for the superpathway table: (id, name).
sp = [(sp_mapping[x], x) for x in sp_names]
# Rows for the pathway table: (pathway number, pathway name, superpathway id).
pathways = [
    (pw_number, pw_name, sp_mapping[sp_name])
    for pw_number, pw_name, sp_name in pathway_mapping
]

In [12]:
# --- superpathways table: one row per superpathway (id, name) ---
sp_table_name = 'superpathways'
sp_cols = ['id', 'name']
# Comma-separated column list and one "?" placeholder per column.
sp_col_list = ",".join(sp_cols)
sp_placeholders = ",".join("?" * len(sp_cols))
sp_schema = f"""
    CREATE TABLE IF NOT EXISTS {sp_table_name} (
        id TEXT PRIMARY KEY,
        name TEXT
    )
"""
sp_q = f"""
    INSERT INTO {sp_table_name} ({sp_col_list}) VALUES ({sp_placeholders})

"""

# --- pathway table: one row per pathway, pointing at its superpathway id ---
pathway_table_name = 'pathway_superpathways'
pathways_cols = ['id', 'name', 'superpathway']
pathways_col_list = ",".join(pathways_cols)
pathways_placeholders = ",".join("?" * len(pathways_cols))
pathway_schema = f"""
    CREATE TABLE IF NOT EXISTS {pathway_table_name} (
        id INT PRIMARY KEY,
        name TEXT,
        superpathway TEXT,
        FOREIGN KEY (superpathway) REFERENCES {sp_table_name}(id)
    )
"""
pathway_q = f"""
    INSERT INTO {pathway_table_name} ({pathways_col_list}) VALUES ({pathways_placeholders})
"""

In [15]:
# Rebuild both tables and load the prepared rows into them.
# Using the connection as a context manager only commits (or rolls back on an
# exception); it does not close the connection. The try/finally closes it so
# the database file is not left open by the kernel.
connection = sqlite3.connect(db_path, autocommit=False)
try:
    with connection:
        cursor = connection.cursor()
        create_table(sp_table_name, sp_schema, cursor, connection)
        create_table(pathway_table_name, pathway_schema, cursor, connection)
        cursor.executemany(sp_q, sp)
        connection.commit()
        cursor.executemany(pathway_q, pathways)
        connection.commit()
finally:
    connection.close()
    

In [None]:
# pathway_df = pd.DataFrame.from_records(pathway_mapping, columns=['number', 'name', 'superpathway'])
# pathway_df.to_csv('../pathways/pathway_manifest.csv', index=None)