In [1]:
import requests
import re
import os

# Get all E.C. Numbers available in KEGG

In [17]:
database = "enzyme"
kegg_url = f"http://rest.kegg.jp/list/{database}"

In [18]:
r = requests.get(kegg_url)

In [19]:
r_small = r.content[:5000]
r_entire = r.content

In [20]:
pattern = b"ec:[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}"

small_ecs = re.findall(pattern, b"my_eyes ec:1.1.1.1 my eyes")
print(small_ecs)

[b'ec:1.1.1.1']


In [21]:
all_ecs = re.findall(pattern, r_entire)

In [22]:
len(all_ecs)

7813

In [23]:
len(set(all_ecs))

5854

In [3]:
all_ec_file = "./data/ECnumbers.txt"

In [24]:
with open(all_ec_file, "wb") as f:
    for ecs in set(all_ecs):
        f.write(ecs)
        f.write(b"\n")

# Get all gene identifiers from ECs

In [2]:
def get_ec_content(ec_num):
    ec_url = f"http://rest.kegg.jp/get/{ec_num}"
    ec_req = requests.get(ec_url)
    con = ec_req.content
    return con
    

In [10]:
def get_gene_content(content):
    pattern_start = b"GENES"
    pattern_end = b"DBLINKS"
    m_start = re.search(pattern_start, content)
    m_end = re.search(pattern_end, content)
    if m_start is not None:
        gene_start = m_start.end()
        gene_end = m_end.start()
        gene_content = content[gene_start:gene_end]
        return gene_content
    else:
        return
    

In [11]:
def save_ec_file(gene_content, ec_num):
    species_genes = gene_content.strip().split(b"\n")
    with open("./data/ec_gene_info/" + ec_num + ".txt", "wb") as f:
        for sg in species_genes:
            f.write(re.sub(b"\s+", b",", sg.strip()))
            f.write(b"\n")
    return

In [2]:
def save_ec_files_gene_info(file):
    stream = open(file)
    no_info = set()
    while True:
        try:
            line = next(stream)
            ec_num = line.strip()
            con = get_ec_content(ec_num)
            gene_content = get_gene_content(con)
            if gene_content is not None:
                save_ec_file(gene_content, ec_num)
            else:
                no_info.add(ec_num)

        except:
            break

    stream.close()
    return no_info

In [1]:
%%time
save_ec_files_gene_info(all_ec_file)

# Get all substrate identifiers from EC numbers

In [13]:
def get_substrate_content(content):
    pattern_start = b"SUBSTRATE"
    pattern_end = b"PRODUCT"
    m_start = re.search(pattern_start, content)
    m_end = re.search(pattern_end, content)
    if m_start is not None:
        subs_start = m_start.end()
        subs_end = m_end.start()
        subs_content = content[subs_start:subs_end]
        return subs_content
    else:
        return

In [14]:
def save_ec_subs_file(subs_content, ec_num):
    subs = subs_content.strip().split(b"\n")
    with open("./data/ec_subs_info/" + ec_num + ".txt", "wb") as f:
        for sub in subs:
            f.write(sub.strip())
            f.write(b"\n")
    return

In [15]:
def save_ec_subs_info(file):
    stream = open(file)
    no_info = set()
    while True:
        try:
            line = next(stream)
            ec_num = line.strip()
            con = get_ec_content(ec_num)
            subs_content = get_substrate_content(con)
            if subs_content is not None:
                save_ec_subs_file(subs_content, ec_num)
            else:
                no_info.add(ec_num)

        except:
            break

    stream.close()
    return no_info

In [None]:
%%time
no_info_ec_subs = save_ec_subs_info(all_ec_file)

# Download sequences for gene identifiers

In [9]:
def get_gene_content(gene_id):
    url = f"http://rest.kegg.jp/get/{gene_id}"
    r = requests.get(url)
    if r.status_code != 200:
        print(gene_id)
    return r.content


def get_aa_sequence(content):
    pattern_start = b"AASEQ"
    pattern_end = b"NTSEQ"
    m_start = re.search(pattern_start, content)
    m_end = re.search(pattern_end, content)
    if m_start is not None:
        aa_start = m_start.end()
        aa_end = m_end.start()
        aa_content = content[aa_start:aa_end]
        return aa_content
    else:
        return

    
def save_aa_seq(file):
    stream = open(file)
    filename = os.path.basename(file)
    lines = stream.readlines()
    max_len = 5
    curr_len = 0
    with open(f"./data/gene_info/{filename}", "wb+") as f:
        for line in lines:
            if curr_len>max_len:
                break

            # one species for an ec
            genes = line#next(stream)
            all_genes = genes.strip().split(",")
            prefix = all_genes[0].lower()

            for gene_idx in all_genes[1:]:
                # get rid of anything after "()"
                pattern = "\(.+\)"
                gene_idx = re.sub(pattern, "", gene_idx)
                search_query = prefix + gene_idx
                con = get_gene_content(search_query)
                seq_info = get_aa_sequence(con).strip().split(b"\n")
                if seq_info is None:
                    print(gene_idx)
                seq_len = int(seq_info[0])
                seq_list = list(map(lambda x:x.strip(), seq_info[1:]))
                seq = b"".join(seq_list)
#                assert len(seq) == seq_len
                f.write(b">" + search_query.encode("utf-8") + b"\n")
                f.write(seq)
                f.write(b"\n")
                
            curr_len+=1
            
    stream.close()
    return

In [9]:
filename = "./data/ec_gene_info/ec:1.1.1.2.txt"

In [10]:
%%time
save_aa_seq(filename)

CPU times: user 18.5 ms, sys: 1.05 ms, total: 19.6 ms
Wall time: 2.15 s


In [3]:
ec_info_dir = "./data/ec_gene_info/"
all_files= [f.name for f in os.scandir(ec_info_dir)]


In [12]:
%%time
for f in all_files:
    if f.endswith(".txt"):
        # this is an ec file
        filename = ec_info_dir + f
        save_aa_seq(filename)

# Reaction IDs from ECs

In [107]:
def get_reaction_content(content):
    pattern_start = b"ALL_REAC"
    pattern_end = b"SUBSTRATE"
    m_start = re.search(pattern_start, content)
    m_end = re.search(pattern_end, content)
    if m_start is not None and m_end is not None:
        reac_start = m_start.end()
        reac_end = m_end.start()
        reac_content = content[reac_start:reac_end]
        return reac_content
    else:
        return

In [112]:
def save_reaction_ids(file):
    stream = open(file, "rb")
    no_info = set()
    reaction_id_pattern = re.compile(b"R[0-9]{5}")
    write_stream = open("Reaction_ids_ec.txt", "wb")
    encoding = 'utf-8'

    idx = 0
    for line in stream.readlines():
        ec_num = line.strip()
        con = get_ec_content(str(ec_num, encoding))
        if con:
            reaction_content = get_reaction_content(con)
        else:
            reaction_content = None


        if reaction_content is not None:
            # write the ec_num
            write_stream.write(ec_num + b"\n")

            # Split into multiple lines
            all_reaction_lines = reaction_content.strip().split(b"\n")
            # parse through each line
            for reaction_line in all_reaction_lines:
                # check to see if '>' is present, that means there is a general-specific reaction division
                if b">" in reaction_line:
                    split_con = reaction_line.split(b">")
                    general_con = split_con[0]
                    specific_con = split_con[1]
                    general_reaction_ids = re.findall(reaction_id_pattern, general_con)
                    specific_reaction_ids = re.findall(reaction_id_pattern, specific_con)
                    # writing the general reaction id
                    if len(general_reaction_ids) != 1:
                        print(ec_num, general_reaction_ids)
                    write_stream.write(b">" + general_reaction_ids[0] + b"\n")
                    # writing the specific reaction id
                    write_stream.write(b",".join(specific_reaction_ids))
                    write_stream.write(b"\n")

                else:
                    # no general or specific here just parse for reaction ids
                    reaction_ids = re.findall(reaction_id_pattern, reaction_line)
                    write_stream.write(b",".join(reaction_ids))
                    write_stream.write(b"\n")


        else:
            # no reaction info for these ec numbers
            no_info.add(ec_num)

        idx += 1
        
        
        if idx/5000 in [i/10 for i in range(1, 9)]:
            print("*", end="")


    stream.close()
    write_stream.close()
    return no_info

In [113]:
no_reaction_info = save_reaction_ids(all_ec_file)

***b'ec:2.4.1.47' [b'R01500', b'R06277']
***b'ec:2.4.1.15' [b'R00836', b'R06043']
**b'ec:1.1.99.18' [b'R01443', b'R06246']
b'ec:2.4.1.10' [b'R05140', b'R06079']


In [101]:
stream = open(all_ec_file)
all_ecs = stream.readlines()