In [1]:
import requests
import re
from tqdm import tqdm
import time
import pandas as pd

In [2]:
METACYC_PROTEINS_PATH = '/Users/Itai/Library/Mobile Documents/com~apple~CloudDocs/from_box/Grad/research/molecule_databases/Metacyc_v26.5/data/protein-seq-ids-unreduced.dat'

In [3]:
def query_genbank_proteins (id_list):
    """
    Retrieve protein FASTA for list of GenBank accession IDs
    """
    
    db = 'protein'
    ids_string = ','.join(id_list)
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    
    url = base + f"epost.fcgi?db={db}&id={ids_string}"
    
    res = requests.get(url).text
    web = re.findall('(?<=<WebEnv>).*(?=<\/WebEnv>)', res)
    key = re.findall('(?<=<QueryKey>).*(?=<\/QueryKey>)', res)
    if len(web) and len(key):
        web = web[0]
        key = key[0] 
        url2 = base + f"efetch.fcgi?db={db}&query_key={key}&WebEnv={web}"+ "&rettype=fasta&retmode=text"
        res = requests.get(url2).text.split('\n\n>')
    else:
        res = None
    return res

def query_batched (full_id_list, batch_size=100):
    """
    Retrieve protein FASTA for list of GenBank accession IDs in batches
    """
    list_of_lists = [full_id_list[i:i + batch_size] for i in range(0, len(full_id_list), batch_size)]
    query_results = []
    for ls in tqdm(list_of_lists):
        res = query_genbank_proteins(ls)
        query_results.extend(res)
    return query_results
    
def parse_query_results (result):
    """
    Extract information from queries to GenBank 
    """
    header = result.split('\n')[0]

    aid = header.split(' ')[0].replace('>','')
    escaped_aid = re.escape(aid)
    if '[' in header:
        species = re.findall('(?<=\[).*(?=\])', header)[0]
        common_name = re.findall(f'(?<={escaped_aid} ).*(?= \[)', header)[0]
    else:
        species = 'NA'
        print (aid)
        try:
            common_name = re.findall(f'(?<={escaped_aid} ).*$', header)[0]
        except:
            print (header)
            common_name = None

    sequence = ''.join(result.split('\n')[1:])
    
    return {'id':aid, 'species':species, 'common_name':common_name, 'sequence':sequence}

## GenBank

In [4]:
genbank_accessions = []
with open(METACYC_PROTEINS_PATH, 'r') as f:
    for line in f:
        genbank_accessions.extend(re.findall('(?<=PID:).+?(?=\")', line))

In [8]:
queried = query_batched(genbank_accessions)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 158/158 [06:08<00:00,  2.33s/it]


In [11]:
queried

['>AGC29953.1 CYP719A23 [Sinopodophyllum hexandrum]\nMEMEMSVLAMSSTLILALAMALIFLFKAKSSSAIKWPPGPKTLPIIGNLHQLGGDELHIVLAKLARVHGA\nIMTIWMAKKPVIVVSDVNSVWEVLVSKSSDYAARDAAEISKIVSASSHSINTSDSGPYWQTLRRGLTHGP\nLGPLNISAQIPIQQRDMQRVIREMQQDAAANGGIIKPLDHLKRSSTRLVSRLIFGDTFDNDPYNDSMHEV\nVQDLNRFGGIALLEQAFSFAKHLPSYKRGVKEFHIHKRKIDDLVRPVVASANPPSNSYLGFLQSQNYSEE\nIIIACIFELYLLAMDSSASTATWALAFMIRDQQVQEKLYQDIKRVIGDGVDLVKAEDLSKMHYLQAVVKE\nTMRMKPIAPLAIPHKTAIDTTVMGTKVPKGTCVMVNLYALHHDESVWAKPYTFMPERFLQGEDGKSVTEQ\nAFLPFGAGMRICGGMEVGKLQFSLALANLVNAFKWTSAAEGKLPDMSDELQFITVMKTPLEARIIPRNP',
 'AGC29954.1 CYP719A24 [Podophyllum peltatum]\nMEMETSVLGLSSTLIIALAITVIFLLKAKSSSAIKWPPGPKTLPIIGNLHQLGGDELHIVLAKLARVHGA\nIMTIWMAKKPVIVVSDVNSVWEVLVSKSSDYAARDAAEISKIVSASSHSINTSDSGPYWQTLRRGLTHGP\nLGPLNISAQIPIQQRDMQRVIREMQQDAAANGGVIKPLDHLKRSSTRLVSRLIFGDTFDNDPYNDSMHEV\nVQDLNRFGGIALLEQAFSFAKYLPSYKRGVKEFHIHKRKIDDLVRPVVASSNPPSNSYLGFLQSQNYSEE\nIIIACIFELYLLXXDSSASTATWALAFXIRDQQVQEKLYQDIKRVIGDEVGLVKAEDLSKMHYLQAVVKE\nTMRMKPIAPLAIPHKTAIDTSLMGTKVPKGTCVMVN

In [15]:
parsed_genbank_results = [parse_query_results(q) for q in tqdm(queried)]



  0%|                                                                                                                          | 0/15617 [00:00<?, ?it/s][A[A

 15%|████████████████▎                                                                                           | 2357/15617 [00:00<00:00, 23565.81it/s][A[A

 39%|██████████████████████████████████████████▍                                                                 | 6137/15617 [00:00<00:00, 31936.57it/s][A[A

sp|P53816.2|PLAT3_HUMAN
sp|P53816.2|PLAT3_HUMAN
AAA52495.1
AAA51588.1
AAA51589.1
sp|O23051.1|KAO1_ARATH
sp|P53816.2|PLAT3_HUMAN
pir||T43357
AAB59492.1
sp|O64692.1|G2OX3_ARATH
sp|Q9XFR9.1|G2OX2_ARATH
sp|Q8LEA2.2|G2OX1_ARATH
sp|O64692.1|G2OX3_ARATH
AAA36497.1
AAA36495.1
sp|P35354.2|PGH2_HUMAN
pir||T06787
sp|P53816.2|PLAT3_HUMAN
sp|P35354.2|PGH2_HUMAN
sp|P35354.2|PGH2_HUMAN
sp|P35354.2|PGH2_HUMAN
sp|P53816.2|PLAT3_HUMAN
sp|Q9LU36.1|4CL4_ARATH
pir||T06787


sp|P35354.2|PGH2_HUMAN
AAB59377.1
sp|P53816.2|PLAT3_HUMAN
sp|O23051.1|KAO1_ARATH
sp|O64692.1|G2OX3_ARATH
sp|Q9XFR9.1|G2OX2_ARATH
sp|Q8LEA2.2|G2OX1_ARATH
sp|Q38802.1|KSA_ARATH
AAB59377.1
sp|Q9XFR9.1|G2OX2_ARATH
sp|P35354.2|PGH2_HUMAN
pir||T06787
sp|P15954.1|COX7C_HUMAN
sp|P12074.4|CX6A1_HUMAN
sp|Q9XFR9.1|G2OX2_ARATH



 66%|██████████████████████████████████████████████████████████████████████▍                                    | 10282/15617 [00:00<00:00, 36277.28it/s][A[A

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 15617/15617 [00:00<00:00, 34562.43it/s][A[A


sp|O64692.1|G2OX3_ARATH
sp|P53816.2|PLAT3_HUMAN
sp|P53816.2|PLAT3_HUMAN
pir||T06787
sp|P53816.2|PLAT3_HUMAN
sp|O64692.1|G2OX3_ARATH
sp|Q9XFR9.1|G2OX2_ARATH
sp|Q8LEA2.2|G2OX1_ARATH
AAA52495.1
sp|Q0IS49|Q0IS49_ORYSA
sp|Q8H0F6.1|CFI3_LOTJA
AAA51588.1
AAA51589.1
sp|P53816.2|PLAT3_HUMAN
sp|P53816.2|PLAT3_HUMAN
AAA52495.1
sp|O23051.1|KAO1_ARATH
pir||T06787
sp|Q9SWE5.1|HAL3A_ARATH
sp|P53816.2|PLAT3_HUMAN
sp|O64692.1|G2OX3_ARATH
sp|Q9XFR9.1|G2OX2_ARATH
sp|Q8LEA2.2|G2OX1_ARATH
pir||T06787
sp|Q9LU36.1|4CL4_ARATH
pir||T43357
sp|O23051.1|KAO1_ARATH
sp|P53816.2|PLAT3_HUMAN
sp|P53816.2|PLAT3_HUMAN
pir||C49845
sp|O80765.2|PANK1_ARATH
sp|P93324.1|CHOMT_MEDSA
sp|P35354.2|PGH2_HUMAN





In [16]:
ids = [p['id'].split('.')[0] for p in parsed_genbank_results]

In [17]:
missing = [g for g in genbank_accessions if g.split('.')[0] not in ids]
print (len(set(missing)))
print (set(missing))

409
{'2636091', 'Q38802', '90337669', 'AAC99399', '2635124', '403330341', '110825044', '444895566', 'AAB58943', '2635916', '2633122', '110823929', 'AAH11021', 'AAH03102', '17981860', '444897109', '110823823', 'AAB87866', '444896614', '2636081', '225185293', '444895209', '2636078', 'CAA63721', '2635853', 'AAB58954', '2632420', 'AAH12068', '444894848', 'AAH00589', 'AAF91224', '444893770', '2632343', '2636099', 'AAA36350', 'AAC04267', '2633470', '444896873', '126698142', '2633524', '2634962', 'AAL50984', '2636093', '444893599', '2636085', 'CAA24036', '444897149', 'AAH10665', '444894847', 'AAD21526', '2636388', '2635547', '444896874', 'AAH01715', '444897119', '2633926', '2635396', '2636218', '225185127', '2636351', '225185286', '225185480', '2635821', 'AAD00084', '2636102', '2635866', '2632928', 'AAH20821', 'LC387598.1', '2633583', '444893547', '7544046', '2636095', '444893860', '110823923', 'AAu05951', '2636083', '2636518', '2634674', '126700901', 'AAH00813', '2635310', '32468827', '26361

In [27]:
gid_dict = {}
for gid in tqdm(set(missing)):
    res = query_genbank_proteins([gid])
    if res is not None:
        gid_dict[gid] = res[0][1:].split(' ')[0]
        time.sleep(0.5)
    else:
        print (gid)

  1%|█▍                                                                                                                  | 5/409 [00:09<15:20,  2.28s/it]

2635124


 68%|██████████████████████████████████████████████████████████████████████████████                                    | 280/409 [06:38<05:33,  2.59s/it]

AAH05936


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 409/409 [09:25<00:00,  1.38s/it]


In [31]:
converted_genbank_ids = [g if g not in gid_dict.keys() else gid_dict[g] for g in genbank_accessions]

In [32]:
new_missing = [g for g in converted_genbank_ids if g.split('.')[0] not in ids and g not in ids]
print (new_missing)
print (len(new_missing))

['Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+F+a+i+l+e+d++t+o++r+e+t+r+i+e+v+e++s+e+q+u+e+n+c+e+%3A++2+1+5+2+6+9+8+0+2+2+%0A%0A\n', 'AAD08677.1', 'AAD05419.1', 'AAD16091.1', 'AAD32453.1', 'AAH10665.1', 'BAA91208.1', 'AAK34953.1', 'AAL32064.1', 'CAA57489.1', 'AAD00084.1', 'AAH00266.1', 'AAH03674.1', 'AAD27762.1', 'AAC04270.1', 'BAB21453.1', 'AAH22369.1', 'AAD20959.1', 'AAD05420.1', 'AAH11021.1', 'AAB37259.1', 'AAB02224.1', 'AAH20821.1', 'AAH00813.1', 'BAB13985.1', 'AAD21526.1', 'AAH02772.1', 'AAC04267.1', 'AAH03102.1', 'AAC99399.1', 'AAD05427.1', 'AAH01016.1', 'AAD42056.1', 'AAA36350.1', 'AAH15837.1', 'AAH09311.1', 'AAH03351.1', 'AAD42055.1', 'CAA54099.1', 'AAH03417.1', 'AAL50984.1', 'AAD09755.1', 'CAD62165.1', 'AAB03380.1', 'AAH05936', 'AAF91224.1', 'AAF17196.1', 'AAH09189.1', 'AAH00589.2', 'AAG28167.1', 'AAG44670.1', 'AAD27748.1', 'AAF67481.1', 'AAD23566.1', 'AAC05814.1', 'NP_536843.1', 'CAA24026.1', 'AAA65501.1', 'AAB58943.1', 'AAC25449.1', 'CAA24027.1', 'NP_536844.

In [33]:
missing_queried = query_batched(new_missing, batch_size=1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.59s/it]


In [36]:
parsed_genbank_results.extend([parse_query_results(q) for q in tqdm(missing_queried)])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 9373.51it/s]


In [37]:
new_ids = [p['id'].split('.')[0] for p in parsed_genbank_results]

In [39]:
final_missing = [g for g in converted_genbank_ids if g.split('.')[0] not in new_ids and g not in new_ids]
final_missing

['Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+F+a+i+l+e+d++t+o++r+e+t+r+i+e+v+e++s+e+q+u+e+n+c+e+%3A++2+1+5+2+6+9+8+0+2+2+%0A%0A\n',
 'Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+F+a+i+l+e+d++t+o++r+e+t+r+i+e+v+e++s+e+q+u+e+n+c+e+%3A++3+3+8+8+5+8+1+7+5+%0A%0A\n',
 '2635124',
 'Error%3A+CEFetchPApplication%3A%3Aproxy_stream()%3A+Error%3A+F+a+i+l+e+d++t+o++r+e+t+r+i+e+v+e++s+e+q+u+e+n+c+e+%3A++3+3+8+8+5+8+1+7+5+%0A%0A\n']

In [55]:
gid_dict

{'2636091': 'CAB15582.1',
 'Q38802': 'sp|Q38802.1|KSA_ARATH',
 '90337669': 'EAS51320.1',
 'AAC99399': 'AAC99399.1',
 '403330341': 'AFR42418.1',
 '110825044': 'ABH00326.1',
 '444895566': 'CCP44824.1',
 'AAB58943': 'AAB58943.1',
 '2635916': 'CAB15408.1',
 '2633122': 'CAB12627.1',
 '110823929': 'ABG99212.1',
 'AAH11021': 'AAH11021.1',
 'AAH03102': 'AAH03102.1',
 '17981860': 'NP_536850.1',
 '444897109': 'CCP46375.1',
 '110823823': 'ABG99106.1',
 'AAB87866': 'AAB87866.1',
 '444896614': 'CCP45877.1',
 '2636081': 'CAB15572.1',
 '225185293': 'CAB14841.2',
 '444895209': 'CCP44465.1',
 '2636078': 'CAB15569.1',
 'CAA63721': 'CAA63721.1',
 '2635853': 'CAB15345.1',
 'AAB58954': 'AAB58954.1',
 '2632420': 'CAB11929.1',
 'AAH12068': 'AAH12068.1',
 '444894848': 'CCP44104.1',
 'AAH00589': 'AAH00589.2',
 'AAF91224': 'AAF91224.1',
 '444893770': 'CCP43024.1',
 '2632343': 'CAB11852.1',
 '2636099': 'CAB15590.1',
 'AAA36350': 'AAA36350.1',
 'AAC04267': 'AAC04267.1',
 '2633470': 'CAB12974.1',
 '444896873': 'CC

In [61]:
pid_to_sequence = {}

for res in parsed_genbank_results:
    if 'id' in res.keys():
        pid_to_sequence[res['id'].split('.')[0]] = res['sequence']
        
for gid in gid_dict:
    if gid_dict[gid].split('.')[0] in pid_to_sequence.keys():
        pid_to_sequence[gid] = pid_to_sequence[gid_dict[gid].split('.')[0]]

## Uniprot

In [5]:
uniprot_accessions = []
with open(METACYC_PROTEINS_PATH, 'r') as f:
    for line in f:
        uniprot_accessions.extend(re.findall('(?<=UNIPROT:).*?\"', line))

In [64]:
unaccounted_for_accessions = [u for u in uniprot_accessions if ':' in u and u.split(':')[1] not in metacyc_proteins_df['REACTION_ID']]
print (len(unaccounted_for_accessions))

0


In [65]:
####
#### Used .py script instead
####

In [66]:
# from Uniprot
# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
# db is 'sp' for UniProtKB/Swiss-Prot and 'tr' for UniProtKB/TrEMBL.
# UniqueIdentifier is the primary accession number of the UniProtKB entry.
# EntryName is the entry name of the UniProtKB entry.
# ProteinName is the recommended name of the UniProtKB entry as annotated in the RecName field. For UniProtKB/TrEMBL entries without a RecName field, the SubName field is used. In case of multiple SubNames, the first one is used. The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable.
# OrganismName is the scientific name of the organism of the UniProtKB entry.
# OrganismIdentifier is the unique identifier of the source organism, assigned by the NCBI.
# GeneName is the first gene name of the UniProtKB entry. If there is no gene name, OrderedLocusName or ORFname, the GN field is not listed.
# ProteinExistence is the numerical value describing the evidence for the existence of the protein.
# SequenceVersion is the version number of the sequence.

def extract_header_information(header):
    regex_pattern = r'>?([^|]+)\|([^|]+)\|([^ ]+?) (.*?)(?: OS=(.+?))(?: OX=(.+))?(?: GN=(.+?))?(?: PE=(.+?))(?: SV=(.+?))'
    match = re.match(regex_pattern, header)
    if match:
        db_value = match.group(1)
        unique_identifier = match.group(2)
        entry_name = match.group(3)
        protein_name = match.group(4)
        organism_name = match.group(5)
        organism_identifier = match.group(6)
        gene_name = match.group(7)
        protein_existence = match.group(8)
        sequence_version = match.group(9)

        return {
            "db": db_value,
            "UniqueIdentifier": unique_identifier,
            "EntryName": entry_name,
            "ProteinName": protein_name,
            "OrganismName": organism_name,
            "OrganismIdentifier": organism_identifier,
            "GeneName": gene_name,
            "ProteinExistence": protein_existence,
            "SequenceVersion": sequence_version
        }
    else:
        print ("Failed parsing")
        return None

# Example header with different 'db' values
# header_example = '>sp|Q01740|FMO1_HUMAN Flavin-containing monooxygenase 1 OS=Homo sapiens OX=9606 GN=FMO1 PE=1 SV=3'
header_example = '>sp|Q01740|FMO1_HUMAN Flavin-containing monooxygenase 1 OS=Homo sapiens OX=9606 PE=1 SV=3'
# For another 'db' value
# header_example = ">db2|789012|AnotherEntry AnotherProtein OS=Mouse OX=10090 PE=2 SV=2"

result = extract_header_information(header_example)
if result:
    print("Extracted Information:")
    for key, value in result.items():
        print(f"{key}: {value}")
else:
    print("Header format doesn't match the pattern.")


def parse_uniprot_result (uniprot_result):
    header = uniprot_result.split('\n')[0]
    sequence = ''.join(uniprot_result.split('\n')[1:])
    info = extract_header_information(header)
    if info is not None:
        info['sequence'] = sequence
    else:
        print (header)
        print (sequence)
    return info

def parse_uniprot_batch (uniprot_batch):
    result_ls = []
    for uniprot_result in uniprot_batch.split('\n>'):
        if uniprot_result is not None:
            parsed = parse_uniprot_result(uniprot_result)
            if parsed is not None:
                result_ls.append(parsed)
    
    return result_ls
# def parse_batched_uniprot_results (batch_results):

Extracted Information:
db: sp
UniqueIdentifier: Q01740
EntryName: FMO1_HUMAN
ProteinName: Flavin-containing monooxygenase 1
OrganismName: Homo sapiens
OrganismIdentifier: 9606
GeneName: None
ProteinExistence: 1
SequenceVersion: 3


In [67]:
### Load uniprot query results
with open('uniprot_chkpoint.txt', 'r') as f:
    uniprot_returned = f.readlines()

In [68]:
# parsed_uniprot = []
# for batch in uniprot_returned:
#     parsed_uniprot.extend(parse_uniprot_batch(batch))
parsed_uniprot = parse_uniprot_batch('\n'.join(uniprot_returned))

In [71]:
len(parsed_uniprot)

555195

In [72]:
parsed_uniprot[167838]

{'db': 'sp',
 'UniqueIdentifier': 'A8FKH6',
 'EntryName': 'SYS_CAMJ8',
 'ProteinName': 'Serine--tRNA ligase',
 'OrganismName': 'Campylobacter jejuni subsp. jejuni serotype O:6 (strain 81116 / NCTC 11828)',
 'OrganismIdentifier': '407148 GN=serS',
 'GeneName': None,
 'ProteinExistence': '3',
 'SequenceVersion': '1',
 'sequence': 'MLDLKNLQNNFDEVAKKLKNKKVDENILKKLAELFASLKKEKTALEEFQAFQNKFSKELATAEDKESLKAKLSENKSKINEQSVKVNALENELEEIAHAIPNIPDECVPVGEDEDENVELKKVLNPPSFDFTPKEHFELGESLNWLDFVRGVKISQSRFCVLKNEGALLSRALVNYMIDFNRSHGFEFVNVPFLVNGATMFGTGQLPKFKEDMYKVDDEDLYLISTSEIPVTNLYSGEILASETLPIKMTCYSACFRKEAGSAGRDTRGIIRQHQFEKVELVSITKPEQSDSVFNEMLECASDLLSSLGLAHRHLMLCTGDLGFSAAKTVDLEVWLPGQNKYREISSVSNCRDFQARRAKIRYKNEQGKNELVHTLNGSSLAVGRTLVAIMENYQDKEGKIHIPDALKKYF'}

In [48]:
#### SAVED JUST IN CASE
# with open('uniprot_chkpoint.txt', 'w') as f:
#     for line in returned_uniprot:
#         f.write(line)

In [49]:
def parse_metacyc_proteins(file_path):
    # Define the regular expression pattern for extracting entries
    results = []
    entry_pattern = re.compile(r'\((.*?)\)')

    # Open the file and read its content
    with open(file_path, 'r') as file:
        file_content = file.read()

    # Find all matches of the entry pattern in the file content
    entries = re.findall(r'\((.*?)\)', file_content, re.DOTALL)

    # Process each entry
    for entry in entries:
        # Split the entry into its components
        components = entry.split()

        # Extract R_ID and EC_ID
        r_id, ec_id = [c.strip('(') for c in components[:2]]

        # Extract PIDs
        pids = components[2:]

        # Remove quotes from PIDs
        pids = [pid.strip('"') for pid in pids]

        # Print or process the extracted information as needed
        
        for pid in pids:
            if ':' in pid:
                results.append({'REACTION_ID':r_id, 'EC':ec_id, 'SEQUENCE_ID':pid.split(':')[1], 'SEQUENCE_DB':pid.split(':')[0]})
            else:
                print (pid)
        
    return results
        

# Replace 'your_file.txt' with the actual path to your text file
metacyc_proteins_info = parse_metacyc_proteins(METACYC_PROTEINS_PATH)

AAL79181


In [307]:
for res in parsed_uniprot:
    pid_to_sequence[res['UniqueIdentifier']] = res['sequence']

In [76]:
cleaned_gid_dict = {}

for k,v in gid_dict.copy().items():
    if 'Error' in v or (v==''):
        pass
    elif '|' in v and len(v.split('|')[1]) > 0:
        cleaned_gid_dict[k] = gid_dict[k].split('|')[1].split('.')[0]
    else:
        cleaned_gid_dict[k] = gid_dict[k].split('.')[0]   

In [77]:
no_sequence = []
for entry in metacyc_proteins_info:
    clean_id = entry['SEQUENCE_ID'].split('.')[0]
    if clean_id in pid_to_sequence.keys():
        entry['sequence'] = pid_to_sequence[clean_id]
    elif clean_id in gid_dict.keys():
        try:
            entry['sequence'] = pid_to_sequence[cleaned_gid_dict[clean_id]]
        except:
            print ('ERRRORORR', entry)
            no_sequence.append(entry)
    else:
        print (entry)
        no_sequence.append(entry)

{'REACTION_ID': 'RXN-21004', 'EC': '4.1.2.61', 'SEQUENCE_ID': 'Q58VS8', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-16864', 'EC': 'NIL', 'SEQUENCE_ID': 'G8C421', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': '1.5.3.2', 'SEQUENCE_ID': 'A0A7Y8P8L3', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': '1.5.3.2', 'SEQUENCE_ID': 'A0A5Y1QQ15', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': '1.5.3.2', 'SEQUENCE_ID': 'A0A897GZR9', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': '1.5.3.2', 'SEQUENCE_ID': 'A0A844LU72', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': '1.5.3.2', 'SEQUENCE_ID': 'A0A8F7MTK1', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': '1.5.3.2', 'SEQUENCE_ID': 'A0A8F2C1B0', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'N-METHYL-L-AMINO-ACID-OXIDASE-RXN', 'EC': 

{'REACTION_ID': 'RXN-9660', 'EC': '1.3.1.9', 'SEQUENCE_ID': 'B1P0R8', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-23604', 'EC': '2.3.1.80', 'SEQUENCE_ID': 'A0A287A7R1', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-17996', 'EC': '1.14.19.49', 'SEQUENCE_ID': 'S4S3E1', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN1G-395', 'EC': '1.3.1.9', 'SEQUENCE_ID': 'B1P0R8', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN3O-4042', 'EC': '1.14.18.6', 'SEQUENCE_ID': 'O75213', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-21672', 'EC': '1.3.1.9', 'SEQUENCE_ID': 'B1P0R8', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-9663', 'EC': '1.3.1.9', 'SEQUENCE_ID': 'B1P0R8', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'SPHINGOMYELIN-PHOSPHODIESTERASE-RXN', 'EC': '3.1.4.12', 'SEQUENCE_ID': 'A2A5N7', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-13509', 'EC': '1.14.14.126', 'SEQUENCE_ID': 'F1T283', 'SEQUENCE_DB': 'UNIPROT'}
{'REACTION_ID': 'RXN-13509', 'EC': '1.14.14.126', 'SEQUENCE_ID': 'F1T282', 'SEQUENCE_DB

In [78]:
print (len(no_sequence))

2005


## Improve querying for uniprot sequences

In [79]:
metacyc_proteins_df = pd.DataFrame(metacyc_proteins_info)

In [81]:
no_sequence_in_df = set(metacyc_proteins_df.drop_duplicates(subset='REACTION_ID')['REACTION_ID'].tolist()) - set(metacyc_proteins_df.dropna(subset=['sequence']).drop_duplicates(subset='REACTION_ID')['REACTION_ID'].tolist())
no_sequence_in_df

{'2.7.99.1-RXN',
 'CYSTEINE-LYASE-RXN',
 'LINC-RXN',
 'LIND-RXN',
 'RIBOFLAVINASE-RXN',
 'RXN-10680',
 'RXN-10890',
 'RXN-10894',
 'RXN-11199',
 'RXN-15228',
 'RXN-15229',
 'RXN-15230',
 'RXN-16855',
 'RXN-16856',
 'RXN-16859',
 'RXN-16860',
 'RXN-16861',
 'RXN-16864',
 'RXN-16867',
 'RXN-17193',
 'RXN-17515',
 'RXN-17982',
 'RXN-17986',
 'RXN-18036',
 'RXN-18504',
 'RXN-18505',
 'RXN-19546',
 'RXN-19919',
 'RXN-19920',
 'RXN-20921',
 'RXN-21501',
 'RXN-21729',
 'RXN-22496',
 'RXN-22539',
 'RXN-23115',
 'RXN-23116',
 'RXN-23117',
 'RXN-23263',
 'RXN-23384',
 'RXN-8248',
 'RXN-9278',
 'RXN1G01-67'}

In [84]:
failed_to_get_sequence = metacyc_proteins_df[metacyc_proteins_df['REACTION_ID'].map(lambda x : x in no_sequence_in_df)]['REACTION_ID'].tolist()
metacyc_proteins_df[metacyc_proteins_df['REACTION_ID'].map(lambda x : x in no_sequence_in_df)]

Unnamed: 0,REACTION_ID,EC,SEQUENCE_ID,SEQUENCE_DB,sequence
1055,RXN-16864,NIL,G8C421,UNIPROT,
6951,RXN-15228,NIL,B5APQ9,UNIPROT,
30146,RXN-16867,NIL,G8C421,UNIPROT,
46254,RXN-17982,NIL,L8EUS1,UNIPROT,
71551,RXN-16860,NIL,G8C421,UNIPROT,
82749,RXN-21729,3.1.7.13,R4HEK6,UNIPROT,
90483,RIBOFLAVINASE-RXN,3.5.99.1,A4WEW0,UNIPROT,
101889,RXN-23263,1.1.3.50,LC387598.1,PID,
110989,RXN-19919,NIL,P75960-1,UNIPROT,
126979,RXN-18505,NIL,E7BBN5,UNIPROT,


In [357]:
# metacyc_proteins_df.to_csv('10Nov2023_metacyc_reactions_with_sequences.csv', index=False)

In [92]:
ids_to_requery = metacyc_proteins_df[metacyc_proteins_df['REACTION_ID'].map(lambda x : x in failed_to_get_sequence)]['SEQUENCE_ID'].tolist()

In [93]:
ids_to_requery

['G8C421',
 'B5APQ9',
 'G8C421',
 'L8EUS1',
 'G8C421',
 'R4HEK6',
 'A4WEW0',
 'LC387598.1',
 'P75960-1',
 'E7BBN5',
 'B5APQ9',
 'E1BYH8',
 'A0A3E2EIB6',
 'Q9U7N7',
 'Q2V3S9',
 'Q9ZN84',
 'L8EW04',
 'G8C419',
 'Q4JFF2',
 'A0A411MRB2',
 'A0A452CSY6',
 'G8C419',
 'E7BBN5',
 'A0A2W5DAZ9',
 'P95806',
 'A0A657M0P1',
 'G8C421',
 'A0A5C7KHP9',
 'A0A3E2ERT8',
 'Q7WST1',
 'L7RFF8',
 'B5APQ9',
 'A0A033V6J6',
 'A0A510WIR7',
 'A0A2R2JFI0',
 'Q81RQ9',
 'P50197',
 'Q38Q86',
 'G8C421',
 'A0A3E2ERT8',
 'D3QY17',
 'P75960-1']

In [94]:
requeried_uniprot = []
for u_id in tqdm(ids_to_requery):
    res = requests.get(f"https://rest.uniprot.org/uniprotkb/search?query={u_id}").text
    if "mergeDemergeTo" in res:
        new_id = re.findall('(?<=mergeDemergeTo":\[").+?(?="\])', res)[0]
        res = requests.get(f"https://rest.uniprot.org/uniprotkb/search?query={new_id}&format=fasta").text
    elif '"sequence":{"value":"' in res:
        res = requests.get(f"https://rest.uniprot.org/uniprotkb/search?query={u_id}&format=fasta").text
    requeried_uniprot.append(res)
    time.sleep(0.5)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:54<00:00,  1.29s/it]


In [98]:
parsed_requeried_uniprot = [parse_uniprot_result(r) for r in requeried_uniprot]

Failed parsing
{"results":[{"entryType":"Inactive","primaryAccession":"L8EUS1","uniProtkbId":"L8EUS1_STRR1","annotationScore":0.0,"inactiveReason":{"inactiveReasonType":"DELETED"}}]}

Failed parsing
{"results":[{"entryType":"Inactive","primaryAccession":"A4WEW0","uniProtkbId":"A4WEW0_ENT38","annotationScore":0.0,"inactiveReason":{"inactiveReasonType":"DELETED"}}]}

Failed parsing
{"results":[]}

Failed parsing
>sp|P75960-1|NPD_ECOLI Isoform CobB-Long of NAD-dependent protein deacylase OS=Escherichia coli (strain K12) OX=83333 GN=cobB
MLSRRGHRLSRFRKNKRRLRERLRQRIFFRDKVVPEAMEKPRVLVLTGAGISAESGIRTFRAADGLWEEHRVEDVATPEGFDRDPELVQAFYNARRRQLQQPEIQPNAAHLALAKLQDALGDRFLLVTQNIDNLHERAGNTNVIHMHGELLKVRCSQSGQVLDWTGDVTPEDKCHCCQFPAPLRPHVVWFGEMPLGMDEIYMALSMADIFIAIGTSGHVYPAAGFVHEAKLHGAHTVELNLEPSQVGNEFAEKYYGPASQVVPEFVEKLLKGLKAGSIA
Failed parsing
{"results":[{"entryType":"Inactive","primaryAccession":"E1BYH8","uniProtkbId":"E1BYH8_CHICK","annotationScore":0.0,"inactiveReason":{"inactiveReasonType":"DELETED"}}

In [99]:
updated_pid_to_sequence = pid_to_sequence.copy()
for idx, entry in zip(ids_to_requery, parsed_requeried_uniprot):
    if entry is not None:
        updated_pid_to_sequence[idx] = entry['sequence']

In [100]:
for idx in metacyc_proteins_df[metacyc_proteins_df['sequence'].isna()].index:
    if metacyc_proteins_df.loc[idx, 'SEQUENCE_ID'] in updated_pid_to_sequence:
        metacyc_proteins_df.loc[idx, 'sequence'] = updated_pid_to_sequence[metacyc_proteins_df.loc[idx, 'SEQUENCE_ID']]

# metacyc_proteins_df.dropna(subset=['sequence'])

In [103]:
new_no_sequence_in_df = set(metacyc_proteins_df.drop_duplicates(subset='REACTION_ID')['REACTION_ID'].tolist()) - set(metacyc_proteins_df.dropna(subset=['sequence']).drop_duplicates(subset='REACTION_ID')['REACTION_ID'].tolist())


In [104]:
new_no_sequence_in_df

{'CYSTEINE-LYASE-RXN',
 'LINC-RXN',
 'LIND-RXN',
 'RIBOFLAVINASE-RXN',
 'RXN-17515',
 'RXN-17982',
 'RXN-17986',
 'RXN-18036',
 'RXN-19546',
 'RXN-19919',
 'RXN-19920',
 'RXN-22496',
 'RXN-23115',
 'RXN-23116',
 'RXN-23117',
 'RXN-23263',
 'RXN-23384',
 'RXN-9278',
 'RXN1G01-67'}

In [105]:
metacyc_proteins_df.dropna(subset=['sequence']).drop_duplicates(subset=['REACTION_ID'])

Unnamed: 0,REACTION_ID,EC,SEQUENCE_ID,SEQUENCE_DB,sequence
0,RXN-16360,1.14.19.72,AGC29953.1,PID,MEMEMSVLAMSSTLILALAMALIFLFKAKSSSAIKWPPGPKTLPII...
4,CYTIDYLATE-CYCLASE-RXN,4.6.1.6,P0DV38,UNIPROT,MVNHIRIFDNLFQSNISKFQNLTSKSYIIRNDNEKNSYLPMVQEIR...
7,RXN-9796,1.1.1.213,P23457,UNIPROT,MDSISLRVALNDGNFIPVLGFGTTVPEKVAKDEVIKATKIAIDNGF...
8,RXN-21494,1.17.99.11,A0A127F6U7,UNIPROT,MVHREPRLEDIPASSLRFLGKDVQRIDDVALVSGSVEFIDNISVPG...
14,RXN-9232,1.14.13.240,Q3J6F1,UNIPROT,MTQAADILIAGGGLNGPALALALARNGFSVTVVDGRPAPQRAEPGF...
...,...,...,...,...,...
573319,RXN-19533,NIL,Q2T5S0,UNIPROT,MSAAEPHYIDAQRAIAPVDAPLAAPHEYAAVLRSDFVSSYHDGRDV...
573320,RXN-22033,NIL,M7RFT5,UNIPROT,MLTTSLTLNKEKWKPIWNKALVFLFVATYFLDGITRYKHLIIILMV...
573321,RXN-13291,1.3.1.97,G0Y288,UNIPROT,MKLREVLQHPGEIIPLLQMMVMAYRRKRKPQDPNLAWCWETLIKVS...
573322,RXN-19036,1.8.5.8,AAH16836,PID,MVPLVAVVSGPRAQLFACLLRLGTQQVGPLQLHTGASHAARNHYEV...


In [106]:
metacyc_proteins_df.to_csv('21Nov2023_metacyc_reactions_with_sequences.csv', index=False)

In [107]:
import json
with open ('protein_id_to_sequence.json', 'w') as f:
    json.dump(updated_pid_to_sequence, f)

In [108]:
with open('protein_id_to_sequence.json','r') as f:
    test_dict = json.load(f)

In [109]:
test_dict['AAH16836']

'MVPLVAVVSGPRAQLFACLLRLGTQQVGPLQLHTGASHAARNHYEVLVLGGGSGGITMAARMKRKVGAENVAIVEPSERHFYQPIWTLVGAGAKQLSSSGRPTASVIPSGVEWIKARVTELNPDKNCIHTDDDEKISYRYLIIALGIQLDYEKIKGLPEGFAHPKIGSNYSVKTVEKTWKALQDFKEGNAIFTFPNTPVKCAGAPQKIMYLSEAYFRKTGKRSKANIIFNTSLGAIFGVKKYADALQEIIQERNLTVNYKKNLIEVRADKQEAVFENLDKPGETQVISYEMLHVTPPMSPPDVLKTSPVADAAGWVDVDKETLQHRRYPNVFGIGDCTNLPTSKTAAAVAAQSGILDRTISVIMKNQTPTKKYDGYTSCPLVTGYNRVILAEFDYKAEPLETFPFDQSKERLSMYLMKADLMPFLYWNMMLRGYWGGPAFLRKLFHLGMS'

Unnamed: 0,REACTION_ID,EC,SEQUENCE_ID,SEQUENCE_DB,sequence
0,RXN-16360,1.14.19.72,AGC29953.1,PID,MEMEMSVLAMSSTLILALAMALIFLFKAKSSSAIKWPPGPKTLPII...
4,CYTIDYLATE-CYCLASE-RXN,4.6.1.6,P0DV38,UNIPROT,MVNHIRIFDNLFQSNISKFQNLTSKSYIIRNDNEKNSYLPMVQEIR...
7,RXN-9796,1.1.1.213,P23457,UNIPROT,MDSISLRVALNDGNFIPVLGFGTTVPEKVAKDEVIKATKIAIDNGF...
8,RXN-21494,1.17.99.11,A0A127F6U7,UNIPROT,MVHREPRLEDIPASSLRFLGKDVQRIDDVALVSGSVEFIDNISVPG...
14,RXN-9232,1.14.13.240,Q3J6F1,UNIPROT,MTQAADILIAGGGLNGPALALALARNGFSVTVVDGRPAPQRAEPGF...
...,...,...,...,...,...
573319,RXN-19533,NIL,Q2T5S0,UNIPROT,MSAAEPHYIDAQRAIAPVDAPLAAPHEYAAVLRSDFVSSYHDGRDV...
573320,RXN-22033,NIL,M7RFT5,UNIPROT,MLTTSLTLNKEKWKPIWNKALVFLFVATYFLDGITRYKHLIIILMV...
573321,RXN-13291,1.3.1.97,G0Y288,UNIPROT,MKLREVLQHPGEIIPLLQMMVMAYRRKRKPQDPNLAWCWETLIKVS...
573322,RXN-19036,1.8.5.8,AAH16836,PID,MVPLVAVVSGPRAQLFACLLRLGTQQVGPLQLHTGASHAARNHYEV...


In [124]:
metacyc_reactions_df = pd.read_csv('metacyc/08Mar2023_metacyc_reaction_smiles_no_cofs.tsv', sep='\t')

In [131]:
metacyc_reactions_df = metacyc_reactions_df.merge(metacyc_proteins_df.dropna(subset=['sequence']).drop_duplicates('REACTION_ID'), 
                           left_on='UNIQUE-ID', right_on='REACTION_ID', how='left')

In [166]:
metacyc_reactions_df.to_csv('metacyc/21Nov2023_metacyc_reaction_smiles_no_cofs_with_sequences.tsv', sep='\t', index=False)

## Parse queries from Rhea

In [137]:
with open('uniprot_chkpoint_for_rhea.txt','r') as f:
    uniprot_rhea_returned = f.readlines()

In [138]:
parsed_uniprot_rhea = parse_uniprot_batch('\n'.join(uniprot_rhea_returned))

In [139]:
rhea_proteins_df = pd.read_csv('rhea/rhea2uniprot_sprot.tsv', sep='\t')

In [140]:
metacyc_proteins_df # pd.DataFrame(metacyc_proteins_info)

Unnamed: 0,REACTION_ID,EC,SEQUENCE_ID,SEQUENCE_DB,sequence
0,RXN-16360,1.14.19.72,AGC29953.1,PID,MEMEMSVLAMSSTLILALAMALIFLFKAKSSSAIKWPPGPKTLPII...
1,RXN-16360,1.14.19.72,AGC29954.1,PID,MEMETSVLGLSSTLIIALAITVIFLLKAKSSSAIKWPPGPKTLPII...
2,RXN-16360,1.14.19.72,L7T720,UNIPROT,MEMETSVLGLSSTLIIALAITVIFLLKAKSSSAIKWPPGPKTLPII...
3,RXN-16360,1.14.19.72,L7T8H2,UNIPROT,MEMEMSVLAMSSTLILALAMALIFLFKAKSSSAIKWPPGPKTLPII...
4,CYTIDYLATE-CYCLASE-RXN,4.6.1.6,P0DV38,UNIPROT,MVNHIRIFDNLFQSNISKFQNLTSKSYIIRNDNEKNSYLPMVQEIR...
...,...,...,...,...,...
573325,RXN-19036,1.8.5.8,AAD41160,PID,MVPLVAVVSGPRAQLFACLLRLGTQQVGPLQLHTGASHAARNHYEV...
573326,RXN-19036,1.8.5.8,Q9R112,UNIPROT,MAPLVTVVSSPRARLFACFLRLGTQQAGPLQLHTGACCTAKNHYEV...
573327,RXN-19036,1.8.5.8,Q9Y6N5,UNIPROT,MVPLVAVVSGPRAQLFACLLRLGTQQVGPLQLHTGASHAARNHYEV...
573328,RXN-19036,1.8.5.8,Q54DK1,UNIPROT,MFKSIMYALAVAPAVTSSSDKLPNGVVSASQLGSEKEKRKLKNVTK...


In [141]:
for entry in parsed_uniprot_rhea:
    updated_pid_to_sequence[entry['UniqueIdentifier']] = entry['sequence']

In [142]:
rhea_proteins_df['sequence'] = [updated_pid_to_sequence[idx] for idx in rhea_proteins_df['ID']]

In [165]:
rhea_proteins_df.to_csv('12Nov2023_rhea_proteins_with_sequences.csv', index=False)

In [146]:
rhea_reactions = pd.read_csv('rhea/08Mar2023_rhea_reaction_smiles_no_cofs.csv', sep='\t')

In [154]:
rid_to_seq = pd.Series(rhea_proteins_df['sequence'].values, index=rhea_proteins_df['RHEA_ID'].values).to_dict()

In [157]:
idx2seq = {}
for idx in rhea_reactions.index:
    rhea_ids = [int(x) for x in re.findall('(?<=RHEA:)[0-9]+', rhea_reactions.loc[idx,'ID'])]
    for rid in rhea_ids:
        if rid in rid_to_seq.keys():
            idx2seq[idx] = rid_to_seq[rid]
        

In [158]:
len(idx2seq)

9157

In [159]:
rhea_reactions['sequence'] = [idx2seq[idx] if idx in idx2seq.keys() else None for idx in rhea_reactions.index]

In [164]:
rhea_reactions[rhea_reactions['sequence'].isna()]

Unnamed: 0,ID,reaction_string,reaction_chebi,reaction_smiles,reaction_smiles_no_cofs,sequence
3,['RHEA:15875'],[(1->4)-alpha-D-glucosyl](n+1) + a ribonucleos...,CHEBI:15444 + CHEBI:57930 + CHEBI:15378 => CHE...,,*[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])[...,
5,['RHEA:17887'],acetylene + H2O => acetaldehyde,CHEBI:27518 + CHEBI:15377 => CHEBI:15343,C#C.O>>CC=O,C#C.O>>CC=O,
6,['RHEA:31261'],3-butynoate + 2 H(+) + NADH => but-3-ynal + H2...,CHEBI:62211 + 2 CHEBI:15378 + CHEBI:57945 => C...,C#CCC(=O)[O-].[H+].[H+].NC(=O)C1=CN([C@@H]2O[C...,C#CCC(=O)[O-].[H+].[H+].NC(=O)C1=CN([C@@H]2O[C...,
7,"['RHEA:31259', 'RHEA:31260', 'RHEA:31262']",but-3-ynal + H2O + NAD(+) <=> 3-butynoate + 2 ...,CHEBI:28180 + CHEBI:15377 + CHEBI:57540 <=> CH...,C#CCC=O.O.NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O...,C#CCC=O.O.NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O...,
8,"['RHEA:31263', 'RHEA:31264', 'RHEA:31266']",but-3-ynal + H(+) + pyrroloquinoline quinol <=...,CHEBI:28180 + CHEBI:15378 + CHEBI:77660 <=> CH...,C#CCC=O.[H+].O=C([O-])c1cc(C(=O)[O-])c2c(n1)c(...,C#CCC=O.[H+].O=C([O-])c1cc(C(=O)[O-])c2c(n1)c(...,
...,...,...,...,...,...,...
22604,"['RHEA:49964', 'RHEA:49965', 'RHEA:49967']","5,10-dihydrophenazine + O2 <=> H2O2 + phenazine",CHEBI:132008 + CHEBI:15379 <=> CHEBI:16240 + C...,c1ccc2c(c1)Nc1ccccc1N2.O=O>>OO.c1ccc2nc3ccccc3...,c1ccc2c(c1)Nc1ccccc1N2.O=O>>OO.c1ccc2nc3ccccc3...,
22605,"['RHEA:42472', 'RHEA:42473', 'RHEA:42475']","dibenzo-p-dioxin + NADH + O2 <=> 2,2',3-trihyd...",CHEBI:28891 + CHEBI:57945 + CHEBI:15379 <=> CH...,c1ccc2c(c1)Oc1ccccc1O2.NC(=O)C1=CN([C@@H]2O[C@...,c1ccc2c(c1)Oc1ccccc1O2.NC(=O)C1=CN([C@@H]2O[C@...,
22608,"['RHEA:42460', 'RHEA:42461', 'RHEA:42463']",dibenzofuran + H(+) + NADH + O2 <=> biphenyl-2...,CHEBI:28145 + CHEBI:15378 + CHEBI:57945 + CHEB...,c1ccc2c(c1)oc1ccccc12.[H+].NC(=O)C1=CN([C@@H]2...,c1ccc2c(c1)oc1ccccc12.[H+].NC(=O)C1=CN([C@@H]2...,
22612,"['RHEA:16893', 'RHEA:16894', 'RHEA:16896']",pyridine + S-adenosyl-L-methionine <=> N-methy...,CHEBI:16227 + CHEBI:59789 <=> CHEBI:15761 + CH...,c1ccncc1.C[S+](CC[C@H]([NH3+])C(=O)[O-])C[C@H]...,c1ccncc1.C[S+](CC[C@H]([NH3+])C(=O)[O-])C[C@H]...,


In [162]:
rhea_reactions

Unnamed: 0,ID,reaction_string,reaction_chebi,reaction_smiles,reaction_smiles_no_cofs,sequence
0,"['RHEA:10068', 'RHEA:10069', 'RHEA:10070', 'RH...","(+)-sesamin + (6S)-5,6,7,8-tetrahydrofolyl-(ga...",2 CHEBI:13193 + CHEBI:60530 + CHEBI:15377 <=> ...,,,MPALACLRRLCRHLSPQAVLFLLFVFCLFSVFVSAYYLYGWNRGLE...
1,"['RHEA:17377', 'RHEA:17378', 'RHEA:17380']",a dolichyl beta-D-mannosyl phosphate + L-seryl...,CHEBI:58211 + CHEBI:29999 <=> CHEBI:137321 + C...,,*N[C@@H](CO)C(*)=O>>*N[C@@H](CO[C@H]1O[C@H](CO...,MSPHGDGRGQAQGRAVRVGLRRSGGIRGGVAVFAAVAAVFTFTLPP...
2,"['RHEA:53396', 'RHEA:53397', 'RHEA:53399']",a dolichyl beta-D-mannosyl phosphate + L-threo...,CHEBI:58211 + CHEBI:30013 <=> CHEBI:137323 + C...,,*N[C@H](C(*)=O)[C@@H](C)O>>*N[C@H](C(*)=O)[C@@...,MSPHGDGRGQAQGRAVRVGLRRSGGIRGGVAVFAAVAAVFTFTLPP...
3,['RHEA:15875'],[(1->4)-alpha-D-glucosyl](n+1) + a ribonucleos...,CHEBI:15444 + CHEBI:57930 + CHEBI:15378 => CHE...,,*[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])[...,
4,"['RHEA:53020', 'RHEA:53021', 'RHEA:53023']",a plastoquinone + an L-alpha-amino acid + H2O ...,CHEBI:17757 + CHEBI:59869 + CHEBI:15377 <=> CH...,,*[C@H]([NH3+])C(=O)[O-].O>>[1*]C(=O)C(=O)[O-]....,MVIRSGKTNLNPPCALMAPSSSCDCIIVGSGLSGLIAARNLSRVNY...
...,...,...,...,...,...,...
22614,['RHEA:71001'],(S)-nornicotine + formaldehyde + H(+) + H2O + ...,CHEBI:190184 + CHEBI:16842 + CHEBI:15378 + CHE...,c1cncc([C@@H]2CCC[NH2+]2)c1.C=O.[H+].O.Cc1cc2n...,c1cncc([C@@H]2CCC[NH2+]2)c1.C=O.[H+].O.Cc1cc2n...,
22615,['RHEA:17871'],"6,7-dihydropteridine + H(+) + NADH => 5,6,7,8-...",CHEBI:30156 + CHEBI:15378 + CHEBI:57945 => CHE...,c1ncc2c(n1)=NCCN=2.[H+].NC(=O)C1=CN([C@@H]2O[C...,c1ncc2c(n1)=NCCN=2.[H+].NC(=O)C1=CN([C@@H]2O[C...,MAAAAAGEARRVLVYGGRGALGSRCVQAFRARNWWVASIDVVENEE...
22616,['RHEA:17867'],"6,7-dihydropteridine + H(+) + NADPH => 5,6,7,8...",CHEBI:30156 + CHEBI:15378 + CHEBI:57783 => CHE...,c1ncc2c(n1)=NCCN=2.[H+].NC(=O)C1=CN([C@@H]2O[C...,c1ncc2c(n1)=NCCN=2.[H+].NC(=O)C1=CN([C@@H]2O[C...,MAAAAAGEARRVLVYGGRGALGSRCVQAFRARNWWVASIDVVENEE...
22617,"['RHEA:17869', 'RHEA:17870', 'RHEA:17872']","5,6,7,8-tetrahydropteridine + NAD(+) <=> 6,7-d...",CHEBI:28889 + CHEBI:57540 <=> CHEBI:30156 + CH...,c1ncc2c(n1)NCCN2.NC(=O)c1ccc[n+]([C@@H]2O[C@H]...,c1ncc2c(n1)NCCN2.NC(=O)c1ccc[n+]([C@@H]2O[C@H]...,MAAAAAGEARRVLVYGGRGALGSRCVQAFRARNWWVASIDVVENEE...


In [167]:
rhea_reactions.to_csv('rhea/21Nov2023_rhea_reaction_smiles_no_cofs_with_sequences.csv', index=False, sep='\t')