# Description

We have downloaded all 234,773 CAS IDs from the Mole DB as text files. We will parse the KEGG database using the [Biopython-enabled API](http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc271) to extract all pathways, enzymes, drugs, and metabolites.

In [1]:
from Bio.KEGG import REST
from Bio.KEGG import Enzyme
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd

In [26]:
# Download the complete KEGG pathway listing as one raw text string.
pathways = REST.kegg_list("pathway").read()

In [27]:
# Split the raw listing into one string per pathway line.
# NOTE: this rebinds `pathways` from a string to a list, so re-running this cell
# without first re-running the download cell fails (a list has no .rstrip()).
pathways = pathways.rstrip().split("\n")

In [29]:
# Get metabolic pathways only: keep the [entry, description] pairs whose numeric
# map ID (the entry with its "path:map" prefix removed) is below 1500.
metabolic_pathways = [
    [entry, description]
    for entry, description in (record.split('\t') for record in pathways)
    if int(entry.replace("path:map", '')) < 1500
]

In [30]:
# Display the selected [entry, description] pairs.
metabolic_pathways

[['path:map00010', 'Glycolysis / Gluconeogenesis'],
 ['path:map00020', 'Citrate cycle (TCA cycle)'],
 ['path:map00030', 'Pentose phosphate pathway'],
 ['path:map00040', 'Pentose and glucuronate interconversions'],
 ['path:map00051', 'Fructose and mannose metabolism'],
 ['path:map00052', 'Galactose metabolism'],
 ['path:map00053', 'Ascorbate and aldarate metabolism'],
 ['path:map00061', 'Fatty acid biosynthesis'],
 ['path:map00062', 'Fatty acid elongation'],
 ['path:map00071', 'Fatty acid degradation'],
 ['path:map00072', 'Synthesis and degradation of ketone bodies'],
 ['path:map00073', 'Cutin, suberine and wax biosynthesis'],
 ['path:map00100', 'Steroid biosynthesis'],
 ['path:map00120', 'Primary bile acid biosynthesis'],
 ['path:map00121', 'Secondary bile acid biosynthesis'],
 ['path:map00130', 'Ubiquinone and other terpenoid-quinone biosynthesis'],
 ['path:map00140', 'Steroid hormone biosynthesis'],
 ['path:map00190', 'Oxidative phosphorylation'],
 ['path:map00195', 'Photosynthesis']

In [33]:
# Fetch the flat-file record of a single pathway and show it line by line.
REST.kegg_get('path:map00010').read().split('\n')

['ENTRY       map00010                    Pathway',
 'NAME        Glycolysis / Gluconeogenesis',
 'DESCRIPTION Glycolysis is the process of converting glucose into pyruvate and generating small amounts of ATP (energy) and NADH (reducing power). It is a central pathway that produces important precursor metabolites: six-carbon compounds of glucose-6P and fructose-6P and three-carbon compounds of glycerone-P, glyceraldehyde-3P, glycerate-3P, phosphoenolpyruvate, and pyruvate [MD:M00001]. Acetyl-CoA, another important precursor metabolite, is produced by oxidative decarboxylation of pyruvate [MD:M00307]. When the enzyme genes of this pathway are examined in completely sequenced genomes, the reaction steps of three-carbon compounds from glycerone-P to pyruvate form a conserved core module [MD:M00002], which is found in almost all organisms and which sometimes contains operon structures in bacterial genomes. Gluconeogenesis is a synthesis pathway of glucose from noncarbohydrate precursors. I

# Extract all compound CAS and PubChem numbers

In [2]:
# Download the complete KEGG compound listing as one raw text string.
compounds = REST.kegg_list('compound').read()

In [39]:
# Peek at the first 100 characters of the raw compound listing.
compounds[0:100]

"cpd:C00001\tH2O; Water\ncpd:C00002\tATP; Adenosine 5'-triphosphate\ncpd:C00003\tNAD+; NAD; Nicotinamide a"

In [3]:
# Collect the KEGG ID (first of the two tab-separated fields) of every compound
# in the listing.
kegg_compound_ids = [
    entry
    for entry, _description in (row.split('\t') for row in compounds.rstrip().split('\n'))
]

In [4]:
# Fetch the record of the first compound and display its raw text.
c = kegg_compound_ids[0]
test = REST.kegg_get(c).read()
test

'ENTRY       C00001                      Compound\nNAME        H2O;\n            Water\nFORMULA     H2O\nEXACT_MASS  18.0106\nMOL_WEIGHT  18.0153\nREMARK      Same as: D00001\nREACTION    R00001 R00002 R00004 R00005 R00009 R00010 R00011 R00017 \n            R00022 R00024 R00025 R00026 R00028 R00036 R00041 R00044 \n            R00045 R00047 R00048 R00052 R00053 R00054 R00055 R00056 \n            R00058 R00059 R00060 R00061 R00068 R00069 R00070 R00072 \n            R00073 R00074 R00077 R00078 R00080 R00081 R00082 R00083 \n            R00084 R00085 R00086 R00087 R00088 R00090 R00097 R00102 \n            R00103 R00107 R00111 R00113 R00118 R00122 R00123 R00125 \n            R00128 R00131 R00132 R00135 R00138 R00143 R00145 R00146 \n            R00148 R00149 R00152 R00155 R00159 R00160 R00164 R00173 \n            R00175 R00177 R00181 R00182 R00183 R00184 R00187 R00188 \n            R00191 R00192 R00193 R00194 R00195 R00199 R00203 R00205 \n            R00208 R00213 R00218 R00227 R00243 R00245 

In [4]:
def _dblink_value(record_text, label):
    """Return the text that follows ``'<label>: '`` on its line, or '' if absent.

    record_text -- raw text of one KEGG record.
    label       -- database label without the colon, e.g. 'PubChem'.

    The search is anchored on ``label + ':'`` (the same form the CAS and drug
    cells of this notebook use) so that a bare mention of the label elsewhere
    in the record is not mistaken for the cross-reference line.
    """
    try:
        start = record_text.index(label + ':')
    except ValueError:
        # str.index raises ValueError when the record has no such entry.
        return ''
    return record_text[start:].split('\n')[0].replace(label + ': ', '')


# Collect the PubChem, ChEBI and ChEMBL cross-references of every KEGG compound.
# The three lists stay aligned with kegg_compound_ids; '' marks a missing ID.
# NOTE: this issues one KEGG request per compound.
pubchem_numbers = []
chebi_numbers = []
chembl_numbers = []
for compound in kegg_compound_ids:
    record = REST.kegg_get(compound.replace('cpd:','')).read()
    pubchem_numbers.append(_dblink_value(record, 'PubChem'))
    chebi_numbers.append(_dblink_value(record, 'ChEBI'))
    chembl_numbers.append(_dblink_value(record, 'ChEMBL'))

In [5]:
import pandas as pd

In [6]:
# Assemble the KEGG -> PubChem / ChEBI / ChEMBL conversion table (one row per
# compound) and save it as CSV. The index is written too (to_csv default).
conversion_table = pd.DataFrame(
    {
        'KEGG': kegg_compound_ids,
        'PUBCHEM': pubchem_numbers,
        'CHEBI': chebi_numbers,
        'CHEMBL': chembl_numbers,
    },
    columns=['KEGG', 'PUBCHEM', 'CHEBI', 'CHEMBL'],  # fix the column order explicitly
)
conversion_table.to_csv('KEGG_to_OTHER.csv')

In [13]:
# Number of distinct CHEMBL values; the empty string used for "no ID" counts as one.
len(set(conversion_table['CHEMBL']))

7323

In [54]:
# Collect the CAS registry number of every KEGG compound; '' marks a record
# without a 'CAS:' entry, so the list stays aligned with kegg_compound_ids.
# NOTE: this issues one KEGG request per compound.
cas_numbers = []
for compound in kegg_compound_ids:
    c = compound.replace('cpd:','')
    test = REST.kegg_get(c).read()
    try:
        cas = test[test.index('CAS:'):].split('\n')[0].replace('CAS: ','')
    except ValueError:
        # str.index raises ValueError when 'CAS:' is absent. Catching only that
        # keeps unrelated errors visible (previously a bare except).
        cas = ''
    cas_numbers.append(cas)

In [58]:
# Build the KEGG -> CAS table and save it as CSV (index included, to_csv default).
# NOTE: this rebinds `conversion_table`, replacing the PubChem/ChEBI/ChEMBL table.
conversion_table = pd.DataFrame(
    {'KEGG': kegg_compound_ids, 'CAS': cas_numbers},
    columns=['KEGG', 'CAS'],  # fix the column order explicitly
)
conversion_table.to_csv('KEGG_to_CAS.csv')

In [62]:
# Work on a copy so that changes to `df` do not affect conversion_table.
df = conversion_table.copy()

In [66]:
# Sort by CAS, descending. The CAS values are strings, so the order is
# lexicographic rather than numeric (e.g. '99815-83-5' sorts before '997-68-2').
df = df.sort_values('CAS',ascending=False)

In [68]:
# Save the sorted table as a tab-separated file, without the index column.
df.to_csv('KEGG_to_CAS.tab',sep='\t',index=False)

In [69]:
# Reload the table from the tab-separated file and display it.
df = pd.read_csv('KEGG_to_CAS.tab',sep='\t')
df

Unnamed: 0,KEGG,CAS
0,cpd:C08702,99815-83-5
1,cpd:C00449,997-68-2
2,cpd:C01042,997-55-7
3,cpd:C09653,99694-82-3
4,cpd:C08973,99633-18-8
5,cpd:C10560,99633-12-2
6,cpd:C09976,99624-92-7
7,cpd:C10424,99624-64-3
8,cpd:C09760,99624-28-9
9,cpd:C07325,99614-02-5


# Extract all drug CAS numbers

In [2]:
# Download the complete KEGG drug listing as one raw text string.
drug = REST.kegg_list("drug").read()

In [3]:
# Collect the KEGG ID (first of the two tab-separated fields) of every drug
# in the listing.
kegg_drug_ids = [
    entry
    for entry, _description in (row.split('\t') for row in drug.rstrip().split('\n'))
]

In [4]:
# Display every collected drug ID (the whole list is printed).
kegg_drug_ids

['dr:D00001',
 'dr:D00002',
 'dr:D00003',
 'dr:D00004',
 'dr:D00005',
 'dr:D00006',
 'dr:D00007',
 'dr:D00008',
 'dr:D00009',
 'dr:D00010',
 'dr:D00011',
 'dr:D00012',
 'dr:D00013',
 'dr:D00014',
 'dr:D00015',
 'dr:D00016',
 'dr:D00017',
 'dr:D00018',
 'dr:D00019',
 'dr:D00020',
 'dr:D00021',
 'dr:D00022',
 'dr:D00023',
 'dr:D00024',
 'dr:D00025',
 'dr:D00026',
 'dr:D00027',
 'dr:D00028',
 'dr:D00029',
 'dr:D00030',
 'dr:D00031',
 'dr:D00032',
 'dr:D00033',
 'dr:D00034',
 'dr:D00035',
 'dr:D00036',
 'dr:D00037',
 'dr:D00038',
 'dr:D00039',
 'dr:D00040',
 'dr:D00041',
 'dr:D00042',
 'dr:D00043',
 'dr:D00044',
 'dr:D00045',
 'dr:D00046',
 'dr:D00047',
 'dr:D00048',
 'dr:D00049',
 'dr:D00050',
 'dr:D00051',
 'dr:D00052',
 'dr:D00054',
 'dr:D00055',
 'dr:D00056',
 'dr:D00057',
 'dr:D00058',
 'dr:D00059',
 'dr:D00060',
 'dr:D00061',
 'dr:D00062',
 'dr:D00063',
 'dr:D00064',
 'dr:D00065',
 'dr:D00066',
 'dr:D00067',
 'dr:D00068',
 'dr:D00069',
 'dr:D00070',
 'dr:D00071',
 'dr:D00072',
 'dr:D

In [5]:
# Number of drug IDs collected from the listing.
len(kegg_drug_ids)

10441

In [33]:
# Collect the NAME field of every KEGG drug record, one entry per drug ID.
#
# The loop is resumable: it starts at len(drug_names), so after an interrupted
# run the cell can simply be executed again, on a fresh kernel it starts from
# scratch, and once complete a re-run is a no-op (no hard-coded restart index,
# no duplicate appends).
# NOTE: this issues one KEGG request per remaining drug.
if 'drug_names' not in globals():
    drug_names = []

for i in range(len(drug_names), len(kegg_drug_ids)):
    record = REST.kegg_get(kegg_drug_ids[i]).read()
    try:
        name_lines = record[record.index('NAME '):].split('\n')
    except ValueError:
        # str.index raises ValueError when the record has no NAME field;
        # store '' so drug_names stays aligned with kegg_drug_ids.
        drug_names.append('')
        continue
    name = name_lines[0].replace('NAME        ','')
    for line in name_lines[1:]:
        # Lines starting with a space continue the NAME field; anything else
        # (including an empty line) ends it.
        if not line.startswith(' '):
            break
        name = name + line.replace('            ','')
    # Append exactly once per drug, even when no further field follows NAME.
    drug_names.append(name)

In [34]:
# Number of names collected; it should equal len(kegg_drug_ids).
len(drug_names)

10441

In [32]:
# Inspect the name stored at index 9303 -- presumably the last one collected
# before the name loop was restarted at index 9304.
drug_names[9303]

'Regavirumab (JAN/INN)'

In [35]:
# Pair every KEGG drug ID with its name and save the table as a tab-separated
# file without the index column.
df = pd.DataFrame(
    {'KEGG': kegg_drug_ids, 'Drug name': drug_names},
    columns=['KEGG', 'Drug name'],  # fix the column order explicitly
)
df.to_csv('KEGG_drug_names.tab', sep='\t', index=False)

In [36]:
# Display the drug ID / name table.
df

Unnamed: 0,KEGG,Drug name
0,dr:D00001,Water (JP17/USP);Purified water (JP17);Purifie...
1,dr:D00002,Nadide (JAN/USAN/INN);Nicotinamide adenine din...
2,dr:D00003,Oxygen (JP17/USP)
3,dr:D00004,Carbon dioxide (JP17/USP);Carbon dioxide (TN)
4,dr:D00005,Flavin adenine dinucleotide (JAN);Adeflavin (TN)
5,dr:D00006,Pyridoxal phosphate hydrate (JAN);Pyridoxal ph...
6,dr:D00007,Glutamic acid (USP);L-Glutamic acid (JP17)
7,dr:D00008,Hydrogen peroxide (USP);Oxydol (JP17);Oxyfull ...
8,dr:D00009,Glucose (JP17);D-Glucose
9,dr:D00010,"Acetic acid, glacial (USP);Acetic acid (JP17/N..."


In [5]:
# Collect the PubChem cross-reference of every KEGG drug; '' marks a record
# without a 'PubChem:' entry, so the list stays aligned with kegg_drug_ids.
# NOTE: this issues one KEGG request per drug.
drug_pubchem_numbers = []
for drug in kegg_drug_ids:
    d = drug.replace('dr:','')
    test = REST.kegg_get(d).read()
    try:
        pubchem = test[test.index('PubChem:'):].split('\n')[0].replace('PubChem: ','')
    except ValueError:
        # str.index raises ValueError when 'PubChem:' is absent. Catching only
        # that keeps unrelated errors visible (previously a bare except).
        pubchem = ''
    drug_pubchem_numbers.append(pubchem)

In [6]:
# Build the KEGG drug -> PubChem table, sort it by PUBCHEM in descending order
# and save it as a tab-separated file without the index column.
drug_conversion_table = pd.DataFrame(
    {'KEGG': kegg_drug_ids, 'PUBCHEM': drug_pubchem_numbers},
    columns=['KEGG', 'PUBCHEM'],  # fix the column order explicitly
).sort_values('PUBCHEM', ascending=False)
drug_conversion_table.to_csv('KEGG_to_PUBCHEM_DRUGS.tab', sep='\t', index=False)
# Excel export, currently disabled:
#drug_conversion_table.to_excel('KEGG_to_PUBCHEM_DRUGS.xlsx',index=False)

In [99]:
# Collect the CAS registry number of every KEGG drug; '' marks a record
# without a 'CAS:' entry, so the list stays aligned with kegg_drug_ids.
# NOTE: this issues one KEGG request per drug.
drug_cas_numbers = []
for drug in kegg_drug_ids:
    d = drug.replace('dr:','')
    test = REST.kegg_get(d).read()
    try:
        cas = test[test.index('CAS:'):].split('\n')[0].replace('CAS: ','')
    except ValueError:
        # str.index raises ValueError when 'CAS:' is absent. Catching only that
        # keeps unrelated errors visible (previously a bare except).
        cas = ''
    drug_cas_numbers.append(cas)

In [100]:
# Build the KEGG drug -> CAS table, sort it by CAS in descending order and
# export it both as a tab-separated file and as an Excel workbook (no index).
# NOTE: this rebinds `drug_conversion_table`, replacing the PubChem table.
drug_conversion_table = pd.DataFrame(
    {'KEGG': kegg_drug_ids, 'CAS': drug_cas_numbers},
    columns=['KEGG', 'CAS'],  # fix the column order explicitly
).sort_values('CAS', ascending=False)
drug_conversion_table.to_csv('KEGG_to_CAS_DRUGS.tab', sep='\t', index=False)
drug_conversion_table.to_excel('KEGG_to_CAS_DRUGS.xlsx', index=False)

In [106]:
# Count the drugs whose CAS entry is not the empty string.
sum(1 for cas in drug_conversion_table['CAS'] if str(cas) != '')

8728

In [105]:
# Total number of rows in the drug -> CAS table.
len(drug_conversion_table)

10342

In [4]:
# Load the tab-separated MolDB ID list and display it.
# NOTE: this rebinds `df`, which earlier cells used for other tables.
df = pd.read_csv('MolDB_complete_ID_list.tab',sep='\t')
df

Unnamed: 0,MC_no,NCI_no,Name,Formula,CAS_no,MW,SMILES
0,121100,101100,"((3-((cyano(2,3-dimethoxyphenyl)methyl)amino)p...",C23H28N4O4,,424.55,N#CC(NCCCNC(C#N)C1=C(OC)C(=CC=C1)OC)C2=C(OC)C(...
1,121101,101101,,C30H32N4O5,,528.66,N#CC(N2C(N(C(C#N)C1=C(OC)C(=CC=C1)OC)CCC2)C3=C...
2,121102,101102,,C32H30N4O3,,518.66,N#CC(N2C(N(C(C#N)C1=C(OC)C=CC=C1)CCC2)C3=C4C(=...
3,121103,101103,,C30H32N4O4,,512.66,N#CC(N2C(N(C(C#N)C1=C(OC)C=CC=C1)CCC2)C3=C(C=C...
4,121104,101104,,C30H32N4O4,,512.66,N#CC(N2C(N(C(C#N)C1=C(OC)C=CC=C1)CCC2)C3=C(C=C...
5,121105,101105,"17-ethynyl-17-hydroxyestra-4,9,11-trien-3-one",C20H22O2,848-21-5,294.42,CC34C(C2C(=C1C(=CC(=O)CC1)CC2)C=C3)CCC4(C#C)O
6,121106,101106,"1-(2,6-diethylphenyl)-2,5-dimethyl-1H-pyrrole",C16H21N,,227.38,CCC1=C(C(=CC=C1)CC)[N]2C(=CC=C2C)C
7,121107,101107,"1-(2,6-diethylphenyl)-2-methyl-5-phenyl-1H-pyr...",C21H23N,,289.45,CC1=CC=C([N]1C2=C(C=CC=C2CC)CC)C3=CC=CC=C3
8,121108,101108,"1,2,2,3-tetramethylcyclopentanecarbonyl chloride",C10H17ClO,,188.72,ClC(=O)C1(C)C(C)(C)C(C)CC1
9,121109,101109,"2-chloro-1-(1,2,2,3-tetramethylcyclopentyl)eth...",C11H19ClO,,202.75,ClCC(=O)C1(C)C(C)(C)C(C)CC1


# Overlap between 2D molecular descriptors in ChemoPy and PaDEL

In [22]:
# Load one example result table per descriptor tool.
# Presumably each column is one descriptor -- TODO confirm against the files.
chemopy = pd.read_csv('example_result_chemopy.csv')
padel = pd.read_csv('example_result_padel.csv')

In [24]:
# Column names of the ChemoPy (A) and PaDEL (B) result tables.
A = list(chemopy.columns)
B = list(padel.columns)

In [31]:
# Report the number of columns per tool, their sum, and the size of the union
# of both column sets (names shared by the two tools are counted once).
# print(...) with %-formatting emits the same text under Python 2 and Python 3;
# the former `print x, y` statements are a syntax error on Python 3.
print("%d %d %d" % (len(A), len(B), len(A) + len(B)))
print("number of 2D and 1 D descriptors in ChemoPy and PaDEL is %d" % len(set(A + B)))

633 1545 2178
number of 2D and 1 D descriptors in ChemoPy and PaDEL is 2160


In [33]:
# Load the two KEGG conversion tables written by earlier cells, plus a
# PubChem -> SMILES table (that file is not created in this notebook).
kegg_1 = pd.read_csv('KEGG_to_CAS.tab',sep='\t')
kegg_2 = pd.read_csv('KEGG_to_OTHER.csv')
pubchem_smiles = pd.read_csv('PubChem_to_SMILES_Metabolites.txt',sep='\t')

In [38]:
# Join the CAS table with the PubChem/ChEBI/ChEMBL table on the KEGG ID.
# NOTE: `test` is also used for raw record text in earlier cells.
test = pd.merge(kegg_1,kegg_2,on='KEGG')

In [40]:
# Attach the SMILES column via the PubChem ID. pd.merge defaults to an inner
# join, so rows whose PUBCHEM value has no match in pubchem_smiles are dropped.
final = pd.merge(test,pubchem_smiles,on='PUBCHEM')

In [41]:
# Display the merged table.
final

Unnamed: 0,KEGG,CAS,PUBCHEM,CHEBI,CHEMBL,SMILES
0,cpd:C08702,99815-83-5,10895,7657,,CCC(=O)OCC(C)C
1,cpd:C00449,997-68-2,3737,16927,,CC(=O)NC1=C(C(=C(C(=C1I)C(=O)O)I)C(=O)NC)I
2,cpd:C01042,997-55-7,4284,16953 21547,CHEMBL1162494,CCN(CC)C(=O)C1=CC(=CC=C1)C
3,cpd:C09653,99694-82-3,11843,4343,,CC1=C(C=CC=C1O)O
4,cpd:C08973,99633-18-8,11165,8901,,CCCN.Cl
5,cpd:C10560,99633-12-2,12743,4914,CHEMBL1076999,CCCCCOCCCCC
6,cpd:C09976,99624-92-7,12162,9861,CHEMBL494641,C1=CC(=CC=C1C#N)Br
7,cpd:C10424,99624-64-3,12609,5718,,C1=CC(=CC(=C1)NC=O)C(F)(F)F
8,cpd:C09760,99624-28-9,11948,6118,CHEMBL465167,C1=CC=C(C=C1)CC2=CC3=CC=CC=C3C=C2
9,cpd:C07325,99614-02-5,9533,7773,CHEMBL46,C(CCl)NCCCl


In [42]:
# Row labels of `final` whose SMILES value renders as 'nan' (i.e. is missing).
# The column is looked up once instead of once per row.
smiles_column = final['SMILES']
index_nan = [row for row in range(len(final)) if str(smiles_column[row]) == 'nan']

In [44]:
# Number of merged rows that do have a SMILES value.
len(final) - len(index_nan)

13742

In [46]:
# Save the merged table as a tab-separated file, without the index column.
final.to_csv('KEGG_to_ALL_OTHERS.tab',sep='\t',index=False)

# Extract all enzymes

In [2]:
# Download the complete KEGG enzyme listing as one raw text string.
enzymes = REST.kegg_list('enzyme').read()

In [6]:
# Show the listing line by line (the whole list is printed).
enzymes.split('\n')

['ec:1.1.1.1\talcohol dehydrogenase; aldehyde reductase; ADH; alcohol dehydrogenase (NAD); aliphatic alcohol dehydrogenase; ethanol dehydrogenase; NAD-dependent alcohol dehydrogenase; NAD-specific aromatic alcohol dehydrogenase; NADH-alcohol dehydrogenase; NADH-aldehyde dehydrogenase; primary alcohol dehydrogenase; yeast alcohol dehydrogenase',
 'ec:1.1.1.2\talcohol dehydrogenase (NADP+); aldehyde reductase (NADPH2); NADP-alcohol dehydrogenase; NADP+-aldehyde reductase; NADP+-dependent aldehyde reductase; NADPH-aldehyde reductase; NADPH-dependent aldehyde reductase; nonspecific succinic semialdehyde reductase; ALR 1; low-Km aldehyde reductase; high-Km aldehyde reductase; alcohol dehydrogenase (NADP)',
 'ec:1.1.1.3\thomoserine dehydrogenase; HSDH; HSD',
 'ec:1.1.1.4\t(R,R)-butanediol dehydrogenase; butyleneglycol dehydrogenase; D-butanediol dehydrogenase; D-(-)-butanediol dehydrogenase; butylene glycol dehydrogenase; diacetyl (acetoin) reductase; D-aminopropanol dehydrogenase; 1-amino-2

In [7]:
# Fetch the flat-file record of a single enzyme and show it line by line.
REST.kegg_get('ec:1.1.1.1').read().split('\n')

['ENTRY       EC 1.1.1.1                  Enzyme',
 'NAME        alcohol dehydrogenase;',
 '            aldehyde reductase;',
 '            ADH;',
 '            alcohol dehydrogenase (NAD);',
 '            aliphatic alcohol dehydrogenase;',
 '            ethanol dehydrogenase;',
 '            NAD-dependent alcohol dehydrogenase;',
 '            NAD-specific aromatic alcohol dehydrogenase;',
 '            NADH-alcohol dehydrogenase;',
 '            NADH-aldehyde dehydrogenase;',
 '            primary alcohol dehydrogenase;',
 '            yeast alcohol dehydrogenase',
 'CLASS       Oxidoreductases;',
 '            Acting on the CH-OH group of donors;',
 '            With NAD+ or NADP+ as acceptor',
 'SYSNAME     alcohol:NAD+ oxidoreductase',
 'REACTION    (1) a primary alcohol + NAD+ = an aldehyde + NADH + H+ [RN:R00623];',
 '            (2) a secondary alcohol + NAD+ = a ketone + NADH + H+ [RN:R00624]',
 'ALL_REAC    R00623 > R00754 R02124 R02878 R04805 R04880 R05233 R05234 R06917 R069

# Get all genes from an organism

- **Escherichia coli MG1655 (eco)**
- Salmonella enterica subsp. enterica serovar Typhi CT18 (sty)
- Enterobacter sp. 638 (ent)
- **Klebsiella pneumoniae KCTC 2242 (kpo)**
- Yersinia intermedia (yin)
- Erwinia tasmaniensis (eta)
- Pantoea ananatis LMG 20103 (pam)
- Actinobacillus succinogenes (asu)
- **Xanthomonas campestris pv. campestris ATCC 33913 (xcc)**
- **Vibrio cholerae O1 biovar El Tor N16961 (vch)**
- Photobacterium profundum (ppr)
- Pseudomonas aeruginosa PAO1 (pae)
- **Pseudomonas putida KT2440 (ppu)**
- Acinetobacter baumannii ATCC 17978 (acb)
- Shewanella denitrificans (sdn)
- Burkholderia mallei ATCC 23344 (bma)
- **Helicobacter pylori 26695 (hpy)**
- Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 (cje)
- Bacillus subtilis subsp. subtilis 168 (bsu)
- **Staphylococcus aureus subsp. aureus N315 (MRSA/VSSA) (sau)**
- Lactococcus lactis subsp. lactis Il1403 (lla)
- **Streptococcus pyogenes M1 GAS (serotype M1) (spy)**
- Streptococcus pneumoniae TIGR4 (virulent serotype 4) (spn)
- Lactobacillus plantarum WCFS1 (lpl)
- **Lactobacillus acidophilus NCFM (lac)**
- Lactobacillus casei BL23 (lcb)
- **Clostridium acetobutylicum ATCC 824 (cac)**
- Streptomyces coelicolor (sco)
- Bifidobacterium longum NCC2705 (blo)
- Chlamydia trachomatis D/UW-3/CX (ctr)

In [22]:
# Select organism by its KEGG organism code, as given in parentheses in the
# list above ('ppu' = Pseudomonas putida KT2440).
organism = 'ppu'

In [23]:
# Download the gene listing of the selected organism as one raw text string.
genes = REST.kegg_list(organism).read()

In [24]:
# Turn the raw listing into one list of tab-separated fields per line. The last
# split element is dropped (presumably the empty string after the trailing newline).
# NOTE: this rebinds `genes` from a string to a list, so the cell cannot be
# re-run without first re-running the download cell.
genes = [row.split('\t') for row in genes.split('\n')[:-1]]

In [26]:
# Keep every gene whose KEGG record contains the substring "PATHWAY" or "BRITE".
#
# The substring test on a string cannot raise, so the nested try/except
# fallbacks that used to wrap it were unreachable and have been removed; the
# KEGG request itself was never inside a try block, so error behaviour is unchanged.
# NOTE(review): this keeps any gene with a PATHWAY or BRITE mention, which may be
# broader than "metabolic" (e.g. parB is kept in the output below) -- confirm
# that this is the intended filter.
# NOTE: this issues one KEGG request per gene.
metabolic_genes = []
for gene in genes:
    # gene[0] is the KEGG gene ID (first tab-separated field of the listing line).
    text = REST.kegg_get(gene[0]).read()
    if "PATHWAY" in text or "BRITE" in text:
        metabolic_genes.append(gene)

In [27]:
# Number of genes that passed the PATHWAY/BRITE filter.
len(metabolic_genes)

2603

In [28]:
# Put the selected [KEGG_ID, Description] pairs into a table and preview it.
# NOTE: the variable name is specific to P. putida although `organism` is configurable.
pputida_metabolic_genes = pd.DataFrame(columns=['KEGG_ID','Description'],data=metabolic_genes)
pputida_metabolic_genes.head()

Unnamed: 0,KEGG_ID,Description
0,ppu:PP_0001,parB; chromosome-partitioning protein
1,ppu:PP_0002,parA; chromosome partition protein
2,ppu:PP_0003,rsmG; 16S RNA methyltransferase
3,ppu:PP_0004,trmF; tRNA uridine 5-carboxymethylaminomethyl ...
4,ppu:PP_0005,trmE; GTPase


In [29]:
# Export the table to an Excel workbook without the index column.
# NOTE: the file name is hard-coded for P. putida regardless of `organism`.
pputida_metabolic_genes.to_excel('Pputida_KEGG_Metabolic_genes.xlsx',index=False)

In [30]:
# Row count of the exported table.
len(pputida_metabolic_genes)

2603

In [9]:
# Position in `genes` of the entry that the loop variable `gene` was last bound to
# (a progress/debug check that depends on leftover kernel state).
genes.index(gene)

4295