In [1]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

import warnings
# NOTE(review): this silences every warning category for the rest of the
# kernel session, deprecation notices from pandas/requests included.
warnings.filterwarnings('ignore')

## Web Scraping

In [2]:
# CID of the compound to look up (user input, kept as a string).
compound_cid = '2244'

# PUG View endpoint: <parent_url><CID>/JSON returns the full compound record.
parent_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/'
url = f'{parent_url}{compound_cid}/JSON'
url

'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/2244/JSON'

In [3]:
# Fetch the PUG View record. requests waits indefinitely unless a timeout is
# given, and raise_for_status() surfaces an HTTP error here instead of as a
# KeyError/IndexError in the parsing cells further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
response

<Response [200]>

In [4]:
# Decode the response body; the result is a nested dict/list structure rooted
# at the 'Record' key (see the output below).
json_file = response.json()
json_file

{'Record': {'RecordType': 'CID',
  'RecordNumber': 2244,
  'RecordTitle': 'Aspirin',
  'Section': [{'TOCHeading': 'Structures',
    'Description': 'Structure depictions and information for 2D, 3D, and crystal related',
    'Section': [{'TOCHeading': '2D Structure',
      'Description': 'A two-dimensional representation of the compound',
      'DisplayControls': {'MoveToTop': True},
      'Information': [{'ReferenceNumber': 235, 'Value': {'Boolean': [True]}}]},
     {'TOCHeading': '3D Conformer',
      'Description': 'A three-dimensional representation of the compound. The 3D structure is not experimentally determined, but computed by PubChem. More detailed information on this conformer model is described in the PubChem3D thematic series published in the Journal of Cheminformatics.',
      'DisplayControls': {'MoveToTop': True},
      'Information': [{'ReferenceNumber': 235,
        'Description': 'Aspirin',
        'Value': {'Number': [2244]}}]},
     {'TOCHeading': 'Crystal Structures

## Get Information

---

### 1. CAS Number

In [5]:
# 1. Get CAS Number
# The lookup is purely positional (Section[2] -> Section[3] -> Section[0]), so
# it raises KeyError/IndexError when a record is laid out differently.
cas = json_file['Record']['Section'][2]['Section'][3]['Section'][0]['Information'][0]['Value']['StringWithMarkup'][0]['String']
cas

'50-78-2'

### 2. Compound Name

In [6]:
# 2. Get compound name
# Taken from the record's top-level title rather than from a nested section.
name = json_file['Record']['RecordTitle']
name

'Aspirin'

### 3. IUPAC Name

In [7]:
# 3. Get IUPAC name
# Positional lookup (Section[2] -> Section[1] -> Section[0]); it shares the
# Section[2] -> Section[1] parent with the InChI Key and SMILES lookups.
iupac = json_file['Record']['Section'][2]['Section'][1]['Section'][0]['Information'][0]['Value']['StringWithMarkup'][0]['String']
iupac

'2-acetyloxybenzoic acid'

### 4. InChI Key

In [8]:
# 4. Get InChI Key
# Positional lookup (Section[2] -> Section[1] -> Section[2]); raises
# KeyError/IndexError when the record does not have that layout.
inchi_key = json_file['Record']['Section'][2]['Section'][1]['Section'][2]['Information'][0]['Value']['StringWithMarkup'][0]['String']
inchi_key

'BSYNRYMUTXBXSQ-UHFFFAOYSA-N'

### 5. SMILES

In [9]:
# 5. Get SMILES
# Positional lookup (Section[2] -> Section[1] -> Section[3]); raises
# KeyError/IndexError when the record does not have that layout.
smiles = json_file['Record']['Section'][2]['Section'][1]['Section'][3]['Information'][0]['Value']['StringWithMarkup'][0]['String']
smiles

'CC(=O)OC1=CC=CC=C1C(=O)O'

### 6. Molecular Formula

In [10]:
# 6. Get Molecular Formula
# Positional lookup one level shallower than the other identifiers:
# Section[2] -> Section[2] carries the 'Information' list directly.
molecular_formula = json_file['Record']['Section'][2]['Section'][2]['Information'][0]['Value']['StringWithMarkup'][0]['String']
molecular_formula

'C9H8O4'

### 7. Theoretical Molecular Weight

In [11]:
# 7. Get Theoretical MW
# Positional lookup under Section[3] (the identifiers above live under
# Section[2]). The value is returned as a string, e.g. '180.16', not a float.
theoretical_mw = json_file['Record']['Section'][3]['Section'][0]['Section'][0]['Information'][0]['Value']['StringWithMarkup'][0]['String']
theoretical_mw

'180.16'

## Compile

In [12]:
# Wrap each value extracted above in a one-element list (one entry per
# compound), mirroring the per-field lists that create_library() accumulates.
cas_list = [cas]
name_list = [name]
iupac_list = [iupac]
inchi_key_list = [inchi_key]
smiles_list = [smiles]
molecular_formula_list = [molecular_formula]
theoretical_mw_list = [theoretical_mw]

In [13]:
# Empty library template: one column per header, in export order, no rows yet.
# Each header gets its own fresh list so the columns never share storage.
data = {
    header: []
    for header in (
        'CAS #',
        'Compound Name',
        'Synonym',
        'Theory MW',
        'Formula',
        'Class',
        'Comment',
        'Column',
        'RT',
        'Chromatogram Comment',
        'Adduct Ion',
        'Adduct m/z',
        'Precursor Ion',
        'SP ID of Precursor Ion',
        'MS Stage',
        'Spectrum Comment',
        'Ionization',
        'Mass Range',
        'Collision Energy',
        'Collision Gas Vol.',
        'Polarity',
        'Instrument',
        'Data Folder',
        'Data Filename',
        'Acquired By',
        'Acquired',
        'Sample Info',
        'SMILES',
        'Structure',
        'InChIKey',
    )
}

library = pd.DataFrame(data)

library

Unnamed: 0,CAS #,Compound Name,Synonym,Theory MW,Formula,Class,Comment,Column,RT,Chromatogram Comment,...,Polarity,Instrument,Data Folder,Data Filename,Acquired By,Acquired,Sample Info,SMILES,Structure,InChIKey


In [14]:
# Convert elements in input_list into strings
input_list = pd.read_csv('input.txt', header=None, names=['CID Number']).astype(str)['CID Number'].to_list()
input_list

['16850', '6694', '13806', '24180873', '12961638', '6433718']

In [15]:
# Column order of the library table. Every column is created empty; only the
# ones listed in _FIELD_PATHS are filled in by create_library().
_LIBRARY_COLUMNS = (
    'CAS #',
    'Compound Name',
    'Synonym',
    'Theory MW',
    'Formula',
    'Class',
    'Comment',
    'Column',
    'RT',
    'Chromatogram Comment',
    'Adduct Ion',
    'Adduct m/z',
    'Precursor Ion',
    'SP ID of Precursor Ion',
    'MS Stage',
    'Spectrum Comment',
    'Ionization',
    'Mass Range',
    'Collision Energy',
    'Collision Gas Vol.',
    'Polarity',
    'Instrument',
    'Data Folder',
    'Data Filename',
    'Acquired By',
    'Acquired',
    'Sample Info',
    'SMILES',
    'Structure',
    'InChIKey',
)

# Common tail of every section-based lookup: first information entry, first
# marked-up string.
_STRING_TAIL = ('Information', 0, 'Value', 'StringWithMarkup', 0, 'String')

# Library column -> path of keys/indices inside the PUG View JSON record.
# The paths address sections by POSITION, not by heading.
# NOTE(review): a record with a different section layout yields None, or
# possibly another field's text - confirm against the section headings.
_FIELD_PATHS = {
    'CAS #': ('Record', 'Section', 2, 'Section', 3, 'Section', 0) + _STRING_TAIL,
    'Compound Name': ('Record', 'RecordTitle'),
    # The 'Synonym' column is filled with the IUPAC name.
    'Synonym': ('Record', 'Section', 2, 'Section', 1, 'Section', 0) + _STRING_TAIL,
    'Theory MW': ('Record', 'Section', 3, 'Section', 0, 'Section', 0) + _STRING_TAIL,
    'Formula': ('Record', 'Section', 2, 'Section', 2) + _STRING_TAIL,
    'SMILES': ('Record', 'Section', 2, 'Section', 1, 'Section', 3) + _STRING_TAIL,
    'InChIKey': ('Record', 'Section', 2, 'Section', 1, 'Section', 2) + _STRING_TAIL,
}

# Seconds to wait for PubChem before giving up on a request.
_REQUEST_TIMEOUT = 30


def _dig(document, path):
    """Follow `path` (a sequence of keys/indices) into `document`; None if any step is missing."""
    node = document
    try:
        for step in path:
            node = node[step]
    except (KeyError, IndexError, TypeError):
        # Missing key, out-of-range index, or a node that is not subscriptable.
        return None
    return node


def create_library(input_list):
    """Build the library table for a list of PubChem compound IDs.

    Each CID's PUG View record is downloaded and seven fields are extracted
    from it: CAS number, compound name, IUPAC name (stored in the 'Synonym'
    column), theoretical molecular weight, molecular formula, SMILES and
    InChIKey.

    Parameters
    ----------
    input_list : iterable of str or int
        PubChem CIDs; each one is converted with ``str()`` before being put
        in the URL.

    Returns
    -------
    pandas.DataFrame
        One row per CID, in input order, with the 30 library columns. A field
        that cannot be found in a record is None for that compound only; the
        columns that are not read from PubChem are left empty (NaN).

    Raises
    ------
    Whatever ``requests.get`` or ``response.json()`` raise (connection
    errors, timeouts, a non-JSON body); these are not swallowed.
    """
    parent_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/'

    # One accumulator list per extracted column.
    extracted = {column: [] for column in _FIELD_PATHS}

    for compound_cid in input_list:
        url = ''.join([parent_url, str(compound_cid), '/JSON'])

        # Get response (bounded wait, so one stalled request cannot hang the batch)
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)

        # Get json
        json_file = response.json()

        # Every field is looked up independently, so a missing value never
        # affects, or inherits from, another field or another compound.
        for column, path in _FIELD_PATHS.items():
            extracted[column].append(_dig(json_file, path))

    # Start from the empty template, then fill in the extracted columns.
    library = pd.DataFrame({column: [] for column in _LIBRARY_COLUMNS})
    for column, values in extracted.items():
        library[column] = values

    return library

In [16]:
# Build the library for every CID read from input.txt; this issues one HTTP
# request per CID.
create_library(input_list)

Unnamed: 0,CAS #,Compound Name,Synonym,Theory MW,Formula,Class,Comment,Column,RT,Chromatogram Comment,...,Polarity,Instrument,Data Folder,Data Filename,Acquired By,Acquired,Sample Info,SMILES,Structure,InChIKey
0,2321-07-5,Fluorescein,"3',6'-dihydroxyspiro[2-benzofuran-3,9'-xanthen...",332.3,C20H12O5,,,,,,...,,,,,,,,C1=CC=C2C(=C1)C(=O)OC23C4=C(C=C(C=C4)O)OC5=C3C...,,GNBHRKFJIUUOQI-UHFFFAOYSA-N
1,81-88-9,Rhodamine B,[9-(2-carboxyphenyl)-6-(diethylamino)xanthen-3...,479.0,C28H31ClN2O3,,,,,,...,,,,,,,,CCN(CC)C1=CC2=C(C=C1)C(=C3C=CC(=[N+](CC)CC)C=C...,,PYWVYCXTNDRMGF-UHFFFAOYSA-N
2,989-38-8,Rhodamine 6G,"[9-(2-ethoxycarbonylphenyl)-6-(ethylamino)-2,7...",479.0,C28H31N2O3.Cl,,,,,,...,,,,,,,,CCNC1=CC2=C(C=C1C)C(=C3C=C(C(=[NH+]CC)C=C3O2)C...,,VYXSBFYARXAAKO-UHFFFAOYSA-N
3,548-26-517372-87-1,Eosin YS,"sodium;2-(2,4,5,7-tetrabromo-3-hydroxy-6-oxoxa...",670.9,C20H8Br4NaO5+,,,,,,...,,,,,,,,C1=CC=C(C(=C1)C2=C3C=C(C(=O)C(=C3OC4=C(C(=C(C=...,,YQGOJNYOYNNSMM-UHFFFAOYSA-N
4,16423-68-0,Erythrosine,"disodium;2',4',5',7'-tetraiodo-3-oxospiro[2-be...",879.9,C20H6I4Na2O5,,,,,,...,,,,,,,,C1=CC=C2C(=C1)C(=O)OC23C4=CC(=C(C(=C4OC5=C(C(=...,,RAGZEDHHTPQLAI-UHFFFAOYSA-L
5,18472-87-2,Acid red 92,"disodium;2',4',5',7'-tetrabromo-4,5,6,7-tetrac...",829.6,C20H2Br4Cl4Na2O5,,,,,,...,,,,,,,,C1=C2C(=C(C(=C1Br)[O-])Br)OC3=C(C(=C(C=C3C24C5...,,OOYIOIOOWUGAHD-UHFFFAOYSA-L
