In [1]:
from itertools import chain
import string
from collections import Counter

import pandas as pd
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json
import requests
import json


In [2]:
def get_aspirin_info(record_number):
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound"
    compound_url = f"{base_url}/{record_number}/json"
    
    response = requests.get(compound_url)
    
    if response.status_code == 200:
        return response.json()
    else:
        return {"error": f"Failed to retrieve information for record number {record_number}"}

# Example usage:
aspirin_info = get_aspirin_info(2244)
print(json.dumps(aspirin_info, indent=2))


{
  "Record": {
    "RecordType": "CID",
    "RecordNumber": 2244,
    "RecordTitle": "Aspirin",
    "Section": [
      {
        "TOCHeading": "Structures",
        "Description": "Structure depictions of this compound, including computationally generated two-dimensional (2D) and three-dimensional (3D) structures, as well as experimentally determined 3D single-crystal structures.",
        "Section": [
          {
            "TOCHeading": "2D Structure",
            "Description": "A two-dimensional (2D) structure representation of the compound.  Because this structure is processed through chemical structure standardization (H\u00e4hnke et al., J. Cheminform. 2018, 10, 36), it is not necessarily the same as the structures provided by individual data contributors.  ",
            "URL": "https://doi.org/10.1186/s13321-018-0293-8",
            "DisplayControls": {
              "MoveToTop": true
            },
            "Information": [
              {
                "ReferenceNumbe

In [3]:
## Set list and create dataframe 
aspirin_refs: list[dict] = aspirin_info['Record'] ['Reference']
aspirin_df = pd.DataFrame(aspirin_refs)
print(aspirin_df)
aspirin_df.head()


     ReferenceNumber                                 SourceName  \
0                  1                                  BindingDB   
1                 12  Comparative Toxicogenomics Database (CTD)   
2                 28     Drug Gene Interaction database (DGIdb)   
3                 30                                   DrugBank   
4                 74           IUPHAR/BPS Guide to PHARMACOLOGY   
..               ...                                        ...   
233              239            Medical Subject Headings (MeSH)   
234              240            Medical Subject Headings (MeSH)   
235              241            Medical Subject Headings (MeSH)   
236              242                         PATENTSCOPE (WIPO)   
237              243                                       NCBI   

              SourceID                                     Name  \
0      Compound::22360                    2-(acetyloxy)benzoate   
1    D001241::Compound                                  Aspir

Unnamed: 0,ReferenceNumber,SourceName,SourceID,Name,Description,URL,LicenseNote,LicenseURL,ANID,IsToxnet
0,1,BindingDB,Compound::22360,2-(acetyloxy)benzoate,The Binding Database projects aims to make exp...,https://www.bindingdb.org/rwd/bind/chemsearch/...,All data curated by BindingDB staff are provid...,https://www.bindingdb.org/rwd/bind/info.jsp,19593358.0,
1,12,Comparative Toxicogenomics Database (CTD),D001241::Compound,Aspirin,"CTD is a robust, publicly available database t...",https://ctdbase.org/detail.go?type=chem&acc=D0...,It is to be used only for research and educati...,http://ctdbase.org/about/legal.jsp,9023303.0,
2,28,Drug Gene Interaction database (DGIdb),rxcui:1191,ASPIRIN,"The Drug Gene Interaction Database (DGIdb, www...",https://www.dgidb.org/drugs/rxcui:1191,The data used in DGIdb is all open access and ...,http://www.dgidb.org/downloads,37458282.0,
3,30,DrugBank,DB00945,Acetylsalicylic acid,The DrugBank database is a unique bioinformati...,https://www.drugbank.ca/drugs/DB00945,Creative Common's Attribution-NonCommercial 4....,https://www.drugbank.ca/legal/terms_of_use,3594228.0,
4,74,IUPHAR/BPS Guide to PHARMACOLOGY,4139::Compound,aspirin,The IUPHAR database contains quantitative info...,https://www.guidetopharmacology.org/GRAC/Ligan...,The Guide to PHARMACOLOGY database is licensed...,https://www.guidetopharmacology.org/about.jsp#...,19629119.0,


In [4]:
##Prepping text and tokenizing

text: str = aspirin_df ["Description"].str.cat(sep= " ")
tokens: list[str] = word_tokenize(text)
tokens_lc: list[str] = [token.lower() for token in tokens if token not in string.punctuation]
wnl = WordNetLemmatizer()
tokens_lc_nostop_lemmatized: list[str] = [t for t in tokens_lc if t not in stopwords.words('english')]


bag_of_words: Counter = Counter(tokens_lc_nostop_lemmatized)


In [5]:
bag_of_words.most_common(10)

[('database', 142),
 ('information', 99),
 ('mass', 79),
 ('substances', 75),
 ('access', 67),
 ('massbank.eu', 60),
 ('ufz', 60),
 ('drugs', 53),
 ('provides', 52),
 ('spectra', 52)]