<a href="https://colab.research.google.com/github/atjoelpark/ml-disparities-mit/blob/master/pull_preprocessing/LCP_Pull_Extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pull and Extraction of LCP PMIDs

This notebook provides documentation and code for pulling metadata for a list of PubMed IDs (PMIDs).

Reference: https://lcp.mit.edu/publications

In [None]:
# Customization
# Enter the folder path inside your Google Drive (the part after /content/drive/) where your files should be saved.
# This is the only value users need to modify; afterwards, run all the cells in order.
# NOTE: despite its name, this variable holds a folder path, not a URL.
google_drive_url = ""

# Libraries and Mounting Google Drive

In [None]:
# Importing libraries
import numpy as np 
import pandas as pd 
import re 
import requests
from bs4 import BeautifulSoup

In [None]:
# Mounting Google Drive if using Google Drive
# NOTE(review): google_drive_url is appended to the mount point itself, so a non-empty
# value mounts Drive at /content/drive/<google_drive_url> instead of /content/drive —
# confirm this is intended rather than mounting at /content/drive and saving into the sub-folder.
from google.colab import drive
drive.mount(f'/content/drive/{google_drive_url}')

Mounted at /content/drive/


## Defining Functions

In [None]:
# Installing E-utilities Entrez Direct
def e_utilities_install():
  """
  Installs e_utilities (NCBI Entrez Direct command-line tools) through shell commands.

  Downloads install-edirect.sh with curl, runs it with bash, appends a PATH export
  line for the edirect folder to the user's .bash_profile, and finally deletes the
  downloaded installer script.

  Uses IPython shell escapes, so it only works inside a notebook / IPython session.
  Reference: https://www.ncbi.nlm.nih.gov/books/NBK179288/
  """
  # Download the installer script into the current working directory
  !curl -L https://www.ncbi.nlm.nih.gov/books/NBK179288/bin/install-edirect.sh > install-edirect.sh
  # Run the installer; NOTE(review): it may still ask for confirmation interactively
  !bash install-edirect.sh -y
  # Append the edirect PATH export to the login profile for future shells
  !echo 'export PATH=\$PATH:\$HOME/edirect' >> $HOME/.bash_profile
  # Remove the installer script now that installation has run
  !rm install-edirect.sh

In [None]:
# Setting development environment for Selenium
def setup_dev_environment():
  """
  Installs chromium, driver and selenium
  Sets options to be headless
  Opens a website and prepares Selenium for use
  Returns: webdriver
  """

  # install chromium, its driver, and selenium
  !apt update
  !apt install chromium-chromedriver
  !pip install selenium
  !pip install webdriver_manager
  # set options to be headless, ..
  from selenium import webdriver
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  # open it, go to a website, and get results
  print("Chromium, Driver and Selenium successfully started....")
  return webdriver.Chrome(options=options)

In [None]:
# Defining Functions
def pull_pmid(year: int, wb) -> list:
  """
  Scrapes the PMIDs listed for one year on https://lcp.mit.edu/publications.

  Loads the publications page with the `#P_<year>` fragment through the given
  webdriver, selects the citation links with a CSS selector, and extracts the
  number from every "(PMID: ...)" fragment found in the link text.

  Dependencies: Selenium, chromium-chromedriver, webdriver_manager, re
  @param year: This int contains the year to scrape from in https://lcp.mit.edu/publications
  @param wb: Passes in the webdriver for Selenium
  @return: a list of PMIDs (ints); empty when no PMID is found or a TypeError is caught
  """
  try:
    # Uses the webdriver that was passed in (not a notebook global) to search by CSS
    URL = f'https://lcp.mit.edu/publications#P_{year}'
    wb.get(URL)
    # find_elements(by, value) is available in both Selenium 3 and 4, unlike the
    # removed find_elements_by_css_selector helper; 'css selector' is By.CSS_SELECTOR
    _links = wb.find_elements('css selector', '.bib2xhtml a+ a')

    # Collects the first run of digits from every "(PMID: ...)" fragment
    _pmid_list = []
    for _link in _links:
      for _fragment in re.findall(r'\(PMID:.*\)', _link.text):
        _digits = re.search(r'\d+', _fragment)
        # Skips fragments without any digits instead of failing with IndexError
        if _digits:
          _pmid_list.append(int(_digits.group()))
    return _pmid_list

  except TypeError as e:
    print("Error raised while pulling PMID...")
    print(e)
    # Keeps the declared return type so callers can still iterate over the result
    return []

In [None]:
# Defining Functions
def pull_pmid_metadata(pmid: list) -> pd.DataFrame:
  """
  This is dependent on E-utilities

  @param pmid: Takes a list of PMIDs produced by function pull_pmid
  @return: Returns a Pandas DataFrame
  @raise keyError: raises an exception
  """
  for i in pmid:
    _temp = f'$HOME/edirect/efetch -db pubmed -id {i} -format xml | $HOME/edirect/xtract -pattern PubmedArticle -block Author \
      -sep " " -tab "| " -element ForeName,LastName'
    _result = !{_temp}
    print(_result)

# Main

In [None]:
%%time
# Install E-utilities. When prompted "Would you like to do that automatically now?", please answer 'y'.
e_utilities_install()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   665  100   665    0     0   1330      0 --:--:-- --:--:-- --:--:--  1327

Entrez Direct has been successfully downloaded and installed.

In order to complete the configuration process, please execute the following:

  echo "export PATH=\${PATH}:/root/edirect" >> $HOME/.bashrc

or manually edit the PATH variable assignment in your .bashrc file.

Would you like to do that automatically now? [y/N]
y
OK, done.
CPU times: user 386 ms, sys: 87.2 ms, total: 474 ms
Wall time: 1min 6s


In [None]:
%%time 
# Sets up the headless Chrome web driver that is passed to pull_pmid
wd = setup_dev_environment()

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Ign:13 https://dev

In [None]:
%%time
# Scrape the 2020 PMIDs from the LCP publications page
pmid_list_2020 = pull_pmid(2020, wd)

CPU times: user 661 ms, sys: 31.8 ms, total: 692 ms
Wall time: 9.81 s


In [None]:
%%time 
# Fetch and print the author list for each 2020 PMID
pull_pmid_metadata(pmid_list_2020)

['Lawrence Baker| Jason H Maley| Aldo Arévalo| Francis DeMichele| Roselyn Mateo-Collado| Stan Finkelstein| Leo Anthony Celi']
['John Danziger| Miguel Ángel Armengol de la Hoz| Wenyuan Li| Matthieu Komorowski| Rodrigo Octávio Deliberato| Barret N M Rush| Kenneth J Mukamal| Leo Celi| Omar Badawi']
['Marta Fernandes| Rúben Mendes| Susana M Vieira| Francisca Leite| Carlos Palos| Alistair Johnson| Stan Finkelstein| Steven Horng| Leo Anthony Celi']
['Marta Fernandes| Rúben Mendes| Susana M Vieira| Francisca Leite| Carlos Palos| Alistair Johnson| Stan Finkelstein| Steven Horng| Leo Anthony Celi']
['Yuan Lai| Wesley Yeung| Leo Anthony Celi']
['Liam G McCoy| Sujay Nagaraj| Felipe Morgado| Vinyas Harish| Sunit Das| Leo Anthony Celi']
['Stuart McLennan| Leo Anthony Celi| Alena Buyx']
['Trishan Panch| Tom J Pollard| Heather Mattie| Emily Lindemer| Pearse A Keane| Leo Anthony Celi']
['Wesley Yeung| Kennedy Ng| J M Nigel Fong| Judy Sng| Bee Choo Tai| Sin Eng Chia']
CPU times: user 186 ms, sys: 75.6 

In [None]:
# Ignore below
# Displays the PMIDs scraped for 2020 as a quick sanity check
pmid_list_2020

[32612144,
 31948262,
 32126097,
 32240233,
 32248145,
 32577533,
 32449686,
 32577534,
 32432708]

In [None]:
# Capture the PubMed XML record of a single PMID as a list of output lines
test = !$HOME/edirect/efetch -db pubmed -id 32612144 -format xml

In [None]:
# Show the captured efetch output lines
test

['<?xml version="1.0" encoding="UTF-8" ?>',
 '<!DOCTYPE PubmedArticleSet>',
 '<PubmedArticleSet>',
 '  <PubmedArticle>',
 '    <MedlineCitation Status="MEDLINE" Owner="NLM">',
 '      <PMID Version="1">32612144</PMID>',
 '      <DateCompleted>',
 '        <Year>2020</Year>',
 '        <Month>12</Month>',
 '        <Day>04</Day>',
 '      </DateCompleted>',
 '      <DateRevised>',
 '        <Year>2021</Year>',
 '        <Month>07</Month>',
 '        <Day>01</Day>',
 '      </DateRevised>',
 '      <Article PubModel="Electronic">',
 '        <Journal>',
 '          <ISSN IssnType="Electronic">2045-2322</ISSN>',
 '          <JournalIssue CitedMedium="Internet">',
 '            <Volume>10</Volume>',
 '            <Issue>1</Issue>',
 '            <PubDate>',
 '              <Year>2020</Year>',
 '              <Month>07</Month>',
 '              <Day>01</Day>',
 '            </PubDate>',
 '          </JournalIssue>',
 '          <Title>Scientific reports</Title>',
 '          <ISOAbbreviatio

In [None]:
# Extract the DateCompleted year, month and day of one article, joined with " | "
!$HOME/edirect/efetch -db pubmed -id 32612144 -format xml | $HOME/edirect/xtract -pattern PubmedArticle -block DateCompleted -sep " | " -element Year,Month,Day

2020 | 12 | 04


In [None]:
# Extract PMID, article title and comma-separated author names of one article as a "|"-delimited row
!$HOME/edirect/efetch -db pubmed -id 32612144 -format xml | $HOME/edirect/xtract -pattern PubmedArticle -tab "|" -def "NULL" -sep "," -element MedlineCitation/PMID ArticleTitle -block Author -tab "," -sep " " -element ForeName,LastName

32612144|Real-world characterization of blood glucose control and insulin use in the intensive care unit.|Lawrence Baker,Jason H Maley,Aldo Arévalo,Francis DeMichele,Roselyn Mateo-Collado,Stan Finkelstein,Leo Anthony Celi


In [None]:
from xml.dom import minidom
# https://stackabuse.com/reading-and-writing-xml-files-in-python
a = !$HOME/edirect/efetch -db pubmed -id 32612144 -format xml
mydoc = minidom.parse(a)

AttributeError: ignored

In [None]:
# Inspect the type of the captured shell output (a list-like of output lines)
type(a)

IPython.utils.text.SList