FDA Web Scraping Script 
Sheyla Deccarett  
Env: d205  
Updated: 10/7/2023

In [1]:
import requests
from bs4 import BeautifulSoup
 
import pandas as pd

In [5]:
# Website I'm checking
baseURL = 'https://pubmed.ncbi.nlm.nih.gov/?term=%22Hemolysis%22%5Btiab%5D+OR+%22blood+damage%22%5Btiab%5D&filter=years.2015-2024'
print(baseURL)

https://pubmed.ncbi.nlm.nih.gov/?term=%22Hemolysis%22%5Btiab%5D+OR+%22blood+damage%22%5Btiab%5D&filter=years.2015-2024


In [20]:
# Send an HTTP request (internet browser asks server for info needed to load the website), save the results in 'response'
response = requests.get(baseURL)
print(response)

<Response [200]>


In [22]:
# Interpret response.content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser') #lxml = library to parse HTML and other languages
print(soup)

<!DOCTYPE html>

<html lang="en">
<head itemscope="" itemtype="http://schema.org/WebPage" prefix="og: http://ogp.me/ns#">
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!-- Mobile properties -->
<meta content="True" name="HandheldFriendly"/>
<meta content="320" name="MobileOptimized"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://cdn.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/62ee429b-2e4c-4716-a99e-7c9b4b5ec32b/CACHE/css/output.5ecf62baa0fa.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/62ee429b-2e4c-4716-a99e-7c9b4b5ec32b/CACHE/css/output.452c70ce66f7.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/62ee429b-2e4c-4716-a99e-7c9b4b5ec32b/CACHE/css/output.55dd827ca

In [7]:
# Initialize lists to store data
titles = []
authors_list = []  # Use a different variable name to avoid overwriting
journals = []
pmids = []
publication_times = []  # Define an empty list
affiliations_list = []  # Define an empty list

In [52]:
articles = soup.find_all('div', class_='docsum-content')
for article in articles:
    title = article.find('a', class_='docsum-title').text.strip()
    authors = article.find('span', class_='docsum-authors full-authors').text.strip()
    journal = article.find('span', class_='docsum-journal-citation full-journal-citation').text.strip()
    pmid = article.find('span', class_='citation-part').text.strip()
    time = article.find('span', class_='docsum-journal-citation full-journal-citation').text.strip()
 
    # Check if affiliations element exists before accessing its text
    affiliations_elem = article.find('span', class_='docsum-affiliation')
    affiliations = affiliations_elem.text.strip() if affiliations_elem else ""
 
    # Append data to respective lists
    titles.append(title)
    authors_list.append(authors)
    journals.append(journal)
    pmids.append(pmid)
    publication_times.append(time)
    affiliations_list.append(affiliations)

In [58]:
for article in articles:
    affiliations_elem = article.find('span', class_='docsum-affiliation')
    print(affiliations_elem)

None
None
None
None
None
None
None
None
None
None


In [53]:
for article in articles: print(article)

<div class="docsum-content">
<a class="docsum-title" data-article-id="32119415" data-full-article-url="from_term=%22Hemolysis%22%5Btiab%5D+OR+%22blood+damage%22%5Btiab%5D&amp;from_filter=years.2015-2024&amp;from_pos=1" data-ga-action="1" data-ga-category="result_click" data-ga-label="32119415" href="/32119415/" ref="linksrc=docsum_link&amp;article_id=32119415&amp;ordinalpos=1&amp;page=1">
              
                Streptococcus Pyogenes.
              
            </a>
<div class="docsum-citation full-citation">
<span class="docsum-authors full-authors">Kanwal S, Vaitla P.</span>
<span class="docsum-authors short-authors">Kanwal S, et al.</span>
<span class="docsum-journal-citation full-journal-citation">2023 Jul 31. In: StatPearls [Internet]. Treasure Island (FL): StatPearls Publishing; 2023 Jan–.</span>
<span class="docsum-journal-citation short-journal-citation">2023 Jul 31. In: StatPearls [Internet]. Treasure Island (FL): StatPearls Publishing; 2023 Jan–.</span>
<span class="c

In [54]:
titles

['Streptococcus Pyogenes.',
 'Managing hemolyzed samples in clinical laboratories.',
 'Causes, consequences and management of sample hemolysis in the clinical laboratory.',
 'Clinical Applications of Hemolytic Markers in the Differential Diagnosis and Management of Hemolytic Anemia.',
 'Mechanism of megaloblastic anemia combined with hemolysis.',
 'Mechanical blood trauma in assisted circulation: sublethal RBC damage preceding hemolysis.',
 'Blood damage in ventricular assist devices.',
 'Factors associated with hemolysis during extracorporeal membrane oxygenation (ECMO)-Comparison of VA- versus VV ECMO.',
 'Blood damage in ventricular assist devices.',
 '[Hemolytic disorders and venous thrombosis: An update].']

In [55]:
# Create a DataFrame to store the data
data = {
    'Title': titles,
    'Authors': authors,
    'Journal': journals,
    'PMID': pmids,
    'Publication Time': publication_times,
    'Affiliations': affiliations
}
df = pd.DataFrame(data)
 
# Display the DataFrame
print(df)

                                               Title  \
0                            Streptococcus Pyogenes.   
1  Managing hemolyzed samples in clinical laborat...   
2  Causes, consequences and management of sample ...   
3  Clinical Applications of Hemolytic Markers in ...   
4  Mechanism of megaloblastic anemia combined wit...   
5  Mechanical blood trauma in assisted circulatio...   
6        Blood damage in ventricular assist devices.   
7  Factors associated with hemolysis during extra...   
8        Blood damage in ventricular assist devices.   
9  [Hemolytic disorders and venous thrombosis: An...   

                                             Authors  \
0  Lecouffe-Desprets M, Graveleau J, Artifoni M, ...   
1  Lecouffe-Desprets M, Graveleau J, Artifoni M, ...   
2  Lecouffe-Desprets M, Graveleau J, Artifoni M, ...   
3  Lecouffe-Desprets M, Graveleau J, Artifoni M, ...   
4  Lecouffe-Desprets M, Graveleau J, Artifoni M, ...   
5  Lecouffe-Desprets M, Graveleau J, Artifoni M

In [57]:
df

Unnamed: 0,Title,Authors,Journal,PMID,Publication Time,Affiliations
0,Streptococcus Pyogenes.,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",2023 Jul 31. In: StatPearls [Internet]. Treasu...,PMID: 32119415,2023 Jul 31. In: StatPearls [Internet]. Treasu...,
1,Managing hemolyzed samples in clinical laborat...,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Crit Rev Clin Lab Sci. 2020 Jan;57(1):1-21. do...,PMID: 31603708,Crit Rev Clin Lab Sci. 2020 Jan;57(1):1-21. do...,
2,"Causes, consequences and management of sample ...","Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Clin Biochem. 2017 Dec;50(18):1317-1322. doi: ...,PMID: 28947321,Clin Biochem. 2017 Dec;50(18):1317-1322. doi: ...,
3,Clinical Applications of Hemolytic Markers in ...,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Dis Markers. 2015;2015:635670. doi: 10.1155/20...,PMID: 26819490,Dis Markers. 2015;2015:635670. doi: 10.1155/20...,
4,Mechanism of megaloblastic anemia combined wit...,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Bioengineered. 2021 Dec;12(1):6703-6712. doi: ...,PMID: 34542005,Bioengineered. 2021 Dec;12(1):6703-6712. doi: ...,
5,Mechanical blood trauma in assisted circulatio...,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Int J Artif Organs. 2016 Jun 15;39(4):150-9. d...,PMID: 27034320,Int J Artif Organs. 2016 Jun 15;39(4):150-9. d...,
6,Blood damage in ventricular assist devices.,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Int J Artif Organs. 2019 Mar;42(3):111-112. do...,PMID: 30862276,Int J Artif Organs. 2019 Mar;42(3):111-112. do...,
7,Factors associated with hemolysis during extra...,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",PLoS One. 2020 Jan 27;15(1):e0227793. doi: 10....,PMID: 31986168,PLoS One. 2020 Jan 27;15(1):e0227793. doi: 10....,
8,Blood damage in ventricular assist devices.,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Int J Artif Organs. 2016 Jun 15;39(4):147-9. d...,PMID: 27323915,Int J Artif Organs. 2016 Jun 15;39(4):147-9. d...,
9,[Hemolytic disorders and venous thrombosis: An...,"Lecouffe-Desprets M, Graveleau J, Artifoni M, ...",Rev Med Interne. 2019 Apr;40(4):232-237. doi: ...,PMID: 30773236,Rev Med Interne. 2019 Apr;40(4):232-237. doi: ...,


In [62]:
baseurl = 'https://pubmed.ncbi.nlm.nih.gov/'
affiliations = requests.get(f'{baseurl} + df.iloc[0:3]')
affiliations
                           

<Response [404]>

In [76]:
affiliations_url = []
for i in range(len(df['PMID'])):
    search_url = f'{baseurl}' + df.iloc[i, 3][6:]
    print(search_url)
    affiliations_url.append(search_url)
affiliations_url

https://pubmed.ncbi.nlm.nih.gov/32119415
https://pubmed.ncbi.nlm.nih.gov/31603708
https://pubmed.ncbi.nlm.nih.gov/28947321
https://pubmed.ncbi.nlm.nih.gov/26819490
https://pubmed.ncbi.nlm.nih.gov/34542005
https://pubmed.ncbi.nlm.nih.gov/27034320
https://pubmed.ncbi.nlm.nih.gov/30862276
https://pubmed.ncbi.nlm.nih.gov/31986168
https://pubmed.ncbi.nlm.nih.gov/27323915
https://pubmed.ncbi.nlm.nih.gov/30773236


['https://pubmed.ncbi.nlm.nih.gov/32119415',
 'https://pubmed.ncbi.nlm.nih.gov/31603708',
 'https://pubmed.ncbi.nlm.nih.gov/28947321',
 'https://pubmed.ncbi.nlm.nih.gov/26819490',
 'https://pubmed.ncbi.nlm.nih.gov/34542005',
 'https://pubmed.ncbi.nlm.nih.gov/27034320',
 'https://pubmed.ncbi.nlm.nih.gov/30862276',
 'https://pubmed.ncbi.nlm.nih.gov/31986168',
 'https://pubmed.ncbi.nlm.nih.gov/27323915',
 'https://pubmed.ncbi.nlm.nih.gov/30773236']

In [169]:
try:
    for affiliation_url in affiliations_url:
        content = requests.get(affiliation_url)
        soup2 = BeautifulSoup(content.text, 'html.parser')
        unique_affiliations = []
        affiliations = soup2.find_all('a', class_ = 'affiliation-link')
        for aff in affiliations:
            title = aff['title']
            if title in unique_affiliations: continue
            else: unique_affiliations.append(title)
        print(unique_affiliations)
except:
    print(f'Something wrong with {affiliation_url}')
        #print(content.text)

['Allama Iqbal Medical College, Lahore, Pakistan', 'University of Mississippi Medical Center']
['Department of Medical Laboratory Diagnostics, University Hospital "Sveti Duh", University of Zagreb, Faculty of Pharmacy and Biochemistry, Zagreb, Croatia.', 'Department of Laboratory Medicine, University of Washington, Seattle, WA, USA.', 'Department of Laboratory Medicine, Paracelsus Medical University Salzburg, Salzburg, Austria.', 'Department of Clinical Biochemistry, Cork University Hospital, Cork, Republic of Ireland.', 'Section of Clinical Biochemistry, University of Verona, Verona, Italy.']
['Department of Laboratory Medicine, Ziekenhuis Netwerk Antwerpen, Antwerp, Belgium. Electronic address: heireman.laura@gmail.com.', 'Department of Orthopedic Surgery, Ziekenhuis Netwerk Antwerpen, Antwerp, Belgium.', 'Department of Laboratory Medicine, Ziekenhuis Netwerk Antwerpen, Antwerp, Belgium.']
["U.O. Oncoematologia, Fondazione IRCCS Ca' Granda Ospedale Maggiore Policlinico di Milano, Via

In [140]:
soup2 = BeautifulSoup(content.text, 'html.parser')
print(soup2)

<!DOCTYPE html>

<html lang="en">
<head itemscope="" itemtype="http://schema.org/WebPage" prefix="og: http://ogp.me/ns#">
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!-- Mobile properties -->
<meta content="True" name="HandheldFriendly"/>
<meta content="320" name="MobileOptimized"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://cdn.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/62ee429b-2e4c-4716-a99e-7c9b4b5ec32b/CACHE/css/output.5ecf62baa0fa.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/62ee429b-2e4c-4716-a99e-7c9b4b5ec32b/CACHE/css/output.452c70ce66f7.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/62ee429b-2e4c-4716-a99e-7c9b4b5ec32b/CACHE/css/output.55dd827ca

In [166]:
unique_affiliations = []
affiliations = soup2.find_all('a', class_ = 'affiliation-link')
for aff in affiliations:
    title = aff['title']
    if title in unique_affiliations: continue
    else: unique_affiliations.append(title)
print(unique_affiliations)

['Service de médecine interne, CHU de Nantes, 1, place Alexis-Ricordeau, 44093 Nantes, France.',
 'Service de médecine interne, CHU de Nantes, 1, place Alexis-Ricordeau, 44093 Nantes, France. Electronic address: antoine.neel@univ-nantes.fr.']