In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Scraping

In [2]:
# Navigate to PubMed
driver = webdriver.Chrome("chromedriver.exe") # Need to download this before hand from (https://chromedriver.chromium.org/downloads)
original_url = "https://pubmed.ncbi.nlm.nih.gov/"
driver.get(original_url)

# Find the search box and input topic
search = driver.find_element(By.NAME, "term")
topic = "Response to immune checkpoint inhibitors" # change this to your own specifications
search.send_keys(topic)
search.send_keys(Keys.RETURN)
search_url = driver.current_url
print(search_url)
r = requests.get(search_url)
soup = BeautifulSoup(r.content, "html.parser")
results = soup.find(id="search-results")
articles = results.find_all("a", class_="docsum-title")

# Find max number of pages
page_search_max = soup.find(id="bottom-page-number-input")
page_search_max = str(page_search_max)
max_index = page_search_max.find("max=") + 4
max_str = page_search_max[max_index:164]
max_str = max_str.replace('"', ' ').strip()
print(f"Total Pages: {max_str}")
max_int = int(max_str) # you can change max_int to equal the number of pages you want scraped. Each page has about 10 articles


# List of article names and links 
article_title_lst = []
article_links = []
full_article_links = []

for i in range(max_int): # Loop through and scrape each page
    print(f"Page: {i+1}")
    page_search = driver.find_element(By.ID, "bottom-page-number-input")
    page_search.clear()
    page_search.send_keys(i+1)
    page_search.send_keys(Keys.RETURN)
    new_url = driver.current_url
    r = requests.get(new_url)
    soup = BeautifulSoup(r.content, "html.parser")
    results = soup.find(id="search-results")
    articles = results.find_all("a", class_="docsum-title")
    
    for link in articles:   # Get abstract links
        link = link["href"]
        interest_article = f"{original_url}{link}"
        article_links.append(interest_article)
    
    for link in article_links:  # Get full article links
        r = requests.get(link)
        actual_article = BeautifulSoup(r.content, "html.parser")
        actual_article = actual_article.find(class_="link-item")
        actual_article_link = actual_article["href"]
        full_article_links.append(actual_article_link)
            
    for title in articles:  # Get article names
        article_title = title
        article_title = article_title.text.replace('\n\n', '\n').replace('  ','').lstrip().splitlines()
        article_title_lst.append(article_title)
    break # remove this line to get articles on the next pages
        
driver.close()

  driver = webdriver.Chrome("chromedriver.exe") # Need to download this before hand from (https://chromedriver.chromium.org/downloads)


https://pubmed.ncbi.nlm.nih.gov/?term=Response+to+immune+checkpoint+inhibitors
Total Pages: 1000
Page: 1


# Export to CSV

In [3]:
# Make a DF with article names and lists and export
article_df = pd.DataFrame(article_title_lst)
article_df[1] = pd.DataFrame(article_links)
article_df[2] = pd.DataFrame(full_article_links)
df = article_df.rename(columns = {"0" : "Title", "1" : "Link"})
# display(df)

df.to_csv(f"{topic} Articles.csv")
clean_df = pd.read_csv(f"{topic} Articles.csv", index_col = "0")
clean_df.drop("Unnamed: 0", axis = 1, inplace = True)
clean_df = clean_df.rename(columns = {"1" : "Abstract", "2" : "Full Article"})
clean_df.index.names = ["Title"]
display(clean_df)
clean_df.to_csv(f"{topic} Articles.csv") # Sends Articles to your Desktop

Unnamed: 0_level_0,Abstract,Full Article
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
PD-L1 as a biomarker of response to immune-checkpoint inhibitors.,https://pubmed.ncbi.nlm.nih.gov//33580222/,https://doi.org/10.1038/s41571-021-00473-5
Immune Checkpoint Inhibitors for the Treatment of Cancer: Clinical Impact and Mechanisms of Response and Resistance.,https://pubmed.ncbi.nlm.nih.gov//33197221/,https://arjournals.annualreviews.org/doi/10.11...
"Hallmarks of response, resistance, and toxicity to immune checkpoint blockade.",https://pubmed.ncbi.nlm.nih.gov//34624224/,https://linkinghub.elsevier.com/retrieve/pii/S...
Immune checkpoint inhibitors in melanoma.,https://pubmed.ncbi.nlm.nih.gov//34509219/,https://linkinghub.elsevier.com/retrieve/pii/S...
Immunostimulation with chemotherapy in the era of immune checkpoint inhibitors.,https://pubmed.ncbi.nlm.nih.gov//32760014/,https://doi.org/10.1038/s41571-020-0413-z
Single-cell analyses reveal key immune cell subsets associated with response to PD-L1 blockade in triple-negative breast cancer.,https://pubmed.ncbi.nlm.nih.gov//34653365/,https://linkinghub.elsevier.com/retrieve/pii/S...
Epigenetic Mechanisms of Resistance to Immune Checkpoint Inhibitors.,https://pubmed.ncbi.nlm.nih.gov//32708698/,https://www.mdpi.com/resolver?pii=biom10071061
Predictive biomarkers of response to immune checkpoint inhibitors.,https://pubmed.ncbi.nlm.nih.gov//32646345/,https://dx.doi.org/10.7399/fh.11328
Role of Immunotherapy in Triple-Negative Breast Cancer.,https://pubmed.ncbi.nlm.nih.gov//32259782/,https://jnccn.org/doi/10.6004/jnccn.2020.7554
Predictive biomarkers of response to immune checkpoint inhibitors in melanoma.,https://pubmed.ncbi.nlm.nih.gov//31997676/,https://www.tandfonline.com/doi/full/10.1080/1...
