In [44]:
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import neattext as nt

bs4 Web Scraping for ARXIV

In [31]:
def scrape_arxiv(query, num_pages):
    base_url = "https://arxiv.org"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
    }

    all_papers = []
    for page in range(1, num_pages+1):
        url = f"{base_url}/search/?query={query}&searchtype=all&source=header&start={50*(page-1)}"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        papers = []
        for result in soup.find_all('li', class_='arxiv-result'):
            title = result.find('p', class_='title is-5 mathjax').text.strip()
            abstract = result.find('span', class_='abstract-full has-text-grey-dark mathjax').text.strip()
            authors = result.find('p', class_='authors').text.strip()
            pub_date = result.find('p', class_='is-size-7').text.strip().split(': ')[-1]
            # Extract tags with their data-tooltip attribute
            tags = [span['data-tooltip'] for span in result.find_all('span', class_=lambda x: x and 'tag is-small' in x)]
            papers.append({'Title': title, 'Abstract': abstract, 'Authors': authors, 'Publication Date': pub_date, 'Tags': tags})
        
        all_papers.extend(papers)

    return pd.DataFrame(all_papers)

# EDIT the search query here
query = 'Engineering'

# EDIT the number of papers to scrape here
num_pages = 10

df = scrape_arxiv(query, num_pages)
print(df.shape)
df

(500, 5)


Unnamed: 0,Title,Abstract,Authors,Publication Date,Tags
0,Raman-phonon-polariton condensation in a trans...,Phonon polaritons are hybrid states of light a...,"Authors:\nAlexander N. Bourzutschky, \n \...","Submitted 8 May, 2024; \n originally anno...","[Mesoscale and Nanoscale Physics, Strongly Cor..."
1,Attention-Driven Training-Free Efficiency Enha...,Diffusion Models (DMs) have exhibited superior...,"Authors:\nHongjie Wang, \n \n Difan ...","Submitted 8 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Arti..."
2,SVDD Challenge 2024: A Singing Voice Deepfake ...,The rapid advancement of AI-generated singing ...,"Authors:\nYou Zhang, \n \n Yongyi Za...","Submitted 8 May, 2024; \n originally anno...","[Audio and Speech Processing, Artificial Intel..."
3,An LSTM-Based Chord Generation System Using Ch...,This paper proposes a system for chord generat...,Authors:\nJack Hardwick,"Submitted 8 May, 2024; \n originally anno...","[Sound, Machine Learning, Audio and Speech Pro..."
4,Cellular Traffic Prediction Using Online Predi...,The advent of 5G technology promises a paradig...,"Authors:\nHossein Mehri, \n \n Hao C...","Submitted 8 May, 2024; \n originally anno...","[Systems and Control, Machine Learning]"
...,...,...,...,...,...
495,Model Predictive Guidance for Fuel-Optimal Lan...,This paper introduces a landing guidance strat...,"Authors:\nKi-Wook Jung, \n \n Sang-D...","Submitted 2 May, 2024; \n originally anno...",[Systems and Control]
496,Causal Influence in Federated Edge Inference,"In this paper, we consider a setting where het...","Authors:\nMert Kayaalp, \n \n Yunus ...","Submitted 2 May, 2024; \n originally anno...","[Machine Learning, Multiagent Systems, Signal ..."
497,Towards Consistent Object Detection via LiDAR-...,As human-machine interaction continues to evol...,"Authors:\nKai Luo, \n \n Hao Wu, \n ...","Submitted 2 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Robo..."
498,Prompt engineering paradigms for medical appli...,Prompt engineering is crucial for harnessing t...,"Authors:\nJamil Zaghir, \n \n Marco ...","Submitted 2 May, 2024; \n originally anno...","[Computation and Language, Machine Learning]"


bs4 Web Scraping for PubMed

In [48]:
def scrape_pubmed(query):
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term={query}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    articles = []
    for article in soup.find_all('article', class_='full-docsum'):

        lang = article.find('span', class_ = 'language spaced-citation-item citation-part')
        if lang and len(lang):
            continue

        title = article.find('a', class_='docsum-title')
        title = nt.remove_html_tags(title.text) if title else None
        title = title.replace('\n', '').strip()
        authors = article.find('span', class_='docsum-authors')
        tags_str = article.find('span', class_='docsum-journal-citation full-journal-citation').find_next('span').text.strip()
        tags = re.findall(r'\b[A-Z]+\b', tags_str.upper()) if tags_str else []
        journal = article.find('span', class_='docsum-journal-citation full-journal-citation').text.strip()
        date_match = re.search(r'\d{4}(?: [A-Z][a-z]+(?: \d{1,2})?)?', journal)
        date = date_match.group() if date_match else None
        abstract = article.find('div', class_='full-view-snippet')
        abstract = nt.remove_html_tags(abstract.text) if abstract else None
        abstract = abstract.replace('\n', '').strip()
        pmid = article.find('span', class_='docsum-pmid')
        
        articles.append({
            'Title': title,
            'Authors': authors,
            'Tags': tags,
            'Date': date,
            'Abstract': abstract,
            'PMID': pmid
        })
    
    return pd.DataFrame(articles)

# Scrape PubMed search results for the query "Engineering"
query = "Engineering"
df = scrape_pubmed(query)
df

Unnamed: 0,Title,Authors,Tags,Date,Abstract,PMID
0,Engineering ethics within accident analysis mo...,"[Haghighattalab S, Chen A, Fan Y, Mohammadi R.]","[ACCID, ANAL, PREV]",2019 Aug,Then engineering ethics as an element of human...,[31150918]
1,"Bio-enabled Engineering of Multifunctional ""Li...","[Arnold DP, Takatori SC.]","[ACS, NANO]",2023 Jun 27,"Using active matter surfaces, for example, our...",[37294942]
2,Mini-review: Rehabilitation engineering: Resea...,"[Satpute S, Cooper R, Dicianno BE, Joseph J, C...","[NEUROSCI, LETT]",2021 Nov 1,Rehabilitation Engineering is the use of engin...,[34478814]
3,Engineering microbial diagnostics and therapeu...,"[Amrofell MB, Rottinghaus AG, Moon TS.]","[CURR, OPIN, BIOTECHNOL]",2020 Dec,Microbes have become an increasingly powerful ...,[32563763]
4,Synthetic biology-inspired cell engineering in...,"[Zhao N, Song Y, Xie X, Zhu Z, Duan C, Nong C,...","[SIGNAL, TRANSDUCT, TARGET, THER]",2023 Mar 11,Such cell engineering resources can play a cri...,[36906608]
5,"Chemical Engineering in the ""BIO"" World.","[Chiarappa G, Grassi M, Abrami M, Abbiati RA, ...","[CURR, DRUG, DELIV]",2017,"Whitaker, in 1914, affirmed that the differenc...",[27264726]
6,Engineering and Evolution of Saccharomyces cer...,"[Turner TL, Kim H, Kong II, Liu JJ, Zhang GC, ...","[ADV, BIOCHEM, ENG, BIOTECHNOL]",2018,"As well as shifting away from fossil fuels, th...",[27913828]
7,Biomaterials for Bone Regenerative Engineering.,"[Yu X, Tang X, Gohil SV, Laurencin CT.]","[ADV, HEALTHC, MATER]",2015 Jun 24,Strategies for bone tissue regeneration have b...,[25846250]
8,CRISPR/Cas-based Human T cell Engineering: Bas...,"[Bernard BE, Landmann E, Jeker LT, Schumann K.]","[IMMUNOL, LETT]",2022 May,Engineering human T cells for the treatment of...,[35358611]
