In [154]:
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

Web Scraping for arXiv

In [155]:
def scrape_arxiv(query, num_pages):
    """Scrape arXiv search-result pages for ``query`` into a DataFrame.

    Parameters
    ----------
    query : str
        Search term. It is sent as the ``query`` URL parameter and is
        URL-encoded by ``requests`` (spaces, ``&``, ``#`` ... are safe).
    num_pages : int
        Number of result pages to request. Page ``N`` is requested with
        ``start = 50 * (N - 1)``.

    Returns
    -------
    pandas.DataFrame
        One row per ``li.arxiv-result`` element, with the columns
        ``Title``, ``Abstract``, ``Authors``, ``Publication Date`` and
        ``Tags``. Text columns hold the stripped element text as-is; a
        field whose element is absent from the page is ``None``. The
        columns are present even when nothing was scraped.

    Raises
    ------
    requests.HTTPError
        If a page request returns an unsuccessful status code.
    requests.RequestException
        On connection problems or when a request exceeds the timeout.
    """
    search_url = "https://arxiv.org/search/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
    }
    columns = ['Title', 'Abstract', 'Authors', 'Publication Date', 'Tags']
    # Offset step between pages; assumes the search page lists 50 results per page.
    results_per_page = 50

    def _text(node):
        """Return the stripped text of a tag, or None when the tag was not found."""
        return node.text.strip() if node is not None else None

    all_papers = []
    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "searchtype": "all",
            "source": "header",
            "start": results_per_page * (page - 1),
        }
        response = requests.get(search_url, headers=headers, params=params, timeout=30)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        results = soup.find_all('li', class_='arxiv-result')
        if not results:
            # No result items on this page: stop instead of requesting further offsets.
            break

        for result in results:
            pub_date = _text(result.find('p', class_='is-size-7'))
            if pub_date is not None:
                pub_date = pub_date.split(': ')[-1]
            # Extract tags with their data-tooltip attribute (skip spans that lack it)
            tags = [
                span['data-tooltip']
                for span in result.find_all('span', class_=lambda x: x and 'tag is-small' in x)
                if span.has_attr('data-tooltip')
            ]
            all_papers.append({
                'Title': _text(result.find('p', class_='title is-5 mathjax')),
                'Abstract': _text(result.find('span', class_='abstract-full has-text-grey-dark mathjax')),
                'Authors': _text(result.find('p', class_='authors')),
                'Publication Date': pub_date,
                'Tags': tags,
            })

    # Passing the columns explicitly keeps the schema stable for an empty scrape.
    return pd.DataFrame(all_papers, columns=columns)

# EDIT the search query here
query = 'Engineering'

# EDIT the number of result PAGES to scrape here (not the number of papers):
# scrape_arxiv advances the `start` offset by 50 per page, so 10 pages -> up to 500 rows.
num_pages = 10

df = scrape_arxiv(query, num_pages)
print(df.shape)
df

(500, 5)


Unnamed: 0,Title,Abstract,Authors,Publication Date,Tags
0,Raman-phonon-polariton condensation in a trans...,Phonon polaritons are hybrid states of light a...,"Authors:\nAlexander N. Bourzutschky, \n \...","Submitted 8 May, 2024; \n originally anno...","[Mesoscale and Nanoscale Physics, Strongly Cor..."
1,Attention-Driven Training-Free Efficiency Enha...,Diffusion Models (DMs) have exhibited superior...,"Authors:\nHongjie Wang, \n \n Difan ...","Submitted 8 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Arti..."
2,SVDD Challenge 2024: A Singing Voice Deepfake ...,The rapid advancement of AI-generated singing ...,"Authors:\nYou Zhang, \n \n Yongyi Za...","Submitted 8 May, 2024; \n originally anno...","[Audio and Speech Processing, Artificial Intel..."
3,An LSTM-Based Chord Generation System Using Ch...,This paper proposes a system for chord generat...,Authors:\nJack Hardwick,"Submitted 8 May, 2024; \n originally anno...","[Sound, Machine Learning, Audio and Speech Pro..."
4,Cellular Traffic Prediction Using Online Predi...,The advent of 5G technology promises a paradig...,"Authors:\nHossein Mehri, \n \n Hao C...","Submitted 8 May, 2024; \n originally anno...","[Systems and Control, Machine Learning]"
...,...,...,...,...,...
495,Model Predictive Guidance for Fuel-Optimal Lan...,This paper introduces a landing guidance strat...,"Authors:\nKi-Wook Jung, \n \n Sang-D...","Submitted 2 May, 2024; \n originally anno...",[Systems and Control]
496,Causal Influence in Federated Edge Inference,"In this paper, we consider a setting where het...","Authors:\nMert Kayaalp, \n \n Yunus ...","Submitted 2 May, 2024; \n originally anno...","[Machine Learning, Multiagent Systems, Signal ..."
497,Towards Consistent Object Detection via LiDAR-...,As human-machine interaction continues to evol...,"Authors:\nKai Luo, \n \n Hao Wu, \n ...","Submitted 2 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Robo..."
498,Prompt engineering paradigms for medical appli...,Prompt engineering is crucial for harnessing t...,"Authors:\nJamil Zaghir, \n \n Marco ...","Submitted 2 May, 2024; \n originally anno...","[Computation and Language, Machine Learning]"


REST API Scraping for IEEE Xplore

In [164]:
def scrape_ieee(query, num_pages):
    """Query the IEEE Xplore REST search endpoint and return the records as a DataFrame.

    Parameters
    ----------
    query : str
        Search text, sent as ``queryText`` in the JSON request body.
    num_pages : int
        Number of result pages to request (``pageNumber`` 1..num_pages).

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of every entry found under the ``records`` key
        of the responses; an empty frame when no records were returned.

    Raises
    ------
    requests.HTTPError
        If a request returns an unsuccessful status code.
    requests.RequestException
        On connection problems or when a request exceeds the timeout.
    """
    # Local import keeps this cell self-contained (the import cell does not provide it).
    from urllib.parse import quote

    url = "https://ieeexplore.ieee.org/rest/search"
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        # The query is percent-encoded: a raw non-Latin-1 query cannot be sent as a
        # header value, and spaces/& would otherwise corrupt the referer URL.
        "referer": f"https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText={quote(query)}",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "x-security-request": "required"
    }

    all_records = []
    for page in range(1, num_pages + 1):
        payload = {
            "newsearch": True,
            "queryText": query,
            "highlight": True,
            "returnFacets": ["ALL"],
            "returnType": "SEARCH",
            "matchPubs": True,
            "pageNumber": page
        }

        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()  # Raise an exception for unsuccessful requests

        # `or []` also covers a response whose "records" value is null.
        records = response.json().get('records') or []
        if not records:
            # An empty page means there is nothing further to page through.
            break
        all_records.extend(records)

    # Create DataFrame from all records
    return pd.json_normalize(all_records)

# EDIT the search query here
query = "Engineering"

# EDIT pagination here: number of result pages requested from the IEEE search endpoint
num_pages = 1

# Columns of the normalized search records that are kept for the analysis.
selected_columns = ['authors', 'publicationNumber', 'publicationDate', 'articleNumber',
                    'articleTitle', 'downloadCount',  'abstract', 'articleContentType']
df = scrape_ieee(query, num_pages)
# .copy() makes selected_df an independent frame, so adding columns to it further
# down does not write into a slice of df (which raises SettingWithCopyWarning).
selected_df = df[selected_columns].copy()

def scrape_each_author(authorId):
    """Look up an IEEE Xplore author and return the country of their first current affiliation.

    The country is taken to be the text after the last ``', '`` in the first
    entry of the author's ``currentAffiliations`` list.

    Parameters
    ----------
    authorId
        Author identifier used in the ``/rest/author/{authorId}`` URL.
        ``None`` (e.g. an author entry that carries no id) yields ``None``
        without making a request.

    Returns
    -------
    str or None
        The trailing component of the first current affiliation, or ``None``
        when the id is missing, the response is empty, the author has no
        current affiliation, or the extracted value is empty.

    Raises
    ------
    requests.HTTPError
        If the request returns an unsuccessful status code.
    requests.RequestException
        On connection problems or when the request exceeds the timeout.
    """
    if authorId is None:
        return None

    url = f"https://ieeexplore.ieee.org/rest/author/{authorId}"
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        "referer": "https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText=Engineering",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        # NOTE(review): this value ends at "Chrome/" with no version, unlike the
        # user-agent used by scrape_ieee - confirm whether that is intentional.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Raise an exception for unsuccessful requests

    data = response.json()
    if not data:
        return None
    # `or []` also covers a profile whose "currentAffiliations" value is null.
    current_affiliations = data[0].get('currentAffiliations') or []
    if not current_affiliations:
        return None
    # str.split always returns at least one element, so [-1] is safe; an empty
    # result is reported as None rather than as an empty string.
    country = current_affiliations[0].split(', ')[-1]
    return country or None

# Per-run cache so an author who appears on several papers is looked up only once
# (every scrape_each_author call is a separate HTTP request).
_author_country_cache = {}


def _author_list(authors):
    """Return the record's list of author dicts, or [] when the record has none."""
    return authors if isinstance(authors, list) else []


def _author_country(author):
    """Return the (cached) affiliation country for one author dict; None if it has no id."""
    author_id = author.get('id')
    if author_id is None:
        return None
    if author_id not in _author_country_cache:
        _author_country_cache[author_id] = scrape_each_author(author_id)
    return _author_country_cache[author_id]


# Rebuilt from df on every run so the cell can be re-executed: the original
# in-place version failed on a second run because 'authors' was already dropped.
# .assign() returns a new frame, so nothing is written into a slice of df.
# The column name 'authorsAffilationCountry' (sic) is kept unchanged for compatibility.
selected_df = (
    df[selected_columns]
    .assign(
        authorsName=lambda frame: frame['authors'].apply(
            lambda authors: [author.get('preferredName') for author in _author_list(authors)]
        ),
        authorsAffilationCountry=lambda frame: frame['authors'].apply(
            lambda authors: [_author_country(author) for author in _author_list(authors)]
        ),
    )
    .drop(columns=['authors'])
)
print(selected_df.shape)
selected_df

(25, 9)


Unnamed: 0,publicationNumber,publicationDate,articleNumber,articleTitle,downloadCount,abstract,articleContentType,authorsName,authorsAffilationCountry
0,8718248,8-11 April 2019,8725097,A System Engineering Approach in Orienting Tra...,837,The present work presents a system [::engineer...,Conferences,"[Adel Alblawi, Mohammad Nawab, Abdulaziz Alsay...","[Saudi Arabia, Kingdom of Saudi Arabia, Kingdo..."
1,8564422,27-29 June 2018,8593453,Portuguese academic staff and students in UK’s...,97,"Before the EU referendum in June 2016, the pro...",Conferences,"[Inês Direito, Stella Fowler]","[Portugal, United Kingdom]"
2,10,July 2013,6519290,Medical and Biological Engineering in the Next...,6755,"In 2011, the American Institute for Medical an...",Journals,[College of Fellows American Institute for Med...,[USA]
3,9121928,27-30 April 2020,9125119,Bridging the gaps in engineering curriculum th...,166,This paper presents a system [::engineering::]...,Conferences,"[Mohammad Nawab, Adel Alblawi, Abdulaziz Alsay...","[Kingdom of Saudi Arabia, Saudi Arabia, Kingdo..."
4,7474421,5-6 April 2016,7474475,"Strengthening the ""Engineering"" in Software En...",2385,"In the fall of 2015, Stevens Institute of Tech...",Conferences,[Linda Laird],[USA]
5,9657196,15-18 Nov. 2021,9657437,Evaluation of Engineering Ethics in the Mechan...,171,"In recent times, concerns have been raised abo...",Conferences,"[Paula O. V. Henry, Earle A. Wilson, Trevor G....","[Jamaica, Jamaica, Jamaica]"
6,6636319,26-29 Aug. 2013,6654424,Biotronic Engineering curriculum design: Integ...,107,A specialised major in Biotronic [::Engineerin...,Conferences,"[Hamid GholamHosseini, Krishnamachar Prasad]","[New Zealand, New Zealand]"
7,8454953,13-16 Nov. 2017,8467167,A review on issues and challenges in incorpora...,150,"In the 21st Century, there is an increasing de...",Conferences,"[Fathiyah Mohd Kamaruzaman, Roszilah Hamid, Az...","[Malaysia, Malaysia, Malaysia]"
8,9429076,14-17 March 2021,9429123,Plenary: Map of Generic Competences in Enginee...,37,The Plenary Agreement of the Council of Argent...,Conferences,"[Sandra Daniela Cirimelo, Mónica Pascual, Robe...","[Argentina, Argentina, Argentina, Argentina]"
9,13,April 2024,10453596,Engineering Identity and Smartness Identity as...,129,Contribution: This study examined the role of ...,Journals,"[Cassie Wallwey, Emily Dringenberg, Bailey Bra...","[USA, USA, USA, USA, USA]"
