In [58]:
from datetime import timedelta
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [67]:
def _stripped_text(node):
    """Return ``node``'s text with surrounding whitespace removed, or '' when ``node`` is None."""
    return node.text.strip() if node is not None else ""


def scrape_arxiv(query, num_pages):
    """Scrape arXiv search result pages into a DataFrame.

    Requests ``num_pages`` consecutive result pages for ``query`` from
    ``https://arxiv.org/search/`` (the ``start`` offset advances by 50 per
    page) and extracts one row per ``<li class="arxiv-result">`` element.

    Parameters
    ----------
    query : str
        Search terms. Sent through ``params`` so requests URL-encodes them
        (spaces, ``&``, ``#`` and non-ASCII text no longer break the URL).
    num_pages : int
        Number of result pages to request. Values below 1 request nothing.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Abstract``, ``Authors``, ``Publication Date`` and
        ``Tags`` (a list of the tag spans' ``data-tooltip`` values). The columns
        are present even when no result was scraped.

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status.
    requests.RequestException
        On connection problems or when a request exceeds the timeout.
    """
    base_url = "https://arxiv.org"
    results_per_page = 50  # offset step between consecutive result pages
    columns = ['Title', 'Abstract', 'Authors', 'Publication Date', 'Tags']
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
    }

    all_papers = []
    for page in range(1, num_pages + 1):
        response = requests.get(
            f"{base_url}/search/",
            params={
                "query": query,
                "searchtype": "all",
                "source": "header",
                "start": results_per_page * (page - 1),
            },
            headers=headers,
            timeout=30,  # without a timeout a stalled connection blocks forever
        )
        # Fail loudly instead of parsing an error page as "zero results".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for result in soup.find_all('li', class_='arxiv-result'):
            # A missing element yields '' rather than an AttributeError on None.
            title = _stripped_text(result.find('p', class_='title is-5 mathjax'))
            abstract = _stripped_text(result.find('span', class_='abstract-full has-text-grey-dark mathjax'))
            authors = _stripped_text(result.find('p', class_='authors'))
            pub_date = _stripped_text(result.find('p', class_='is-size-7')).split(': ')[-1]
            # Collect the data-tooltip of every span whose class contains
            # 'tag is-small'; spans lacking the attribute are skipped.
            tags = [
                span['data-tooltip']
                for span in result.find_all('span', class_=lambda x: x and 'tag is-small' in x)
                if span.has_attr('data-tooltip')
            ]
            all_papers.append({
                'Title': title,
                'Abstract': abstract,
                'Authors': authors,
                'Publication Date': pub_date,
                'Tags': tags,
            })

    # Passing the columns keeps the schema stable when nothing was scraped.
    return pd.DataFrame(all_papers, columns=columns)

# --- arXiv run settings (edit these two values) ---
# Search terms sent to arXiv.
query = 'Engineering'
# Number of result PAGES to fetch; the scraper advances 50 results per page.
num_pages = 10

# Run the scraper, report (rows, columns), then display the frame.
df = scrape_arxiv(query=query, num_pages=num_pages)
print(df.shape)
df

(500, 5)


Unnamed: 0,Title,Abstract,Authors,Publication Date,Tags
0,Raman-phonon-polariton condensation in a trans...,Phonon polaritons are hybrid states of light a...,"Authors:\nAlexander N. Bourzutschky, \n \...","Submitted 8 May, 2024; \n originally anno...","[Mesoscale and Nanoscale Physics, Strongly Cor..."
1,Attention-Driven Training-Free Efficiency Enha...,Diffusion Models (DMs) have exhibited superior...,"Authors:\nHongjie Wang, \n \n Difan ...","Submitted 8 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Arti..."
2,SVDD Challenge 2024: A Singing Voice Deepfake ...,The rapid advancement of AI-generated singing ...,"Authors:\nYou Zhang, \n \n Yongyi Za...","Submitted 8 May, 2024; \n originally anno...","[Audio and Speech Processing, Artificial Intel..."
3,An LSTM-Based Chord Generation System Using Ch...,This paper proposes a system for chord generat...,Authors:\nJack Hardwick,"Submitted 8 May, 2024; \n originally anno...","[Sound, Machine Learning, Audio and Speech Pro..."
4,Cellular Traffic Prediction Using Online Predi...,The advent of 5G technology promises a paradig...,"Authors:\nHossein Mehri, \n \n Hao C...","Submitted 8 May, 2024; \n originally anno...","[Systems and Control, Machine Learning]"
...,...,...,...,...,...
495,Model Predictive Guidance for Fuel-Optimal Lan...,This paper introduces a landing guidance strat...,"Authors:\nKi-Wook Jung, \n \n Sang-D...","Submitted 2 May, 2024; \n originally anno...",[Systems and Control]
496,Causal Influence in Federated Edge Inference,"In this paper, we consider a setting where het...","Authors:\nMert Kayaalp, \n \n Yunus ...","Submitted 2 May, 2024; \n originally anno...","[Machine Learning, Multiagent Systems, Signal ..."
497,Towards Consistent Object Detection via LiDAR-...,As human-machine interaction continues to evol...,"Authors:\nKai Luo, \n \n Hao Wu, \n ...","Submitted 2 May, 2024; \n originally anno...","[Computer Vision and Pattern Recognition, Robo..."
498,Prompt engineering paradigms for medical appli...,Prompt engineering is crucial for harnessing t...,"Authors:\nJamil Zaghir, \n \n Marco ...","Submitted 2 May, 2024; \n originally anno...","[Computation and Language, Machine Learning]"


In [113]:
def scrape_ieee(query, num_pages):
    """Fetch IEEE Xplore search results through its REST search endpoint.

    Posts one JSON search request per page (``pageNumber`` 1..``num_pages``)
    to ``https://ieeexplore.ieee.org/rest/search`` and gathers the ``records``
    list of every response.

    Parameters
    ----------
    query : str
        Search text, sent as ``queryText`` in the JSON payload and,
        percent-encoded, inside the ``referer`` header.
    num_pages : int
        Number of result pages to request. Values below 1 request nothing.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of all collected records (nested dicts become
        dotted column names). Empty when no record was returned.

    Raises
    ------
    requests.HTTPError
        If a request answers with an error status.
    requests.RequestException
        On connection problems or when a request exceeds the timeout.
    """
    url = "https://ieeexplore.ieee.org/rest/search"
    # Percent-encode the query before embedding it in a header: a raw value
    # with reserved characters corrupts the referer URL, and text outside
    # Latin-1 cannot be encoded as a header value at all.
    encoded_query = quote(query, safe="")
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        "referer": f"https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText={encoded_query}",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "x-security-request": "required"
    }

    all_records = []
    for page in range(1, num_pages + 1):
        payload = {
            "newsearch": True,
            "queryText": query,
            "highlight": True,
            "returnFacets": ["ALL"],
            "returnType": "SEARCH",
            "matchPubs": True,
            "pageNumber": page
        }

        # timeout: without one a stalled connection blocks the notebook forever.
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()  # Raise an exception for unsuccessful requests

        result = response.json()
        # `or []` also covers a response whose "records" value is null.
        all_records.extend(result.get('records') or [])

    # Create DataFrame from all records
    return pd.json_normalize(all_records)

# --- IEEE Xplore run settings (edit these two values) ---
# Search text sent to IEEE Xplore.
query = "Engineering"
# Number of result pages to request.
num_pages = 1

# Subset of the returned columns to show, in display order.
selected_columns = [
    'authors',
    'publicationNumber',
    'publicationDate',
    'doi',
    'ephemera',
    'articleNumber',
    'articleTitle',
    'isNumber',
    'pdfSize',
    'vj',
    'citationCount',
    'htmlLink',
    'downloadCount',
    'abstract',
    'articleContentType',
    'highlightedTitle',
    'publicationTitle',
]

# Run the search, report (rows, columns), then display the chosen columns.
df = scrape_ieee(query=query, num_pages=num_pages)
print(df.shape)
df.loc[:, selected_columns]

(25, 59)


Unnamed: 0,authors,publicationNumber,publicationDate,doi,ephemera,articleNumber,articleTitle,isNumber,pdfSize,vj,citationCount,htmlLink,downloadCount,displayPublicationTitle,abstract,articleContentType,highlightedTitle,publicationTitle
0,"[{'preferredName': 'Adel Alblawi', 'normalized...",8718248,8-11 April 2019,10.1109/EDUCON.2019.8725097,False,8725097,A System Engineering Approach in Orienting Tra...,8725024,673,False,4,/document/8725097/,837,2019 IEEE Global [::Engineering::] Education C...,The present work presents a system [::engineer...,Conferences,A System [::Engineering::] Approach in Orienti...,2019 IEEE Global [::Engineering::] Education C...
1,"[{'preferredName': 'Inês Direito', 'normalized...",8564422,27-29 June 2018,10.1109/CISPEE.2018.8593453,False,8593453,Portuguese academic staff and students in UK’s...,8593363,202,False,0,/document/8593453/,97,2018 3rd International Conference of the Portu...,"Before the EU referendum in June 2016, the pro...",Conferences,Portuguese academic staff and students in UK’s...,2018 3rd International Conference of the Portu...
2,[{'preferredName': 'College of Fellows America...,10,July 2013,10.1109/TBME.2013.2264829,False,6519290,Medical and Biological Engineering in the Next...,6532373,359,False,10,/document/6519290/,6755,IEEE Transactions on Biomedical [::Engineering::],"In 2011, the American Institute for Medical an...",Journals,Medical and Biological [::Engineering::] in th...,IEEE Transactions on Biomedical [::Engineering::]
3,"[{'preferredName': 'Mohammad Nawab', 'normaliz...",9121928,27-30 April 2020,10.1109/EDUCON45650.2020.9125119,False,9125119,Bridging the gaps in engineering curriculum th...,9125085,186,False,0,/document/9125119/,166,2020 IEEE Global [::Engineering::] Education C...,This paper presents a system [::engineering::]...,Conferences,Bridging the gaps in [::engineering::] curricu...,2020 IEEE Global [::Engineering::] Education C...
4,"[{'preferredName': 'Linda Laird', 'normalizedN...",7474421,5-6 April 2016,10.1109/CSEET.2016.13,False,7474475,"Strengthening the ""Engineering"" in Software En...",7474423,138,False,10,/document/7474475/,2385,2016 IEEE 29th International Conference on Sof...,"In the fall of 2015, Stevens Institute of Tech...",Conferences,"Strengthening the ""[::Engineering::]"" in Softw...",2016 IEEE 29th International Conference on Sof...
5,"[{'preferredName': 'Paula O. V. Henry', 'norma...",9657196,15-18 Nov. 2021,10.1109/WEEF/GEDC53299.2021.9657437,False,9657437,Evaluation of Engineering Ethics in the Mechan...,9657159,629,False,0,/document/9657437/,171,2021 World [::Engineering::] Education Forum/G...,"In recent times, concerns have been raised abo...",Conferences,Evaluation of [::Engineering::] Ethics in the ...,2021 World [::Engineering::] Education Forum/G...
6,"[{'preferredName': 'Hamid GholamHosseini', 'no...",6636319,26-29 Aug. 2013,10.1109/TALE.2013.6654424,False,6654424,Biotronic Engineering curriculum design: Integ...,6654377,173,False,0,/document/6654424/,107,Proceedings of 2013 IEEE International Confere...,A specialised major in Biotronic [::Engineerin...,Conferences,Biotronic [::Engineering::] curriculum design:...,Proceedings of 2013 IEEE International Confere...
7,[{'preferredName': 'Fathiyah Mohd Kamaruzaman'...,8454953,13-16 Nov. 2017,10.1109/WEEF.2017.8467167,False,8467167,A review on issues and challenges in incorpora...,8466964,223,False,3,/document/8467167/,150,2017 7th World [::Engineering::] Education For...,"In the 21st Century, there is an increasing de...",Conferences,A review on issues and challenges in incorpora...,2017 7th World [::Engineering::] Education For...
8,"[{'preferredName': 'Sandra Daniela Cirimelo', ...",9429076,14-17 March 2021,10.1109/EDUNINE51952.2021.9429123,False,9429123,Plenary: Map of Generic Competences in Enginee...,9429088,535,False,0,/document/9429123/,37,2021 IEEE World Conference on [::Engineering::...,The Plenary Agreement of the Council of Argent...,Conferences,Plenary: Map of Generic Competences in [::Engi...,2021 IEEE World Conference on [::Engineering::...
9,"[{'preferredName': 'Cassie Wallwey', 'normaliz...",13,April 2024,10.1109/TE.2024.3359534,False,10453596,Engineering Identity and Smartness Identity as...,10495361,1591,False,0,/document/10453596/,129,IEEE Transactions on Education,Contribution: This study examined the role of ...,Journals,[::Engineering::] Identity and Smartness Ident...,IEEE Transactions on Education


In [99]:
# Show every column name currently on `df`, e.g. to decide which ones to keep
# in a column selection.
df.columns

Index(['authors', 'patentCitationCount', 'publicationNumber',
       'publicationDate', 'doi', 'publicationYear', 'documentLink', 'ephemera',
       'articleNumber', 'startPage', 'endPage', 'articleTitle',
       'publicationLink', 'isNumber', 'pdfSize', 'vj', 'citationCount',
       'htmlLink', 'showDataset', 'rightslinkFlag', 'showAlgorithm',
       'downloadCount', 'citationsLink', 'showHtml', 'showVideo', 'publisher',
       'showCheckbox', 'handleProduct', 'redline', 'contentType',
       'displayPublicationTitle', 'abstract', 'articleContentType',
       'isImmersiveArticle', 'pdfLink', 'highlightedTitle', 'publicationTitle',
       'isConference', 'isJournalAndMagazine', 'docIdentifier', 'isBook',
       'isStandard', 'isJournal', 'isBookWithoutChapters', 'course',
       'isEarlyAccess', 'displayContentType', 'isMagazine', 'accessType.type',
       'accessType.message', 'rightsLink', 'issue', 'volume', 'multiMediaLink',
       'graphicalAbstract.summary', 'graphicalAbstract.typ

In [102]:
# Inspect the raw `authors` column; each entry displays as a list of dicts
# (with keys such as 'preferredName').
# NOTE(review): the saved output for this cell shows 500 rows, while the IEEE
# search cell above requests num_pages = 1 and reports 25 rows. The output
# appears to come from a different run; confirm with Restart Kernel -> Run All.
df['authors']

0      [{'preferredName': 'Adel Alblawi', 'normalized...
1      [{'preferredName': 'Inês Direito', 'normalized...
2      [{'preferredName': 'College of Fellows America...
3      [{'preferredName': 'Mohammad Nawab', 'normaliz...
4      [{'preferredName': 'Linda Laird', 'normalizedN...
                             ...                        
495    [{'preferredName': 'Solane Duque Basister', 'n...
496    [{'preferredName': 'Monica E. Cardella', 'norm...
497    [{'preferredName': 'Cornelius Ncube', 'normali...
498    [{'preferredName': 'Aidan O'Dwyer', 'normalize...
499    [{'preferredName': 'Jessica Menold', 'normaliz...
Name: authors, Length: 500, dtype: object