In [5]:
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [6]:
country_mapper = pd.read_csv('https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv')
country_mapper['name'] = country_mapper['name'].str.upper()
country_mapper = country_mapper[["name", "alpha-3"]]
country_dict = country_mapper.set_index('name')['alpha-3'].to_dict()

REST API Scraping for IEEE

In [7]:
memoization = dict()

In [9]:
def scrape_ieee(query, num_pages):
    url = "https://ieeexplore.ieee.org/rest/search"
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        "referer": f"https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText={query}",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "x-security-request": "required"
    }

    all_records = []
    for page in range(1, num_pages+1):
        data = {
            "newsearch": True,
            "queryText": query,
            "highlight": True,
            "returnFacets": ["ALL"],
            "returnType": "SEARCH",
            "matchPubs": True,
            "pageNumber": page
        }

        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()  # Raise an exception for unsuccessful requests
        
        data = response.json()
        records = data.get('records', [])
        all_records.extend(records)

    # Create DataFrame from all records
    df = pd.json_normalize(all_records)
    return df

# EDIT the search query here
query = "Engineering"

# EDIT pagination here
num_pages = 1

selected_columns = ['authors', 'publicationNumber', 'publicationDate', 'articleNumber',
                    'articleTitle', 'downloadCount',  'abstract', 'articleContentType']
df = scrape_ieee(query, num_pages)
selected_df = df[selected_columns]

def scrape_each_author(authorId):

    global memoization
    if authorId in memoization.keys():
        return memoization[authorId]

    url = f"https://ieeexplore.ieee.org/rest/author/{authorId}"
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        "referer": "https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText=Engineering",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        memoization[authorId] = None
        return None
    response.raise_for_status()  # Raise an exception for unsuccessful requests

    data = response.json()
    if not len(data):
        memoization[authorId] = None
        return None
    currentAffilations = data[0].get('currentAffiliations', [])
    if not len(currentAffilations):
        memoization[authorId] = None
        return None
    currentAffilations = currentAffilations[0]
    country = currentAffilations.split(', ')
    if not len(country):
        memoization[authorId] = None
        return None
    country = country[-1]
    for k in country_dict.keys():
        if country.upper() in k or k in country.upper():
            memoization[authorId] = country_dict[k]
            return country_dict[k]
    if country.upper() in country_dict.values():
        memoization[authorId] = country.upper()
        return country.upper()
    memoization[authorId] = None
    return None

selected_df['authorsName'] = selected_df['authors'].apply(lambda x: [author['preferredName'] for author in x] if type(x) == list else [])
selected_df['authorsAffilationCountry'] = selected_df['authors'].apply(lambda x: [scrape_each_author(author['id']) for author in x])
selected_df = selected_df.drop(columns=['authors'], axis=1)
print(selected_df.shape)
selected_df

(25, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['authorsName'] = selected_df['authors'].apply(lambda x: [author['preferredName'] for author in x] if type(x) == list else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['authorsAffilationCountry'] = selected_df['authors'].apply(lambda x: [scrape_each_author(author['id']) for author in x])


Unnamed: 0,publicationNumber,publicationDate,articleNumber,articleTitle,downloadCount,abstract,articleContentType,authorsName,authorsAffilationCountry
0,8718248,8-11 April 2019,8725097,A System Engineering Approach in Orienting Tra...,837,The present work presents a system [::engineer...,Conferences,"[Adel Alblawi, Mohammad Nawab, Abdulaziz Alsay...","[SAU, SAU, SAU]"
1,8564422,27-29 June 2018,8593453,Portuguese academic staff and students in UK’s...,97,"Before the EU referendum in June 2016, the pro...",Conferences,"[Inês Direito, Stella Fowler]","[PRT, GBR]"
2,10,July 2013,6519290,Medical and Biological Engineering in the Next...,6755,"In 2011, the American Institute for Medical an...",Journals,[College of Fellows American Institute for Med...,[USA]
3,9121928,27-30 April 2020,9125119,Bridging the gaps in engineering curriculum th...,166,This paper presents a system [::engineering::]...,Conferences,"[Mohammad Nawab, Adel Alblawi, Abdulaziz Alsay...","[SAU, SAU, SAU, SAU]"
4,7474421,5-6 April 2016,7474475,"Strengthening the ""Engineering"" in Software En...",2385,"In the fall of 2015, Stevens Institute of Tech...",Conferences,[Linda Laird],[USA]
5,9657196,15-18 Nov. 2021,9657437,Evaluation of Engineering Ethics in the Mechan...,171,"In recent times, concerns have been raised abo...",Conferences,"[Paula O. V. Henry, Earle A. Wilson, Trevor G....","[JAM, JAM, JAM]"
6,6636319,26-29 Aug. 2013,6654424,Biotronic Engineering curriculum design: Integ...,107,A specialised major in Biotronic [::Engineerin...,Conferences,"[Hamid GholamHosseini, Krishnamachar Prasad]","[NZL, NZL]"
7,8454953,13-16 Nov. 2017,8467167,A review on issues and challenges in incorpora...,150,"In the 21st Century, there is an increasing de...",Conferences,"[Fathiyah Mohd Kamaruzaman, Roszilah Hamid, Az...","[MYS, MYS, MYS]"
8,9429076,14-17 March 2021,9429123,Plenary: Map of Generic Competences in Enginee...,37,The Plenary Agreement of the Council of Argent...,Conferences,"[Sandra Daniela Cirimelo, Mónica Pascual, Robe...","[ARG, ARG, ARG, ARG]"
9,13,April 2024,10453596,Engineering Identity and Smartness Identity as...,129,Contribution: This study examined the role of ...,Journals,"[Cassie Wallwey, Emily Dringenberg, Bailey Bra...","[USA, USA, USA, USA, USA]"
