In [13]:
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [14]:
country_mapper = pd.read_csv('https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv')
country_mapper['name'] = country_mapper['name'].str.upper()
country_mapper = country_mapper[["name", "alpha-3"]]
country_dict = country_mapper.set_index('name')['alpha-3'].to_dict()

REST API Scraping for IEEE

In [15]:
memoization = dict()

In [46]:
all_records = []

def scrape_ieee(query, num_pages):
    url = "https://ieeexplore.ieee.org/rest/search"
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        "referer": f"https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText={query}",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "x-security-request": "required"
    }

    for page in range(1, num_pages+1):
        data = {
            "newsearch": True,
            "queryText": query,
            "highlight": True,
            "returnFacets": ["ALL"],
            "returnType": "SEARCH",
            "matchPubs": True,
            "pageNumber": page
        }

        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()  # Raise an exception for unsuccessful requests
        
        data = response.json()
        records = data.get('records', [])
        all_records.extend(records)

    # Create DataFrame from all records
    df = pd.json_normalize(all_records)
    return df

# EDIT the search query here
queries = ["Medical", "Engineering", "Biochemical"]

# EDIT pagination here
num_pages = 20
selected_columns = ['authors', 'publicationNumber', 'publicationDate', 'articleNumber',
                    'articleTitle', 'downloadCount',  'abstract', 'articleContentType']
for query in queries :
    df = scrape_ieee(query, num_pages)

selected_df = df[selected_columns]

def get_country(data, authorId):
    global memoization
    if not len(data):
        memoization[authorId] = None
        return None
    currentAffilations = data[0].get('currentAffiliations', [])
    if not len(currentAffilations):
        memoization[authorId] = None
        return None
    currentAffilations = currentAffilations[0]
    country = currentAffilations.split(', ')
    if not len(country):
        memoization[authorId] = None
        return None
    country = country[-1]
    for k in country_dict.keys():
        if country.upper() in k or k in country.upper():
            memoization[authorId] = country_dict[k]
            return country_dict[k]
    if country.upper() in country_dict.values():
        memoization[authorId] = country.upper()
        return country.upper()
    memoization[authorId] = None
    return None

def scrape_each_author(author):
    if('id' not in author) :
        return None
    authorId = author['id']
    global memoization
    if authorId in memoization.keys():
        return memoization[authorId]

    url = f"https://ieeexplore.ieee.org/rest/author/{authorId}"
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "th-TH,th;q=0.9",
        "content-type": "application/json",
        "origin": "https://ieeexplore.ieee.org",
        "priority": "u=1, i",
        "referer": "https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText=Engineering",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        memoization[authorId] = None
        return None
    response.raise_for_status()  # Raise an exception for unsuccessful requests

    data = response.json()
    country = get_country(data, authorId)
    return country
    

selected_df['authors']

# selected_df['authorsName'] = selected_df['authors'].apply(lambda x: [author['preferredName'] for author in x] if type(x) == list else [])
selected_df['authorsAffilationCountry'] = selected_df['authors'].apply(lambda x: [scrape_each_author(author) for author in x] if type(x) == list else [])
# # selected_df = selected_df.drop(columns=['authors'], axis=1)
# print(selected_df.shape)
selected_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['authorsAffilationCountry'] = selected_df['authors'].apply(lambda x: [scrape_each_author(author) for author in x] if type(x) == list else [])


Unnamed: 0,authors,publicationNumber,publicationDate,articleNumber,articleTitle,downloadCount,abstract,articleContentType,authorsAffilationCountry
0,"[{'preferredName': 'Jeong-Woo Sohn', 'normaliz...",7359637,13-16 Oct. 2015,7364597,Medical device industry and the medical cluste...,418,[::Medical::] device industry has its own feat...,Conferences,"[PRK, PRK, PRK]"
1,"[{'preferredName': 'Soonhyun Kwon', 'normalize...",9680743,23-25 Aug. 2021,9680753,Automatic Stroke Medical Ontology Augmentation...,365,The need for [::medical::] ontology to provide...,Conferences,"[None, None, None, None, None]"
2,"[{'preferredName': 'Hongpu Hu', 'normalizedNam...",7794486,19-21 Nov. 2016,7811129,The state-of-the-art of electronic medical doc...,117,[::Medical::] electronic documents management ...,Conferences,"[CHN, CHN, CHN, CHN, CHN]"
3,"[{'preferredName': 'Yue Cao', 'normalizedName'...",10248021,21-23 April 2023,10248105,Application and Supervision of 5G in Medical D...,227,With the popularization of 5G(5th Generation M...,Conferences,"[CHN, CHN, CHN, CHN, CHN, CHN, None]"
4,,8472334,19 Sept. 2018,8472336,ISO/IEEE International Standard for Health inf...,542,Within the context of the ISO/IEEE 11073 famil...,Standards,[]
...,...,...,...,...,...,...,...,...,...
1495,"[{'preferredName': 'Cody Park', 'normalizedNam...",19,2022,9829837,Velocity Profiling of a Gas–Solid Fluidized Be...,261,"In this work, a method of producing velocity p...",Journals,"[USA, USA, USA, USA, USA, USA, USA]"
1496,"[{'preferredName': 'Viet Duc Phung', 'normaliz...",8232889,19-22 Nov. 2017,8244551,Au nanostructures electrodeposited on graphene...,99,Surface enhanced Raman scattering (SERS) is on...,Conferences,"[None, None, None, None]"
1497,"[{'preferredName': 'Nor Alafiza Yunus', 'norma...",5771302,19-21 April 2011,5775572,Design of tailor-made chemical blend using a d...,141,Computer aided techniques form an efficient ap...,Conferences,"[DNK, DNK, MYS, DNK, DNK]"
1498,"[{'preferredName': 'Christian Schoppmeyer', 'n...",6387515,3-5 Oct. 2012,6402419,Timed automata based scheduling for a miniatur...,99,In this contribution we present a conceptual i...,Conferences,"[DEU, DEU, DEU, DEU]"


In [47]:
finished_df = selected_df
finished_df['extracted_class'] = ''

finished_df.loc[:499, 'extracted_class'] = 'MEDI'
finished_df.loc[500:999, 'extracted_class'] = 'ENGI'
finished_df.loc[1000:1499, 'extracted_class'] = 'BIOC'

finished_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finished_df['extracted_class'] = ''


Unnamed: 0,authors,publicationNumber,publicationDate,articleNumber,articleTitle,downloadCount,abstract,articleContentType,authorsAffilationCountry,extracted_class
0,"[{'preferredName': 'Jeong-Woo Sohn', 'normaliz...",7359637,13-16 Oct. 2015,7364597,Medical device industry and the medical cluste...,418,[::Medical::] device industry has its own feat...,Conferences,"[PRK, PRK, PRK]",MEDI
1,"[{'preferredName': 'Soonhyun Kwon', 'normalize...",9680743,23-25 Aug. 2021,9680753,Automatic Stroke Medical Ontology Augmentation...,365,The need for [::medical::] ontology to provide...,Conferences,"[None, None, None, None, None]",MEDI
2,"[{'preferredName': 'Hongpu Hu', 'normalizedNam...",7794486,19-21 Nov. 2016,7811129,The state-of-the-art of electronic medical doc...,117,[::Medical::] electronic documents management ...,Conferences,"[CHN, CHN, CHN, CHN, CHN]",MEDI
3,"[{'preferredName': 'Yue Cao', 'normalizedName'...",10248021,21-23 April 2023,10248105,Application and Supervision of 5G in Medical D...,227,With the popularization of 5G(5th Generation M...,Conferences,"[CHN, CHN, CHN, CHN, CHN, CHN, None]",MEDI
4,,8472334,19 Sept. 2018,8472336,ISO/IEEE International Standard for Health inf...,542,Within the context of the ISO/IEEE 11073 famil...,Standards,[],MEDI
...,...,...,...,...,...,...,...,...,...,...
1495,"[{'preferredName': 'Cody Park', 'normalizedNam...",19,2022,9829837,Velocity Profiling of a Gas–Solid Fluidized Be...,261,"In this work, a method of producing velocity p...",Journals,"[USA, USA, USA, USA, USA, USA, USA]",BIOC
1496,"[{'preferredName': 'Viet Duc Phung', 'normaliz...",8232889,19-22 Nov. 2017,8244551,Au nanostructures electrodeposited on graphene...,99,Surface enhanced Raman scattering (SERS) is on...,Conferences,"[None, None, None, None]",BIOC
1497,"[{'preferredName': 'Nor Alafiza Yunus', 'norma...",5771302,19-21 April 2011,5775572,Design of tailor-made chemical blend using a d...,141,Computer aided techniques form an efficient ap...,Conferences,"[DNK, DNK, MYS, DNK, DNK]",BIOC
1498,"[{'preferredName': 'Christian Schoppmeyer', 'n...",6387515,3-5 Oct. 2012,6402419,Timed automata based scheduling for a miniatur...,99,In this contribution we present a conceptual i...,Conferences,"[DEU, DEU, DEU, DEU]",BIOC


In [48]:
finished_df.to_csv('./scraped_data.csv')