In [1]:
from bs4 import BeautifulSoup

import requests, json, re

import pandas as pd

from concurrent.futures import ThreadPoolExecutor

import time

In [2]:
def get_soup(article_data : dict, timeout : float = 30) -> BeautifulSoup:
    """Download the IEEE Xplore document page of one article and parse it.

    Parameters
    ----------
    article_data : dict
        Record describing the article; only its 'articleNumber' entry is read.
    timeout : float, optional
        Seconds `requests` waits for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup
        The response body parsed with the "lxml" parser.

    Raises
    ------
    ValueError
        If `article_data` has no 'articleNumber' entry.
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    doc_id = article_data.get('articleNumber')
    if doc_id is None:
        raise ValueError("article_data has no 'articleNumber'; cannot build the document URL")

    # str() keeps the concatenation valid when the article number is numeric.
    url = 'https://ieeexplore.ieee.org/document/' + str(doc_id)
    headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0'}

    # requests has no default timeout; without one a stalled connection would
    # block the calling worker indefinitely.
    req = requests.get(url, headers=headers, timeout=timeout)

    return BeautifulSoup(req.text, "lxml")


def get_abstract(soup : BeautifulSoup) -> str:
    """Return the abstract stored in the page's og:description meta tag.

    The first element matching ``meta[property="og:description"]`` is looked
    up and its 'content' attribute is returned. When no such element is found
    the function returns a single space.
    """
    description_tag = soup.select_one('meta[property="og:description"]')

    if description_tag:
        return description_tag['content']

    return " "


def _keyword_group(meta_dict, position : int) -> list:
    """Return the 'kwd' entry of the keyword group at `position`, or [] if it is absent."""
    try:
        return meta_dict['keywords'][position]['kwd']
    except (KeyError, IndexError, TypeError):
        # Missing 'keywords', too few groups, or an unexpected structure.
        return []


def get_keywords(soup : BeautifulSoup, index : int) -> list:
    """Extract the lower-cased keywords embedded in an article page.

    The page's script elements are searched for the
    ``xplGlobal.document.metadata=...};`` assignment, whose value is parsed as
    JSON. Keyword groups are read by position: per the original author's note,
    index 0 = IEEE Keywords, 1 = Index Terms, 2 = Author keywords (not always
    available). Groups 0 and 2 are concatenated, in that order.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed article page.
    index : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list
        Lower-cased keywords, or an empty list when the metadata is missing,
        cannot be parsed, or contains no keyword groups.
    """
    matches = re.findall(r"xplGlobal\.document\.metadata=(.*?)};", str(soup.select("script")))

    if not matches:
        return []

    try:
        # The pattern stops just before the closing brace, so it is restored here.
        meta_dict = json.loads(matches[0] + "}")
    except json.JSONDecodeError:
        # Treat an unparsable metadata blob like a page without metadata.
        return []

    ieee_keywords = _keyword_group(meta_dict, 0)
    author_keywords = _keyword_group(meta_dict, 2)

    return [keyword.lower() for keyword in ieee_keywords + author_keywords]

In [3]:
# Endpoint and HTTP headers used when posting search queries to IEEE Xplore.
url = 'https://ieeexplore.ieee.org/rest/search'

headers = {}
headers["User-Agent"] = 'Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0'
headers["Referer"] = "https://ieeexplore.ieee.org/search/searchresult.jsp?action=search&newsearch=true&matchBoolean=true"

In [7]:
# Search request body. The boolean query keeps records whose publication /
# parent publication number is one of the listed values AND whose abstract
# matches at least one of the listed terms.
payload_mixed = {
    "action": "search",
    "newsearch": True,
    "matchBoolean": True,
    "queryText": (
        # Adjacent literals are joined into one string; each ends where the
        # query has a line break.
        '(("Publication Number":63 \n'
        'OR "Publication Number":8782709 \n'
        'OR "Parent Publication Number":1842944 \n'
        'OR "Parent Publication Number":1000047 \n'
        'OR "Parent Publication Number":1002943 \n'
        'OR "Parent Publication Number":1001483 \n'
        'OR "Parent Publication Number":1000158\n'
        ')\n'
        'AND\n'
        '(\n'
        ' "Abstract":"design automation"\n'
        'OR  "Abstract":"automated"\n'
        'OR "Abstract":"optimization algorithm"\n'
        'OR "Abstract":"optimization technique"\n'
        'OR "Abstract":"design optimization"\n'
        'OR "Abstract":"optimizer"\n'
        'OR "Abstract":"design strategy"\n'
        'OR "Abstract":"design procedure"\n'
        'OR "Abstract":"design method"\n'
        'OR "Abstract":"design methodology"\n'
        'OR "Abstract":"design tool"\n'
        'OR "Abstract":"design flow"\n'
        'OR "Abstract":"design space"\n'
        'OR "Abstract":"script"\n'
        'OR "Abstract":"synthesis"\n'
        'OR "Abstract":"algorithm"\n'
        'OR "Abstract":"computer-aided"\n'
        '))'
    ),
    "highlight": True,
    "returnFacets": ["ALL"],
    "returnType": "SEARCH",
    "matchPubs": True,
    "rowsPerPage": "100",
}


In [9]:
def get_all_info(article : dict, max_retries : int = 5, retry_delay : float = 30.0) -> dict:
    """Build the output record for one search-result article.

    The article's page is fetched with `get_soup`; keywords and abstract are
    extracted from it and combined with fields of the search record.

    If the page title is "Request Rejected" the download is repeated after
    waiting `retry_delay` seconds, at most `max_retries` times. (The earlier
    implementation recursed immediately and without limit, which kept
    hammering the server and could end in a RecursionError.)

    Parameters
    ----------
    article : dict
        Search-result record. Reads 'publicationTitle', 'publicationYear',
        'articleTitle' and 'citationCount'. An empty or None record is allowed.
    max_retries : int, optional
        Number of additional attempts after a rejected request (default 5).
    retry_delay : float, optional
        Seconds to wait before each retry (default 30).

    Returns
    -------
    dict
        Keys 'conference', 'year', 'title', 'citations', 'keywords' and
        'abstract'; an empty dict when `article` is empty.

    Raises
    ------
    RuntimeError
        If every attempt was answered with a "Request Rejected" page.
    KeyError
        If `article` lacks one of the fields listed above.
    """
    if not article:
        return {}

    for attempt in range(max_retries + 1):

        soup = get_soup(article)

        # A page without a <title> element cannot be the rejection page, and
        # must not crash on `.string`.
        page_title = soup.title.string if soup.title is not None else None

        if page_title != "Request Rejected":
            return {
                'conference' : article['publicationTitle'],
                'year'       : article['publicationYear'],
                'title'      : article['articleTitle'],
                'citations'  : article['citationCount'],
                'keywords'   : get_keywords(soup, index = 0),
                'abstract'   : get_abstract(soup),
            }

        if attempt < max_retries:
            print(f"Request rejected for article {article.get('articleNumber')}; "
                  f"retry {attempt + 1}/{max_retries} in {retry_delay} s.")
            # Give the server time to recover before asking again.
            time.sleep(retry_delay)

    raise RuntimeError(
        f"Request for article {article.get('articleNumber')} was still rejected "
        f"after {max_retries + 1} attempts."
    )


def scrape(payload : dict, num_workers : int, timeout : float = 60) -> pd.DataFrame:
    """Run a paginated search and collect details for every article found.

    Pages are requested from the module-level `url` (with `headers`) starting
    at page 1 until a response carries no records. Each record is then passed
    to `get_all_info` on a thread pool.

    Parameters
    ----------
    payload : dict
        Search request body. It is not modified; the page number is set on a copy.
    num_workers : int
        Number of threads used to download the article pages.
    timeout : float, optional
        Seconds to wait for each search request (default 60).

    Returns
    -------
    pd.DataFrame
        One row per article, built from the dicts returned by `get_all_info`.

    Raises
    ------
    Exception
        Whatever `get_all_info` raised in a worker is re-raised when the
        results are collected.
    """
    # Copy so that the caller's dict does not silently gain a 'pageNumber' key.
    page_payload = dict(payload, pageNumber=1)

    all_pages = []

    while True:

        req = requests.post(url, headers=headers, json=page_payload, timeout=timeout)

        records = req.json().get('records')

        # Stop on a missing or empty record list. Testing for "exactly one
        # record" would drop a real last page holding a single article and
        # never terminate on an empty list.
        if not records:
            break

        all_pages.extend(records)
        page_payload['pageNumber'] += 1

    print(f"Number of articles: {len(all_pages)}.")

    # Leaving the `with` block waits for all submitted tasks to finish.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(get_all_info, article) for article in all_pages]

    return pd.DataFrame([future.result() for future in futures])

In [10]:
# result_df       = pd.DataFrame()

# for j, payload in enumerate(payloads):

#     print(f"Payload No. {j}")

#     df_venue    = scrape(payload, num_workers=100)

#     filename    = f"df_{j}.pkl"

#     df_venue.to_pickle(filename)

#     #result_df   = pd.concat([result_df, df_venue])

IEEE Xplore may stop responding, in which case parsing its response raises a JSON decode error. Run one payload at a time and allow a cooldown period between runs.

In [11]:
# Do not use more than 2 workers, otherwise IEEE Xplore becomes unresponsive.
# The call previously passed 3, contradicting this limit.
df_venue    = scrape(payload_mixed, num_workers=2)

filename    = "df_mixed.pkl"

# Persist the scraped table so later cells can reload it without scraping again.
df_venue.to_pickle(filename)

Number of articles: 6532.


In [14]:
# Reload the table pickled by the scraping cell.
a = pd.read_pickle("df_mixed.pkl")

In [12]:
# url             = 'https://ieeexplore.ieee.org/document/' + '10509331'

# headers         = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0'}

# req             = requests.get(url, headers=headers)

# soup            = BeautifulSoup(req.text, "lxml")

# s               = re.findall(r"xplGlobal\.document\.metadata=(.*?)};", str(soup.select("script")))

# if s:

#     s           = s[0]

#     meta_dict   = json.loads(s +  "}")

#     try:
#         ieee_keywords       = meta_dict['keywords'][0]['kwd']  # index 0 = IEEE Keywords, 1 = Index Terms, 2 = Author keywords (not always available)
#     except:
#         ieee_keywords       = []


    
#     try:
#         author_keywords     = meta_dict['keywords'][2]['kwd']# index 0 = IEEE Keywords, 1 = Index Terms, 2 = Author keywords (not always available)
#     except:
#         author_keywords     = []

# print(author_keywords, ieee_keywords)