In [25]:
import json
import random
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [26]:
def crawl_ieee_paper_authors(essayCode):
    """Fetch the author names of one IEEE Xplore document.

    Requests the document's "authors" page and extracts the ``authorNames``
    value from the metadata embedded in the returned HTML.

    Args:
        essayCode: IEEE Xplore article number; interpolated into the URL.

    Returns:
        The ``authorNames`` string when the page contains one. Otherwise the
        list of stripped ``<author>`` tag texts found in the page, which is
        empty when there are none or when the response status is not 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    url = f'https://ieeexplore.ieee.org/document/{essayCode}/authors#authors'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)

    # Bound before any branch so the failure path returns a value instead of
    # raising UnboundLocalError at the final return.
    authors = []

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return authors

    soup = BeautifulSoup(response.text, 'html.parser')

    # Fallback value, used only when no authorNames entry is present.
    authors = [author.text.strip() for author in soup.find_all('author')]

    # Stop at the closing quote of the value rather than matching greedily and
    # cutting at the first comma, which truncated names that contain a comma.
    match = re.search(r'authorNames":"([^"]*)"', str(soup))
    if match:
        authors = match.group(1).strip()
    else:
        print("未找到 authorNames")

    return authors


In [27]:
def _parse_reference_text(text):
    """Split one reference string into ``(title, authors)``; ``None`` when it has no quoted title."""
    title_match = re.search(r'"([^"]+)"', text)
    if not title_match:
        return None

    authors_ls = []
    # Everything from the start of the first comma-delimited field up to the
    # first double quote is treated as the author list.
    authors_match = re.search(r'([^,]+),([^"]+)"', text)
    if authors_match:
        authors = authors_match.group(0).strip(' ,').replace('"', "")
        for chunk in authors.split(","):
            chunk = chunk.replace("et al.", "")
            # Split on the standalone word only; a plain split("and") also cut
            # surnames such as "Anderson" or "Sandberg" in two.
            authors_ls.extend(re.split(r'\band\b', chunk))

    # Strip first, then drop blanks, so fragments that are empty or
    # whitespace-only never end up in the author list.
    authors_ls = [name.strip() for name in authors_ls]
    authors_ls = [name for name in authors_ls if name]

    # Remove bracketed annotations from the title.
    cleaned_title = re.sub(r'\[[^\]]+\]', '', title_match.group(1))
    return cleaned_title, authors_ls


def crawl_ieee_reference(essayCode):
    """Fetch the reference list of one IEEE Xplore document.

    Calls the ``/rest/document/<essayCode>/references`` endpoint and parses
    the ``text`` field of every entry under ``references``.

    Args:
        essayCode: IEEE Xplore article number; interpolated into the URL and
            the request headers.

    Returns:
        dict mapping each reference title (first double-quoted span, with
        bracketed parts removed) to a list of author-name strings. References
        without a quoted title are skipped; a repeated title overwrites the
        earlier entry. Empty when the status is not 200 or the payload has no
        ``references``.
    """
    headers = {
        'authority': 'ieeexplore.ieee.org',
        'method': 'GET',
        'path': f'/rest/document/{essayCode}/references',
        'scheme': 'https',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Http-Response': 'true',
        'Referer': f'https://ieeexplore.ieee.org/document/{essayCode}/references',
        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Security-Request': 'required'
    }
    url = f'https://ieeexplore.ieee.org/rest/document/{essayCode}/references'

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    result_dict = {}

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return result_dict

    data = response.json()
    # A payload without "references" yields an empty result instead of a KeyError.
    for entry in data.get("references") or []:
        parsed = _parse_reference_text(entry.get("text") or "")
        if parsed is not None:
            cleaned_title, authors_ls = parsed
            result_dict[cleaned_title] = authors_ls

    return result_dict


In [28]:
def getessaylist(queryText, pageNumber):
    """Fetch one page of IEEE Xplore search results.

    POSTs a JSON search payload to ``/rest/search`` and collects the title and
    article number of every entry under ``records``.

    Args:
        queryText: Search string sent as ``queryText``.
        pageNumber: Result page to request; sent as a string.

    Returns:
        dict mapping ``articleTitle`` to ``articleNumber`` for the page. A
        repeated title overwrites the earlier entry. Empty when the status is
        not 200 or the response has no ``records``.
    """
    url = "https://ieeexplore.ieee.org/rest/search"

    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Content-Type': 'application/json',
        'Origin': 'https://ieeexplore.ieee.org',
        # Built from the actual query; it used to be hard-coded to "LLM"
        # whatever was being searched for.
        'Referer': f'https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText={quote_plus(str(queryText))}',
        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Security-Request': 'required'
    }

    payload = {
        'newsearch': True,
        'queryText': queryText,
        'highlight': True,
        'returnFacets': ['ALL'],
        'returnType': 'SEARCH',
        'pageNumber': str(pageNumber)
    }

    # ensure_ascii=False keeps non-ASCII query text as UTF-8 in the body.
    data = json.dumps(payload, ensure_ascii=False).encode('utf-8')

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.post(url, headers=headers, data=data, timeout=30)

    result_dict = {}
    # Same failure handling as the other crawl_* helpers: report and return empty.
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return result_dict

    response_dict = response.json()
    # A page without "records" yields an empty result instead of a KeyError.
    for record in response_dict.get("records") or []:
        title = record.get("articleTitle")
        number = record.get("articleNumber")
        if title is None or number is None:
            continue
        result_dict[title] = number

    return result_dict


In [29]:
# Accumulates one record per successfully crawled paper; read by later cells.
data_list = []


def main(SearhKeyWord, MaxPage):
    """Crawl search-result pages and append one record per paper to ``data_list``.

    For every paper on every page, waits a random 5-10 seconds, then fetches
    its references and authors. A paper whose crawl raises is reported and
    skipped; the crawl continues with the next paper.

    Args:
        SearhKeyWord: Query string passed to ``getessaylist``.
        MaxPage: Exclusive upper bound on the page number; pages
            ``1 .. MaxPage - 1`` are crawled.

    Side effects:
        Appends dicts with keys ``title``, ``essayCode``, ``reference`` (a JSON
        string) and ``author`` to the module-level ``data_list``. The list is
        not cleared first, so calling this again adds to earlier results.
    """
    # data_list is only mutated in place, never rebound, so no `global` is needed.
    for num in range(1, MaxPage):
        print(f"***********************   Page  {num}   ***********************************")
        essay = getessaylist(SearhKeyWord, num)
        for title, essayCode in essay.items():
            print("-------------------------------------------------------")
            print("essayCode : ", essayCode)
            # Randomised pause between papers.
            time.sleep(random.randint(5, 10))
            try:
                references = crawl_ieee_reference(essayCode)
                authors = crawl_ieee_paper_authors(essayCode)
                print("title : ", title)
                print("reference : ", references)
                print("authors : ", authors)
                data_list.append({
                    'title': title,
                    'essayCode': essayCode,
                    'reference': json.dumps(references),
                    'author': authors
                })
            except Exception as exc:
                # Only ordinary errors are skipped; KeyboardInterrupt and
                # SystemExit propagate so a running crawl can be stopped.
                print("failed : ", repr(exc))
    

In [30]:
# Crawl result pages 1-10 for the query; main() appends one record per paper to data_list.
main("social network", 11)
# One row per collected record, written out with pandas' default CSV settings.
df = pd.DataFrame(data=data_list)
df.to_csv(path_or_buf="social_network_essay_data_V2.csv")

***********************   Page  1   ***********************************
-------------------------------------------------------
essayCode :  6921601
title :  Social networks analysis and perception of social support of young mothers in the process of reconstruction: The social fabric of winter victims in the Colombian Caribbean (South America)
reference :  {'The Political Network in Mexico: Between Conflict and Stability': ['Schmidt', 'S. y Gil', 'J. (1997)']}
authors :  Carolina S. Castro;Camilo A. Madariaga
-------------------------------------------------------
essayCode :  6113161
title :  Connecting People in the Workplace through Ephemeral Social Networks
reference :  {'Live Social Semantics': ['H. Alani', 'M. Szomszor', 'C. Cattuto', 'W. Van den Broeck', 'G. Correndo', 'A. Barrat'], 'Find me if you can: improving geographical prediction with social and spatial proximity': ['L. Backstrom', 'E. Sun', 'C. Marlow'], 'From awareness to repartee: sharing location within social groups'