In [1]:
import json
import requests
from bs4 import BeautifulSoup

In [2]:
# Build the URL of every CHI2020 proceedings page. The proceedings listing is a
# dynamic page: each URL covers the 30 papers starting at the given paper id.
base_url_body = "https://dl.acm.org/doi/proceedings/10.1145/3313831?id="
# Start ids 1, 31, ..., 751; id=751 was found by trial to be the last page that still lists papers.
base_url_id_list = list(range(1, 752, 30))
n = len(base_url_id_list)

# One complete page URL per start id, in ascending id order.
base_url_list = [base_url_body + str(page_id) for page_id in base_url_id_list]

In [3]:
base_url_list  # Display the proceedings page URL list for inspection

['https://dl.acm.org/doi/proceedings/10.1145/3313831?id=1',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=31',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=61',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=91',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=121',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=151',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=181',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=211',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=241',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=271',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=301',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=331',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=361',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=391',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=421',
 'https://dl.acm.org/doi/proceedings/10.1145/3313831?id=451',
 'https://dl.

In [4]:
# Collect the URL of every individual paper. Every proceedings page above,
# except the last one, lists 30 paper URLs.
url_list = []  # all CHI2020 paper URLs, in page order

for base_url in base_url_list:
    # A timeout keeps one stalled connection from hanging the whole crawl.
    req = requests.get(base_url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page, which
    # would silently contribute zero URLs to the list.
    req.raise_for_status()

    soup = BeautifulSoup(req.text, "html.parser")  # parse the page content

    # Each paper entry is a div.issue-item__detail; its first <a> carries the paper URL.
    for url_cell in soup.find_all('div', {'class': 'issue-item__detail'}):
        link = url_cell.a
        article_url = link.get('href') if link is not None else None
        if article_url:  # skip entries that carry no link
            url_list.append(article_url)

In [5]:
url_list  # Display the paper URL list for inspection

['https://doi.org/10.1145/3313831.3376128',
 'https://doi.org/10.1145/3313831.3376129',
 'https://doi.org/10.1145/3313831.3376130',
 'https://doi.org/10.1145/3313831.3376131',
 'https://doi.org/10.1145/3313831.3376132',
 'https://doi.org/10.1145/3313831.3376133',
 'https://doi.org/10.1145/3313831.3376134',
 'https://doi.org/10.1145/3313831.3376135',
 'https://doi.org/10.1145/3313831.3376136',
 'https://doi.org/10.1145/3313831.3376137',
 'https://doi.org/10.1145/3313831.3376138',
 'https://doi.org/10.1145/3313831.3376139',
 'https://doi.org/10.1145/3313831.3376140',
 'https://doi.org/10.1145/3313831.3376141',
 'https://doi.org/10.1145/3313831.3376142',
 'https://doi.org/10.1145/3313831.3376143',
 'https://doi.org/10.1145/3313831.3376144',
 'https://doi.org/10.1145/3313831.3376145',
 'https://doi.org/10.1145/3313831.3376146',
 'https://doi.org/10.1145/3313831.3376147',
 'https://doi.org/10.1145/3313831.3376148',
 'https://doi.org/10.1145/3313831.3376149',
 'https://doi.org/10.1145/331383

In [6]:
print(len(url_list))  # Total number of paper URLs collected

758


In [7]:
# A single paper URL, fetched once and used to try out the functions below.
test_url = "https://doi.org/10.1145/3313831.3376128"
test_req = requests.get(test_url, timeout=30)  # bounded wait instead of hanging indefinitely
test_req.raise_for_status()  # stop on an HTTP error rather than parsing an error page
test_soup = BeautifulSoup(test_req.text, "html.parser")

In [8]:
# Extract the author list of a single paper.
def get_authors_list(soup):
    """Return the authors found in a parsed paper page.

    Args:
        soup: Parsed paper page (a BeautifulSoup object, or any node exposing
            ``find``).

    Returns:
        A list of ``{'id': ..., 'name': ...}`` dicts in page order. ``name`` is
        the ``title`` attribute of the entry's first link. ``id`` is the
        profile link's ``href`` with its first 9 characters removed, or
        ``None`` when the entry has no profile link. The list is empty when
        the page has no ``div#sb-1`` author section.
    """
    # Number of href characters that precede the author id
    # (presumably the '/profile/' prefix -- confirm against the live markup).
    profile_prefix_len = 9

    authors_part = soup.find('div', {'id': 'sb-1'})
    if authors_part is None:
        return []

    authors_list = []
    for author in authors_part.find_all('li', {'class': 'loa__item'}):
        name_link = author.a
        if name_link is None:
            continue  # no name link: nothing usable in this entry

        # The profile link lives inside the author-info body; it may be absent.
        info_body = author.find('div', {'class': "author-info__body"})
        profile_link = info_body.a if info_body is not None else None
        if profile_link is not None:
            author_id = profile_link['href'][profile_prefix_len:]
        else:
            author_id = None

        authors_list.append({'id': author_id, 'name': name_link['title']})

    return authors_list

In [9]:
# Try out get_authors_list on the test page
authors_list = get_authors_list(test_soup)
authors_list

[{'id': '99658744130', 'name': 'Josh Andres'},
 {'id': '81311483103', 'name': 'm.c. schraefel'},
 {'id': '99659365958', 'name': 'Nathan Semertzidis'},
 {'id': '99659529317', 'name': 'Brahmi Dwivedi'},
 {'id': '99659527470', 'name': 'Yutika C. Kulwe'},
 {'id': '81100643947', 'name': 'Juerg von Kaenel'},
 {'id': '81100260236', 'name': 'Florian Floyd Mueller'}]

In [29]:
# Extract the article metadata of a single paper; calls get_authors_list internally.
def get_article_info(soup, year):
    """Return the metadata of the paper shown on a parsed paper page.

    Args:
        soup: Parsed paper page (BeautifulSoup object).
        year: Value stored unchanged under the ``'year'`` key.

    Returns:
        A dict with the keys ``'year'``, ``'id'``, ``'title'``, ``'authors'``,
        ``'abstract'`` and ``'doi'``. ``'authors'`` is the result of
        ``get_authors_list(soup)``. ``'abstract'`` is an empty string when the
        page has no abstract section.

    Raises:
        AttributeError: if the page has no DOI link or no title element.
    """
    article_year = year

    doi_link_text = soup.find('a', {'class': 'issue-item__doi'}).text.strip()
    # Drop the 16-character URL prefix to keep the bare DOI
    # (presumably 'https://doi.org/' -- confirm against the live markup).
    article_doi = doi_link_text[16:]
    # The id is whatever follows the last '.' in the DOI's final path segment.
    # Splitting (rather than slicing a fixed width) keeps it correct for DOIs
    # whose prefix is not exactly 16 characters long.
    article_id = article_doi.rsplit('/', 1)[-1].rsplit('.', 1)[-1]

    article_title = soup.find('h1', {'class': 'citation__title'}).text.strip()
    article_authors = get_authors_list(soup)

    # A page without an abstract yields '' instead of aborting the crawl.
    abstract_part = soup.find('div', {'class': 'abstractSection abstractInFull'})
    article_abstract = abstract_part.text.strip() if abstract_part is not None else ''

    article_info = {'year': article_year,
                    'id': article_id,
                    'title': article_title,
                    'authors': article_authors,
                    'abstract': article_abstract,
                    'doi': article_doi}

    return article_info

In [11]:
# Try out get_article_info on the test page and write the result to article_info.json.
article_info_json = get_article_info(test_soup, 2020)

# The with-block closes the file even if serialization raises.
with open('article_info.json', 'w', encoding='utf-8') as file:
    json.dump(article_info_json, file, ensure_ascii=False)

In [26]:
# Extract the reference list of a single paper.
def get_references_info(soup, year):
    """Return the references listed on a parsed paper page.

    Args:
        soup: Parsed paper page (a BeautifulSoup object, or any node exposing
            ``find_all``).
        year: Unused; kept so existing callers keep working.

    Returns:
        A list with one dict per ``span.references__note`` element, in page
        order. Every dict has ``'ref'`` (1-based position) and ``'citation'``
        (stripped text). References that carry a "Digital Library" icon with a
        link additionally have ``'id'`` and ``'doi'``.
    """
    references_info = []

    notes = soup.find_all('span', {'class': 'references__note'})
    for reference_index, reference in enumerate(notes, start=1):
        entry = {'ref': reference_index,
                 'citation': reference.text.strip()}

        # The link sits next to the "Digital Library" icon, inside the icon's
        # enclosing <span>; any of these pieces may be missing.
        digital_library_img = reference.find('img', {'data-title': 'Digital Library'})
        link_holder = (digital_library_img.find_parent('span')
                       if digital_library_img is not None else None)
        link = link_holder.a if link_holder is not None else None

        if link is not None:
            # Drop the 23-character URL prefix to keep the bare DOI
            # (presumably 'https://dl.acm.org/doi/' -- confirm against the live markup).
            reference_doi = link['href'][23:]
            # The id is whatever follows the last '.' in the DOI's final path
            # segment. A fixed-width slice would cut digits off DOIs such as
            # 10.1145/985692.985715, whose prefix is shorter than 16 characters.
            entry['id'] = reference_doi.rsplit('/', 1)[-1].rsplit('.', 1)[-1]
            entry['doi'] = reference_doi

        references_info.append(entry)

    return references_info

In [27]:
# Try out get_references_info on the test page
references_info = get_references_info(test_soup, 2020)
references_info

[{'ref': 1,
  'citation': 'Dzmitry Aliakseyeu, Bernt Meerbeek, Jon Mason, Remco Magielse and Susanne Seitinger. Peripheral Interaction with Light. In Peripheral Interaction, Springer, 2016, 207--235. http://dx.doi.org/10.1145/10.1007/978--3--319--29523--7_10Google ScholarCross Ref'},
 {'ref': 2,
  'citation': 'Andres, et al. 2020. Future Inbodied: A Framework for Inbodied Interaction Design. In Proceedings of TEI Conference on Tangible, Embedded, and Embodied Interaction. http://dx.doi.org/10.1145/10.1145/3374920.3374969Google Scholar'},
 {'ref': 3,
  'citation': 'Josh Andres, Julian de Hoog and Florian \'Floyd\' Mueller. 2018. "I Had Super-Powers When eBike Riding" Towards Understanding the Design of Integrated Exertion. Proceedings of the 2018 Annual Symposium on Computer-Human Interaction in Play. http://dx.doi.org/10.1145/10.1145/3242671.3242688Google ScholarDigital Library',
  'id': '3242688',
  'doi': '10.1145/3242671.3242688'},
 {'ref': 4,
  'citation': 'Josh Andres, Tuomas Kari

In [14]:
# Collect the article metadata of every paper; calls get_article_info internally.
def get_conference_json(url_list, year, timeout=30):
    """Download every paper page and return all article metadata as JSON text.

    Args:
        url_list: Iterable of paper page URLs.
        year: Passed through to ``get_article_info`` for every paper.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A JSON string (4-space indent) encoding a list with one article dict
        per URL, in input order.

    Raises:
        requests.HTTPError: if a page answers with an HTTP error status.
    """
    articles = []

    for url in url_list:
        # Bounded wait, so one stalled connection cannot hang the whole crawl.
        req = requests.get(url, timeout=timeout)
        # Report the failing request directly instead of parsing an error page.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        articles.append(get_article_info(soup, year))

    return json.dumps(articles, indent=4)

In [15]:
# Collect the reference lists of every paper; calls get_references_info internally.
def get_references_json(url_list, year, timeout=30):
    """Download every paper page and return all reference lists as JSON text.

    Args:
        url_list: Iterable of paper page URLs.
        year: Passed through to ``get_references_info`` for every paper.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A JSON string (4-space indent) encoding a dict that maps each paper's
        id to the list returned by ``get_references_info`` for that paper.

    Raises:
        requests.HTTPError: if a page answers with an HTTP error status.
        AttributeError: if a page has no DOI link.
    """
    references_by_article = {}

    for url in url_list:
        # Bounded wait, so one stalled connection cannot hang the whole crawl.
        req = requests.get(url, timeout=timeout)
        # Report the failing request directly instead of parsing an error page.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")

        doi_link_text = soup.find('a', {'class': 'issue-item__doi'}).text.strip()
        # The id is whatever follows the last '.' in the DOI's final path
        # segment. A fixed 7-character tail is only right for 7-digit suffixes.
        article_id = doi_link_text.rsplit('/', 1)[-1].rsplit('.', 1)[-1]
        references_by_article[article_id] = get_references_info(soup, year)

    return json.dumps(references_by_article, indent=4)

In [30]:
# Collect the article metadata of every CHI2020 paper and write it to conference-2020.json.
conference_json = get_conference_json(url_list, 2020)
# The with-block closes the file even if the write raises.
with open('conference-2020.json', 'w', encoding='utf-8') as file:
    file.write(conference_json)

In [28]:
# Collect the reference lists of every CHI2020 paper and write them to references-2020.json.
references_json = get_references_json(url_list, 2020)
# The with-block closes the file even if the write raises.
with open('references-2020.json', 'w', encoding='utf-8') as file:
    file.write(references_json)