In [1]:
import time
import warnings

from functools import lru_cache
from pathlib import Path

import numpy as np
import pandas as pd
import undetected_chromedriver as uc

from tqdm import tqdm
from bs4 import BeautifulSoup

from selenium.webdriver.chrome.options import Options

warnings.filterwarnings('ignore')

# Параметры

In [2]:
# Seed article pages; the crawl below starts from each of these URLs.
urls = [
    'https://dl.acm.org/doi/10.1145/2996913.2996996',
    'https://dl.acm.org/doi/10.1145/2487575.2487616',
]

In [3]:
max_depth = 2  # Search depth: reference levels followed below each seed article (level 0 is the seed itself)

# Конфигурация Selenium

In [4]:
# User-Agent string sent by the browser session.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.140 Safari/537.36'

# Chrome launch options passed to the driver.
options = Options()
# Headless mode is currently disabled; uncomment the next line to enable it.
# options.add_argument('--headless=new')
options.add_argument(f'user-agent={user_agent}')

In [5]:
# Module-level browser session; get_article() loads every page through it,
# and the crawl cell calls driver.quit() when it is done.
driver = uc.Chrome(
    options=options,
    use_subprocess=False
)

# Функции

In [6]:
def _text_or_none(node):
    """Return ``node.get_text()``, or ``None`` when the element was not found."""
    return node.get_text() if node is not None else None


def _nested_text(outer, **inner_query):
    """Return the text of the first ``inner_query`` match inside ``outer``.

    Yields ``None`` when ``outer`` is ``None`` or has no matching descendant.
    """
    if outer is None:
        return None
    return _text_or_none(outer.find(**inner_query))


def _reference_links(container):
    """Collect reference URLs from the references container element.

    Only direct child *tags* of ``container`` are inspected (text nodes between
    them are ignored). For each child, the first
    ``div.core-xlink-digital-library`` is located and the ``href`` of its first
    ``<a>`` is prefixed with the dl.acm.org host. Entries without such a div,
    without an anchor, or without an ``href`` are skipped.
    """
    if container is None:
        return []

    links = []
    for entry in container.find_all(True, recursive=False):
        link_box = entry.find(name='div', class_='core-xlink-digital-library')
        if link_box is None:
            continue
        anchor = link_box.find(name='a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            links.append(f"https://dl.acm.org{href}")
    return links


@lru_cache(maxsize=128, typed=False)
def get_article(url: str) -> dict:
    """Load an article page in the shared browser and scrape its metadata.

    The page is opened with the module-level ``driver``, given two seconds to
    render, and parsed with BeautifulSoup. The selectors are the ones this
    notebook uses for dl.acm.org article pages.

    Results are memoized per URL by ``lru_cache``, so repeated calls with the
    same URL return the *same* dict object without reloading the page.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``title``, ``authors``, ``source``,
        ``number and pages``, ``doi``, ``published``, ``citation``, ``metric``,
        ``abstract`` and ``references``. Scalar fields are ``None`` when the
        corresponding element is missing; ``authors`` and ``references`` are
        lists (possibly empty).
    """
    driver.get(url)
    time.sleep(2)  # fixed pause to let the page finish rendering

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Author names live in an <a> inside each author <span>; spans without
    # an anchor are skipped instead of raising AttributeError.
    author_links = (
        span.find(name='a')
        for span in soup.find_all(name='span', property='author')
    )
    authors = [link.get_text() for link in author_links if link is not None]

    result = dict()
    result['title'] = _text_or_none(soup.find(name='h1', property='name'))
    result['authors'] = authors
    result['source'] = _text_or_none(soup.find(name='div', property='isPartOf'))
    result['number and pages'] = _text_or_none(
        soup.find(name='div', **{'data-type': 'acm-number'})
    )
    result['doi'] = _text_or_none(soup.find(name='div', class_='doi'))
    result['published'] = _text_or_none(
        soup.find(name='span', class_='core-date-published')
    )
    # Citation and metric counters are the first <span> nested in their wrapper.
    result['citation'] = _nested_text(
        soup.find(name='span', class_='citation'), name='span'
    )
    result['metric'] = _nested_text(
        soup.find(name='span', class_='metric'), name='span'
    )
    result['abstract'] = _nested_text(
        soup.find(name='section', id='abstract'), name='div', role='paragraph'
    )
    result['references'] = _reference_links(
        soup.find(name='div', role='list', id='collapsible-text')
    )

    return result

# Поиск

In [7]:
# results[seed_url][depth] -> list of article dicts returned by get_article().
# Depth 0 holds the seed article itself; depth d holds every article referenced
# by the articles at depth d - 1 (repeated references are kept as-is).
results = dict()

try:
    for url in tqdm(urls, total=len(urls), colour='green'):

        # A seed URL listed twice is crawled only once.
        if url in results:
            continue
        results[url] = dict()

        for depth in tqdm(range(max_depth + 1), total=max_depth + 1, colour='green'):

            results[url][depth] = list()

            if depth == 0:
                results[url][depth].append(get_article(url))
                continue

            # Gather the outgoing references of the previous level.
            links = list()
            for article in results[url][depth - 1]:
                links.extend(article['references'])

            for link in links:
                results[url][depth].append(get_article(link))
finally:
    # Always close the browser, even when a page load or parse step raises,
    # so no Chrome process is left behind.
    driver.quit()

  0%|[32m                                                                                            [0m| 0/2 [00:00<?, ?it/s][0m
[A%|[32m                                                                                            [0m| 0/3 [00:00<?, ?it/s][0m
[A%|[32m████████████████████████████                                                        [0m| 1/3 [00:04<00:09,  4.84s/it][0m
[A%|[32m████████████████████████████████████████████████████████                            [0m| 2/3 [01:26<00:50, 50.28s/it][0m
100%|[32m███████████████████████████████████████████████████████████████████████████████████[0m| 3/3 [13:55<00:00, 278.65s/it][0m
 50%|[32m█████████████████████████████████████████▌                                         [0m| 1/2 [13:55<13:55, 835.96s/it][0m
[A%|[32m                                                                                            [0m| 0/3 [00:00<?, ?it/s][0m
[A%|[32m████████████████████████████                               

In [8]:
# results

# Обработка результатов парсинга

In [9]:
# Flatten the nested crawl results into one table: one row per scraped article,
# prefixed with the seed URL it was reached from and its depth level.
frames = []

for seed_url, levels in results.items():
    for level, level_articles in levels.items():
        frame = pd.DataFrame(level_articles)
        # Put the provenance columns first.
        frame.insert(0, 'url', seed_url)
        frame.insert(1, 'deapth', level)
        frames.append(frame)

# ignore_index=True renumbers the rows 0..n-1 across all per-level frames.
total = pd.concat(frames, ignore_index=True)

In [10]:
# Display the combined table (long frames are shown truncated).
total

Unnamed: 0,url,deapth,title,authors,source,number and pages,doi,published,citation,metric,abstract,references
0,https://dl.acm.org/doi/10.1145/2996913.2996996,0,Demand driven store site selection via multipl...,"[Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ...",SIGSPACIAL '16: Proceedings of the 24th ACM SI...,"Article No.: 40, Pages 1 - 10",https://doi.org/10.1145/2996913.2996996,31 October 2016,26,617,Choosing a good location when opening a new st...,[https://dl.acm.org/doi/10.1016/S0305-0548(01)...
1,https://dl.acm.org/doi/10.1145/2996913.2996996,1,The generalized maximal covering location problem,"[Oded Berman, Dmitry Krass]",Computers and Operations Research,,https://doi.org/10.1016/S0305-0548(01)00079-X,01 May 2002,34,0,We consider a generalization of the maximal co...,[]
2,https://dl.acm.org/doi/10.1145/2996913.2996996,1,Random Forests,[Leo Breiman],Machine Learning,,https://doi.org/10.1023/A:1010933404324,01 October 2001,9828,0,Random forests are a combination of tree predi...,[https://dl.acm.org/doi/10.1162/neco.1997.9.7....
3,https://dl.acm.org/doi/10.1145/2996913.2996996,1,Efficient algorithms for optimal location quer...,"[Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong...",SIGMOD '14: Proceedings of the 2014 ACM SIGMOD...,,https://doi.org/10.1145/2588555.2612172,18 June 2014,47,790,"In this paper, we study the optimal location q...",[https://dl.acm.org/doi/10.14778/2350229.23502...
4,https://dl.acm.org/doi/10.1145/2996913.2996996,1,Mean Shift: A Robust Approach Toward Feature S...,"[Dorin Comaniciu, Peter Meer]",IEEE Transactions on Pattern Analysis and Mach...,,https://doi.org/10.1109/34.1000236,01 May 2002,2062,0,A general nonparametric technique is proposed ...,"[https://dl.acm.org/doi/10.1007/BF00128233, ht..."
...,...,...,...,...,...,...,...,...,...,...,...,...
254,https://dl.acm.org/doi/10.1145/2487575.2487616,2,Geographical topic discovery and comparison,"[Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng...",WWW '11: Proceedings of the 20th international...,,https://doi.org/10.1145/1963405.1963443,28 March 2011,232,1642,This paper studies the problem of discovering ...,"[https://dl.acm.org/doi/10.5555/944919.944937,..."
255,https://dl.acm.org/doi/10.1145/2487575.2487616,2,Driving with knowledge from the physical world,"[Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun]",KDD '11: Proceedings of the 17th ACM SIGKDD in...,,https://doi.org/10.1145/2020408.2020462,21 August 2011,641,2908,This paper presents a Cloud-based system compu...,[https://dl.acm.org/doi/10.1016/j.eswa.2008.07...
256,https://dl.acm.org/doi/10.1145/2487575.2487616,2,Where to find my next passenger,"[Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,...",UbiComp '11: Proceedings of the 13th internati...,,https://doi.org/10.1145/2030112.2030128,17 September 2011,276,2024,We present a recommender for taxi drivers and ...,"[https://dl.acm.org/doi/10.1145/304182.304187,..."
257,https://dl.acm.org/doi/10.1145/2487575.2487616,2,Urban computing with taxicabs,"[Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie]",UbiComp '11: Proceedings of the 13th internati...,,https://doi.org/10.1145/2030112.2030126,17 September 2011,413,3122,Urban computing for city planning is one of th...,"[https://dl.acm.org/doi/10.5555/645484.656550,..."


In [11]:
# Column dtypes and non-null counts, to see which fields were often missing.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               259 non-null    object
 1   deapth            259 non-null    int64 
 2   title             230 non-null    object
 3   authors           259 non-null    object
 4   source            230 non-null    object
 5   number and pages  9 non-null      object
 6   doi               187 non-null    object
 7   published         230 non-null    object
 8   citation          232 non-null    object
 9   metric            230 non-null    object
 10  abstract          223 non-null    object
 11  references        259 non-null    object
dtypes: int64(1), object(11)
memory usage: 24.4+ KB


In [13]:
# Persist the table as CSV and Excel. The output directory is created first,
# because to_csv/to_excel fail when the target folder does not exist.
output_dir = Path('parsing_results')
output_dir.mkdir(parents=True, exist_ok=True)

total.to_csv(output_dir / 'parsing_articles.csv', index=False)
total.to_excel(output_dir / 'parsing_articles.xlsx', index=False)