In [1]:
# !pip install selenium

In [2]:
# !apt-get install chromium-chromedriver

In [3]:
# !pip install bs4

In [None]:
# !pip install fake-useragent

In [4]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import time
import random
import re
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

In [7]:
def get_index(journal_name):
    '''
    Locates the highlighted entry in a list of html elements.

    An element counts as highlighted when its string form contains
    the font colour attribute #F26C4F. When several elements match,
    the position of the last one is returned; when none match,
    the result is None.
    '''
    highlight_marker = "font color=\"#F26C4F\""
    matching_positions = [
        position
        for position, element in enumerate(journal_name)
        if highlight_marker in str(element)
    ]
    return matching_positions[-1] if matching_positions else None

In [8]:
def get_data_from_publication_activity_page(link, driver_1):
    '''
    Retrieves the id, the journal name and the number of publications
    per year from a journal's publication activity page.

    Parameters:
        link: URL of the publication activity page; the text after the
            last '=' is parsed as the integer journal id.
        driver_1: selenium webdriver instance used to load the page.

    Returns:
        A list of the form [id, journal name, count, count, ...] with one
        integer count per cell of the "Число статей в РИНЦ" table row
        (the first two cells of that row are skipped). The caller is
        expected to map the counts to the years 2011-2020.

    Raises:
        ValueError: if the id in the link or a publication count is not an
            integer, or if the journal name / the publications row cannot
            be found on the loaded page (e.g. the site answered with a
            different page than expected).
    '''
    driver_1.get(link)
    soup1 = BeautifulSoup(driver_1.page_source, 'html.parser')

    # Extracting id from the link
    data_from_publication_activity_page = []
    id_ = int(link.split('=')[-1])
    data_from_publication_activity_page.append(id_)

    # Getting the journal name
    journal_name = soup1.find_all('a', href=re.compile('^title_about'))
    index_of_required_tag = get_index(journal_name) # chooses correct html element from journal_name
    if index_of_required_tag is None:
        # Without this guard the failure would be an opaque
        # "list indices must be integers" TypeError.
        raise ValueError(f"Highlighted journal name was not found on page {link}")
    data_from_publication_activity_page.append(journal_name[index_of_required_tag].text)

    # Getting the number of publications per year
    # (`string=` is the current name of the deprecated `text=` filter)
    font_tag = soup1.find("font", string="Число статей в РИНЦ")
    if font_tag is None:
        raise ValueError(f"Row with the number of publications was not found on page {link}")
    td_tag = font_tag.parent # parent of font tag above
    first_table_row = td_tag.parent
    row_soup = BeautifulSoup(str(first_table_row), 'html.parser') # put in html string of first row for parsing
    cells_of_first_row = row_soup.find_all('td')[2:] # [2:] excludes first two cells
    for cell in cells_of_first_row:
        # appends number of publications per year
        data_from_publication_activity_page.append(int(cell.text))

    return data_from_publication_activity_page


### Заполнение формы для поиска

In [9]:
# Address of the journal search form; it is the first page opened by the browser below.
url = 'https://elibrary.ru/titles.asp' # url address of the website

In [10]:
# changing user agent to avoid captcha
options = Options()
ua = UserAgent()
userAgent = ua.random
options.add_argument(f'user-agent={userAgent}')
options.add_argument("--headless") # comment out to see the browser window
driver = webdriver.Chrome(options=options, executable_path=r"/usr/lib/chromium-browser/chromedriver")
driver.get(url)

In [11]:
# Pick the "RUS" entry in the country filter of the search form.
country_select_element = driver.find_element_by_name("countryid")
country_dropdown = Select(country_select_element)
country_dropdown.select_by_value("RUS")

In [12]:
# Pick the "RU" entry in the language filter of the search form.
language_select_element = driver.find_element_by_name("language")
language_dropdown = Select(language_select_element)
language_dropdown.select_by_value("RU")

In [13]:
# Pick rubric code "200000" in the subject filter
# (presumably the informatics rubric, judging by the output file names - TODO confirm).
rubric_select_element = driver.find_element_by_name("rubriccode")
rubric_dropdown = Select(rubric_select_element)
rubric_dropdown.select_by_value("200000")


In [14]:
# Submit the form: the search button is a <div> whose onclick handler is title_search().
driver.find_element_by_css_selector("div[onclick='title_search()']").click() # click on search button

In [15]:
# number of journals found
html = driver.page_source
soup_n = BeautifulSoup(html, 'html.parser')
n_journals = int(soup_n.find("td", class_ = "redref").b.text)

### Цикл по всем страницам результата поиска для сбора данных

In [16]:
# creating empty dataframe to store results
years = ['n_papers_risc_' + str(year) for year in range(2011, 2021)]
result_table = pd.DataFrame(columns = ['id', 'name'] + years)
result_table.head()

Unnamed: 0,id,name,n_papers_risc_2011,n_papers_risc_2012,n_papers_risc_2013,n_papers_risc_2014,n_papers_risc_2015,n_papers_risc_2016,n_papers_risc_2017,n_papers_risc_2018,n_papers_risc_2019,n_papers_risc_2020


In [17]:
# Number of result pages to visit. There are 100 journals on a page, so the
# journal count is divided by 100 and rounded up (ceiling division). The
# remainder (n_journals % 100) is not a page count: e.g. 300 journals would
# give a single page. At least one page is always visited.
num_of_pages = max(1, (n_journals + 99) // 100)
rate = [i/10 for i in range(1,11)] # list for randomizing our request rate (0.1 .. 1.0 seconds)

In [18]:
# Walk through the pages of the search result and scrape the publication
# activity page of every journal listed on them.
# Rows are buffered in a plain list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole table on every call and no longer exists
# in pandas >= 2.0.
scraped_rows = []
try:
    for page_number in range(num_of_pages):
        # changing user agent to avoid captcha; a fresh Options object is built
        # for every page so that user-agent / --headless arguments do not pile up
        # on the shared `options` object from one iteration to the next
        userAgent = ua.random
        page_options = Options()
        page_options.add_argument(f'user-agent={userAgent}')
        page_options.add_argument("--headless") # comment out to see the browser window
        driver_1 = webdriver.Chrome(options=page_options, executable_path=r"/usr/lib/chromium-browser/chromedriver")

        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # in html anchor/link is represented by the tag <a>
            links = [
                'https://elibrary.ru/' + anchor.get('href')
                for anchor in soup.find_all('a', href=True, title="Анализ публикационной активности журнала")
            ]
            for link in links:
                scraped_rows.append(get_data_from_publication_activity_page(link, driver_1))
                time.sleep(random.choice(rate)) # Avoiding web scraping detection by randomizing our request rate to closely mimic human interaction
        finally:
            # close the helper browser even when scraping a journal page fails
            driver_1.quit()

        try:
            driver.find_element_by_css_selector("a[title='Следующая страница']").click() # click on next page button
        except NoSuchElementException:
            # no "next page" link: the last page has been processed
            break
finally:
    # store whatever was collected, also when the loop stopped with an error
    if scraped_rows:
        new_rows = pd.DataFrame(scraped_rows, columns=['id', 'name'] + years)
        if result_table.empty:
            result_table = new_rows
        else:
            result_table = pd.concat([result_table, new_rows], ignore_index=True)
    
    
    
    

In [25]:
# Close the browser session that was used for the search result pages.
driver.quit() # cleaning up resources

In [26]:
# Save the collected table as an Excel workbook, without the row index.
result_table.to_excel("Informatics_Journals_RISC.xlsx", index = False) 

In [27]:
# Save the same table as a UTF-8 encoded CSV file, without the row index.
result_table.to_csv('Informatics_Journals_RISC.csv', index=False, encoding='utf-8')