<a href="https://colab.research.google.com/github/R-Mosolov/sociology-scientometric-analysis/blob/main/parse_articles_links.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import csv
import pandas as pd
import random
import time

In [2]:
'''
Fetch the first listing page and find the tags that contain article links
'''
# The context manager closes the connection even when reading or decoding
# fails, so the socket is never leaked
with urllib.request.urlopen("https://cyberleninka.ru/article/c/sociology") as fp:
  mybytes = fp.read()
  html_doc = mybytes.decode("utf8")

# Get HTML structure to parse: every <li> of the first <ul> on the page
soup = BeautifulSoup(html_doc, 'html.parser')
articles_html = soup.ul.find_all('li')

# Display the found tags as the cell output
articles_html

[<li>
 <a href="/article/n/metodologicheskie-problemy-sravnitelnogo-analiza-paradigm-tehnicheskoy-realnosti"><div class="title">Методологические проблемы сравнительного анализа парадигм технической реальности</div>
 <p>Представлено исследование актуальной философской проблемы разработки методологических принципов анализа современной технической реальности. Рассматриваются основные подходы к ее анализу с позиций философских, естественнонаучных и социально-гуманитарных парадигм. Отмечается качественное изменение и...</p>
 <span>2008 / Елькина Елена Евграфовна</span>
 <div class="labels">
 <div class="label vak">ВАК</div>
 <div class="label-cc"><img class="black" src="/images/tsvg/cc-label.svg"/></div></div>
 </a>
 </li>, <li>
 <a href="/article/n/politika-upravleniya-chelovecheskimi-resursami"><div class="title">Политика управления человеческими ресурсами</div>
 <p></p>
 <span>2001 / Быченко Юрий Григорьевич</span>
 <div class="labels">
 <div class="label vak">ВАК</div>
 <div class="labe

In [3]:
'''
Print the relative link of every article found on the listing page
'''

for article in articles_html:
  # The first <a> inside the list item carries the article path in its href
  print(article.find('a')['href'])

/article/n/metodologicheskie-problemy-sravnitelnogo-analiza-paradigm-tehnicheskoy-realnosti
/article/n/politika-upravleniya-chelovecheskimi-resursami
/article/n/uroven-i-kachestvo-zhizni-naseleniya
/article/n/declaration-of-the-convention-of-independent-sociological-centers-of-russia
/article/n/semeynyy-diagnoz-v-opekaemyh-semyah
/article/n/kultura-vlasti-v-kontekste-geo-hronopolitiki
/article/n/novye-knigi-professorov-i-prepodavateley-moskovskogo-gumanitarnogo-universitet-o-molodezhi
/article/n/vlast-i-nauka-k-voprosu-o-razvitii-kulturno-obrazovatelnogo-potentsiala
/article/n/sravnenie-obektivistskogo-i-subektivistskogo-podhodov-k-izmereniyu-sinteticheskih-latentnyh-kategoriy-kachestva-zhizni-naseleniya
/article/n/sotsialnaya-zaschita-grazhdan-uvolennyh-s-voennoy-sluzhby-kak-sotsialnyy-institut
/article/n/kontseptsiya-sotsiostrukturnoy-funktsii-obrazovaniya-v-kontekste-ego-sovremennoy-modernizatsii
/article/n/samootsenka-v-sotsiologicheskih-issledovaniyah
/article/n/motivatsionnaya-so

In [None]:
'''
The combined algorithm to parse authors and publication dates
with rotation of user sessions
'''
# Create data structure
class Article:
  '''
  Plain record holding the fields collected for one article.
  Every constructor argument is stored unchanged under the attribute
  of the same name.
  '''
  def __init__(self, date, author, title, license, journal_levels, link):
    # Attributes are assigned in the same order as the constructor arguments
    self.date, self.author, self.title = date, author, title
    self.license, self.journal_levels, self.link = license, journal_levels, link

# Flag telling the parsing loop that the CSV header row is still to be written
isColumnName = True

# Load the proxy sheet used for parsing
proxies = pd.read_excel('/content/drive/MyDrive/Science/Datasets/proxies/proxies.xlsx')

# Only the first three proxies of the sheet are used
proxies_with_ports = [proxies['proxy_with_port'][row] for row in range(3)]

# The credentials of the first row are applied to every proxy
proxy_login = proxies['login'][0]
proxy_password = proxies['password'][0]

# Full proxy URLs: http://<login>:<password>@<proxy_with_port>
ip_addresses = [
  'http://' + proxy_login + ':' + proxy_password + '@' + proxy_with_port
  for proxy_with_port in proxies_with_ports
]

# Buffer of parsed articles waiting to be saved
articles = []

# Output file; rows are appended, so earlier runs are preserved
CSV_FILE_NAME = 'cyberleninka-sociology-articles__1-7-2_with-article-links.csv'

CSV_COLUMNS = [
  'article_publication_date',
  'article_author',
  'article_title',
  'article_license',
  'journal_levels',
  'article_link'
]

# CSS class of a journal-index badge -> label stored in the CSV
JOURNAL_LEVEL_LABELS = (
  ('vak', 'ВАК'),
  ('scopus', 'Scopus'),
  ('rsci', 'RSCI'),
  ('esci', 'ESCI')
)

# Seconds to wait for a proxy/server answer before giving up on a page
REQUEST_TIMEOUT = 30


def _parse_article(article_html):
  '''
  Build an Article from one <li> tag of a listing page.
  Every field that is missing in the markup is stored as 'EMPTY'
  (the license field is 'Yes'/'No' instead).
  '''
  # Year and author share one <span> formatted as "<year> / <author>"
  span = article_html.span
  year_and_author = span.get_text().split(' / ') if span is not None else ['']

  date = year_and_author[0]
  # A span without ' / ' has no author part; do not fail on it
  author = year_and_author[1] if len(year_and_author) > 1 else ''

  # The year is trusted only for a well-formed "<4 chars> / <author>" span
  if not (len(year_and_author) == 2 and len(date) == 4 and author):
    date = 'EMPTY'
  if not author:
    author = 'EMPTY'

  # Get a title
  title_tag = article_html.find('div', { 'class': 'title' })
  title = title_tag.get_text() if title_tag is not None else ''
  if not title:
    title = 'EMPTY'

  # Get a license: the label-cc badge marks a licensed article
  has_license = article_html.find('div', { 'class': 'label-cc' }) is not None
  license_flag = 'Yes' if has_license else 'No'

  # Get journal levels; each label keeps its trailing space so new rows
  # have the same format as rows appended to the file by earlier runs
  journal_levels = ''.join(
    label + ' '
    for css_class, label in JOURNAL_LEVEL_LABELS
    if article_html.find('div', { 'class': css_class }) is not None
  )
  if not journal_levels:
    journal_levels = 'EMPTY'

  # Get a link (the check is on the link itself, not on the author)
  anchor = article_html.a
  link = anchor.get('href') if anchor is not None else None
  if not link:
    link = 'EMPTY'

  return Article(date, author, title, license_flag, journal_levels, link)


def _append_articles_to_csv(parsed_articles, write_header):
  '''
  Append the articles to the CSV file, preceded by the header row
  when write_header is true.
  '''
  # newline='' is required by the csv module; utf-8 keeps Cyrillic text intact
  with open(CSV_FILE_NAME, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    if write_header:
      writer.writerow(CSV_COLUMNS)
    writer.writerows(
      [
        parsed.date,
        parsed.author,
        parsed.title,
        parsed.license,
        parsed.journal_levels,
        parsed.link
      ]
      for parsed in parsed_articles
    )


for path_number in range(2, 2492): # ATTENTION: Check this value before parsing
  # Rotate proxies: consecutive pages go through different proxies
  proxy_url = ip_addresses[path_number % len(ip_addresses)]
  url = "https://cyberleninka.ru/article/c/sociology/" + str(path_number)

  try:
    # Create a user's session; it is closed as soon as the page is downloaded
    with requests.Session() as user_session:
      response = user_session.get(
        url,
        proxies={ 'http': proxy_url, 'https': proxy_url },
        timeout=REQUEST_TIMEOUT
      )
      # Do not parse error pages as if they were listings
      response.raise_for_status()
      html_doc = response.content

    # Get HTML structure to parse
    soup = BeautifulSoup(html_doc, 'html.parser')
    listing = soup.ul
    articles_html = listing.find_all('li') if listing is not None else []

    # Get needed data
    articles = [_parse_article(article_html) for article_html in articles_html]

    # Save parsed data as a CSV file, one file open per page
    if articles:
      _append_articles_to_csv(articles, isColumnName)
      isColumnName = False

    # Show a log of parser process
    print('--- Page', path_number, ':', len(articles), 'articles saved ---')

  # Keep parsing the next pages after a failure, but report it.
  # KeyboardInterrupt is not an Exception, so the run can still be stopped
  except Exception as error:
    print('--- Page', path_number, 'skipped:', repr(error), '---')

  # Clear buffer to continue parsing
  articles = []

  # Set time interval between user sessions (also after a failed page,
  # so errors do not turn into rapid-fire requests)
  time_interval = random.randint(3, 5)
  time.sleep(time_interval)
  print(
    '--- Sleep finished in', time_interval, '.',
    # Print only the part after '@' so the proxy login/password stay out of the log
    'The current proxy:', proxy_url.rsplit('@', 1)[-1], '---'
  )