<a href="https://colab.research.google.com/github/R-Mosolov/notebook-by-google-colab/blob/main/cyberleninka_articles_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall pandas_profiling
!pip install pandas_profiling

In [None]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import csv
import pandas as pd
import random
import time
from pandas_profiling import ProfileReport

In [None]:
# Download the first sociology listing page and decode it as UTF-8 text.
# The `with` block closes the HTTP response even if `read()` raises.
with urllib.request.urlopen("https://cyberleninka.ru/article/c/sociology") as fp:
  mybytes = fp.read()

html_doc = mybytes.decode("utf8")

In [None]:
'''
Create data structure
'''
class Article:
  """Record holding the `date` and `author` values given to the constructor."""

  def __init__(self, date, author):
    self.date, self.author = date, author

In [None]:
'''
Add date and an author(-s) of a publication
'''
# Parse the page downloaded above. Building `soup` here keeps this cell
# runnable on a fresh kernel instead of relying on a later cell to create it.
soup = BeautifulSoup(html_doc, 'html.parser')

articles = []
unhandled_data = []

# Each <li> of the <ul class="list"> carries a <span> whose text is split on
# ' / ' into its parts. A missing list or span is skipped instead of raising
# AttributeError.
articles_list = soup.find('ul', attrs={ 'class': 'list' })
list_items = articles_list.find_all('li') if articles_list is not None else []

for fio in list_items:
  if fio.span is not None:
    unhandled_data.append(fio.span.get_text().split(' / '))

# Keep only entries that split into exactly two parts, whose first part is
# four characters long (presumably a year) and whose second part is non-empty.
for data in unhandled_data:
  date = data[0]
  if len(data) == 2 and len(date) == 4 and data[1] != '':
    author = data[1]
    articles.append(Article(date, author))

for article in articles:
  print(article.date)
  print(article.author)

In [None]:
'''
Save data to a CSV file
'''
# Write the header and one row per parsed article.
# The encoding is pinned to UTF-8 so non-ASCII author names are written the
# same way regardless of the platform's default encoding.
# NOTE(review): the dialect (space delimiter, ';' as quote character) is
# unusual but kept unchanged so existing readers of this file still work.
with open('cyberleninka-sociology-articles.csv', 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar=';', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['Publication Date', 'Author'])
    spamwriter.writerows([article.date, article.author] for article in articles)

In [None]:
'''
The combined algorithm to parse authors and publication dates
'''
class Article:
  """Two-field record: stores the constructor's `date` and `author` as attributes."""

  def __init__(self, date, author):
    # Both attributes are set in one statement, in the same order as before.
    self.date, self.author = date, author

# Crawl listing pages 2..2491 and store (date, author) rows in one CSV file.
#
# Fixes compared with the previous version of this cell:
#  * the header goes into the same file as the data (it used to overwrite
#    'cyberleninka-sociology-articles.csv' written by the earlier cell);
#  * rows are appended once per page (the append block used to sit inside the
#    per-article branch and re-wrote the whole page list for every article,
#    duplicating rows);
#  * the file is opened with newline='' as the csv module requires;
#  * a failed or malformed page no longer ends in an AttributeError.
output_path = 'cyberleninka-sociology-articles__correct-delimiter.csv'

# Loop-invariant request headers.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Start the output file with the header row. The dialect (space delimiter,
# ',' as quote character) matches the data rows appended below.
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile, delimiter=' ',
                          quotechar=',', quoting=csv.QUOTE_MINIMAL)
  writer.writerow(['Publication Date', 'Author'])

for path_number in range(2, 2492):
  # Step 1: download the page; stop loudly on a timeout or HTTP error so the
  # crawl can be resumed from `path_number`.
  url = "https://cyberleninka.ru/article/c/sociology/" + str(path_number)

  response = requests.get(url, headers=headers, timeout=30)
  response.raise_for_status()
  html_doc = response.content

  # Step 2: parse the markup.
  soup = BeautifulSoup(html_doc, 'html.parser')

  # Step 3: collect the ' / '-separated parts of every item's <span>.
  articles = []
  unhandled_data = []

  articles_list = soup.find('ul', attrs={ 'class': 'list' })
  list_items = articles_list.find_all('li') if articles_list is not None else []

  for fio in list_items:
    if fio.span is not None:
      unhandled_data.append(fio.span.get_text().split(' / '))

  # Keep entries with exactly two parts, a 4-character first part and a
  # non-empty second part.
  for data in unhandled_data:
    date = data[0]
    if len(data) == 2 and len(date) == 4 and data[1] != '':
      author = data[1]
      articles.append(Article(date, author))

  # Step 4: append this page's rows exactly once.
  with open(output_path, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ',
                            quotechar=',', quoting=csv.QUOTE_MINIMAL)
    writer.writerows([article.date, article.author] for article in articles)

  for article in articles:
    print(article.date)
    print(article.author)

In [None]:
'''
Analyze the dataset using Pandas Profiling
'''
# Read the previously collected CSV and build a profiling report for it.
dataset_path = '/content/drive/MyDrive/Science/Datasets/cyberleninka-sociology-articles/cyberleninka-sociology-articles_1-3__by-S-Yu-Sidorov.csv'
df = pd.read_csv(dataset_path)

profile = ProfileReport(
  df,
  title='CyberLeninka Sociology Articles',
  explorative=True
)

# Bare expression: the notebook renders the report as the cell output.
profile

In [None]:
'''
Import proxies to rotate the parser sessions
'''
# Read the proxy spreadsheet into a DataFrame.
proxies_path = '/content/drive/MyDrive/Science/Datasets/proxies/proxies.xlsx'
proxies = pd.read_excel(proxies_path)

# Example to get a proxy: print(proxies['proxy_with_port'][0])

In [None]:
'''
The template of time interval for delay before requests
'''
import random
import time

# Demonstrate a randomized pause: five rounds, each sleeping a whole number
# of seconds drawn from 1..5 and reporting before and after the sleep.
pause_bounds = (1, 5)

for attempt in range(5):
  time_interval = random.randint(*pause_bounds)
  print('Current time interval:', time_interval, 's.')
  time.sleep(time_interval)
  print('Sleep finished')

Current time interval: 1 s.
Sleep finished
Current time interval: 3 s.
Sleep finished
Current time interval: 4 s.
Sleep finished
Current time interval: 1 s.
Sleep finished
Current time interval: 1 s.
Sleep finished


In [None]:
'''
Show the HTML structure of the publications list to find the needed tags
'''
# Parse the downloaded markup with the built-in 'html.parser' backend and
# print it re-indented so the tag nesting can be read.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="telephone=no" name="format-detection"/>
  <link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="https://cyberleninka.ru/article/c/sociology/2" rel="canonical"/>
  <title>
   Темы научных статей по социологическим наукам из каталога электронной библиотеки КиберЛенинка
  </title>
  <meta content="Социологические науки – темы научных статей из каталога электронной библиотеки КиберЛенинка полные тексты научных работ CyberLeninka" name="keywords"/>
  <meta content="Социологические науки – темы научных статей из каталога электронной библиотеки КиберЛенинка" name="description"/>
  <link href="/app.css?4032018937" media="all" rel="stylesheet"/>
  <meta content="

In [None]:
class Article:
  """Record storing the five constructor arguments as same-named attributes."""

  def __init__(self, title, date, author, level, license):
    self.title, self.date, self.author = title, date, author
    self.level, self.license = level, license

# `articles_meta` starts empty; `articles_html` receives every <li> element
# found anywhere in the parsed page.
articles_meta = []
articles_html = soup.find_all(name='li')

# Bare expression: the notebook shows the collected elements as cell output.
articles_html

In [None]:
'''
Show paths to get extended indicators
'''
# Print, for every list item, the values reachable through each lookup path.
for article in articles_html:

  # `articles_html` holds every <li> of the page, so some items may lack the
  # title <div> or the <span>; skip those instead of raising AttributeError.
  title_div = article.find('div', { 'class': 'title' })
  meta_span = article.span
  if title_div is None or meta_span is None:
    continue

  # Split the span text once; both the date and the author come from it.
  year_and_author = meta_span.get_text().split(' / ')

  # Show titles
  print(title_div.get_text())

  # Show dates
  print(year_and_author[0])

  # Show authors ('EMPTY' when the span text has no ' / ' separator)
  print(year_and_author[1] if len(year_and_author) > 1 else 'EMPTY')

  # Show levels
  print(article.find('div', { 'class': 'vak' }))
  print(article.find('div', { 'class': 'scopus' }))
  print(article.find('div', { 'class': 'rsci' }))
  print(article.find('div', { 'class': 'esci' }))

  # Show licenses
  print(article.find('div', { 'class': 'label-cc' }))

  print('--- Конец информации о статье ---')

In [None]:
'''
The combined algorithm to parse authors and publication dates
with session rotation implemented
'''
# Create data structure
class Article:
  """Row of parsed data: keeps each constructor argument under the same attribute name."""

  def __init__(self, date, author, title, license, journal_levels):
    self.date, self.author, self.title = date, author, title
    self.license, self.journal_levels = license, journal_levels

# Initialize proxies for parsing
# Load the proxy spreadsheet.
proxies = pd.read_excel('/content/drive/MyDrive/Science/Datasets/proxies/proxies.xlsx')

# Entries labelled 0, 1 and 2 of the 'proxy_with_port' column are the
# endpoints; the login and password are taken from entry 0 only.
proxies_with_ports = [proxies['proxy_with_port'][row] for row in range(3)]
proxy_login = proxies['login'][0]
proxy_password = proxies['password'][0]

# One 'http://login:password@endpoint' URL per endpoint, in the same order.
credentials_prefix = 'http://' + proxy_login + ':' + proxy_password + '@'
ip_addresses = [credentials_prefix + endpoint for endpoint in proxies_with_ports]
  
# Crawl listing pages 497..2491 through rotating proxies and store one CSV
# row per list item.
#
# Fixes compared with the previous version of this cell:
#  * the CSV is no longer rewritten from scratch for every single article;
#    the header is written once and each page's rows are appended once;
#  * the per-page log prints only that page's articles, not everything
#    collected so far;
#  * the log no longer prints the proxy URL with its login and password;
#  * items without a <span>, a title <div> or an author part no longer raise
#    AttributeError / IndexError;
#  * the request dict no longer rebinds the `proxies` DataFrame name;
#  * each session is closed, and requests have a timeout and a status check.
articles = []

output_path = 'cyberleninka-sociology-articles__sessions-rotation_part-2.csv'

# CSS class of the marker <div> looked up in each item -> label stored in
# the journal_levels column.
journal_level_labels = (
  ('vak', 'ВАК'),
  ('scopus', 'Scopus'),
  ('rsci', 'RSCI'),
  ('esci', 'ESCI')
)

# Start the output file with the header row; data rows are appended below.
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
  csv.writer(csvfile).writerow([
    'article_publication_date',
    'article_author',
    'article_title',
    'article_license',
    'journal_levels'
  ])

for path_number in range(497, 2492):

  # Create a user's session through the proxy selected for this page.
  url = "https://cyberleninka.ru/article/c/sociology/" + str(path_number)

  proxy_index = path_number % len(ip_addresses)
  request_proxies = {
    'http': ip_addresses[proxy_index],
    'https': ip_addresses[proxy_index]
  }

  # The body is downloaded inside the `with`, so closing the session
  # afterwards is safe. A timeout or HTTP error stops the crawl loudly so it
  # can be resumed from `path_number`.
  with requests.Session() as user_session:
    response = user_session.get(url, proxies=request_proxies, timeout=30)
  response.raise_for_status()
  html_doc = response.content

  # Get the HTML structure to parse: the <li> items of the first <ul>.
  soup = BeautifulSoup(html_doc, 'html.parser')
  first_list = soup.ul
  articles_html = first_list.find_all('li') if first_list is not None else []

  # Get needed data
  page_articles = []
  for article_html in articles_html:
    meta_span = article_html.span
    title_div = article_html.find('div', { 'class': 'title' })
    if meta_span is None or title_div is None:
      # This <li> lacks the elements every field is read from; skip it.
      continue

    year_and_author = meta_span.get_text().split(' / ')

    # Get a date: accepted only when the text splits into exactly two parts,
    # the first is 4 characters long and the second is non-empty.
    date = year_and_author[0]
    if not (len(year_and_author) == 2 and len(date) == 4 and year_and_author[1] != ''):
      date = 'EMPTY'

    # Get an author: the second part when present and non-empty.
    if len(year_and_author) > 1 and year_and_author[1]:
      author = year_and_author[1]
    else:
      author = 'EMPTY'

    # Get a title
    title = title_div.get_text() or 'EMPTY'

    # Get a license: 'Yes' when a <div class="label-cc"> is present.
    if article_html.find('div', { 'class': 'label-cc' }) is not None:
      license_label = 'Yes'
    else:
      license_label = 'No'

    # Get journal levels: every present label followed by one space (the
    # trailing space is kept so the column format stays unchanged).
    journal_levels = ''.join(
      label + ' '
      for css_class, label in journal_level_labels
      if article_html.find('div', { 'class': css_class }) is not None
    )

    # Put data about an article to this page's list.
    page_articles.append(Article(date, author, title, license_label, journal_levels))

  articles.extend(page_articles)

  # Save this page's rows; appending once per page keeps the file up to date
  # without rewriting earlier rows.
  with open(output_path, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(
      [
        parsed.date,
        parsed.author,
        parsed.title,
        parsed.license,
        parsed.journal_levels
      ]
      for parsed in page_articles
    )

  # Show a log of the parser process for this page only.
  for parsed in page_articles:
    print(parsed.date)
    print(parsed.author)
    print(parsed.title)
    print(parsed.license)
    print(parsed.journal_levels)

  # Set time interval between user sessions
  time_interval = random.randint(3, 5)
  time.sleep(time_interval)
  # Log the proxy endpoint only; the full proxy URL embeds the credentials.
  print(
    '--- Sleep finished in', time_interval, '.',
    'The current proxy:', proxies_with_ports[proxy_index], '---'
  )