<a href="https://colab.research.google.com/github/R-Mosolov/notebook-by-google-colab/blob/main/cyberleninka_articles_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall pandas_profiling
!pip install pandas_profiling

In [None]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import csv
import pandas as pd
from pandas_profiling import ProfileReport

In [61]:
fp = urllib.request.urlopen("https://cyberleninka.ru/article/c/sociology")
mybytes = fp.read()

html_doc = mybytes.decode("utf8")
fp.close()

In [None]:
'''
Create data structure
'''
class Article:
  def __init__(self, date, author):
    self.date = date
    self.author = author

In [None]:
'''
Add date and an author(-s) of a publication
'''
articles = []
unhandled_data = []

for fio in soup.find('ul', attrs={ 'class': 'list' }).find_all('li'):
  unhandled_data.append(fio.span.get_text().split(' / '))

handled_data = []
for data in unhandled_data:
  date = data[0]
  if len(data) == 2 and len(date) == 4 and data[1] != '':
    author = data[1] 
    articles.append(Article(date, author))

for article in articles:
  print(article.date)
  print(article.author)

In [None]:
'''
Save data to a CSV file
'''
with open('cyberleninka-sociology-articles.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar=';', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['Publication Date', 'Author'])
    for article in articles:
      spamwriter.writerow([article.date, article.author])

In [None]:
'''
The combinated algorithm to parse authors and publication dates
'''
class Article:
  def __init__(self, date, author):
    self.date = date
    self.author = author

with open('cyberleninka-sociology-articles.csv', 'w', newline='') as csvfile:
  writer = csv.writer(csvfile, delimiter=' ',
                          quotechar=';', quoting=csv.QUOTE_MINIMAL)
  writer.writerow(['Publication Date', 'Author'])

for path_number in range(2, 2492):
  # Step 1
  url = "https://cyberleninka.ru/article/c/sociology/" + str(path_number)
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

  response = requests.get(url, headers=headers)
  html_doc = response.content

  # Step 2
  soup = BeautifulSoup(html_doc, 'html.parser')

  # Step 3
  articles = []
  unhandled_data = []

  for fio in soup.find('ul', attrs={ 'class': 'list' }).find_all('li'):
    unhandled_data.append(fio.span.get_text().split(' / '))

  handled_data = []
  for data in unhandled_data:
    date = data[0]
    if len(data) == 2 and len(date) == 4 and data[1] != '':
      author = data[1] 
      articles.append(Article(date, author))

      # Step 4
      with open('cyberleninka-sociology-articles__correct-delimiter.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ',
                                quotechar=',', quoting=csv.QUOTE_MINIMAL)
        for article in articles:
          writer.writerow([article.date, article.author])

  for article in articles:
    print(article.date)
    print(article.author)

In [None]:
'''
Analize the dataset by using Pandas Profiling
'''
df = pd.read_csv('/content/drive/MyDrive/Science/Datasets/cyberleninka-sociology-articles/cyberleninka-sociology-articles_1-3__by-S-Yu-Sidorov.csv')

profile = ProfileReport(df, title='CyberLeninka Sociology Articles', explorative=True)

profile

In [None]:
'''
The research target #1:
Average quantity of authors in common publications
'''
authors = df['Author']
authors

ordinary_publications = 0
common_publications = 0

two_co_authors = 0
three_co_authors = 0
four_co_authors = 0
five_co_authors = 0
more_than_five_co_authors = 0

for author in authors:
  if len(author.split(', ')) > 1:
    common_publications += 1

    # Calculate co-authors quantity
    if len(author.split(', ')) == 2:
      two_co_authors += 1
    if len(author.split(', ')) == 3:
      three_co_authors += 1
    if len(author.split(', ')) == 4:
      four_co_authors += 1
    if len(author.split(', ')) == 5:
      five_co_authors += 1
    if len(author.split(', ')) > 5:
      more_than_five_co_authors += 1
  else:
    ordinary_publications += 1

print('--- Статистика по публикациям ---')
print('Всего публикаций:', ordinary_publications + common_publications, 'шт.')
print('Одиночных публикаций:', ordinary_publications, 'шт.')
print('Коллективных публикаций:', common_publications, 'шт.')

print('--- Статистика по соавторам ---')
print('Публикаций с 2 соавторами:', two_co_authors, 'шт.')
print('Публикаций с 3 соавторами:', three_co_authors, 'шт.')
print('Публикаций с 4 соавторами:', four_co_authors, 'шт.')
print('Публикаций с 5 соавторами:', five_co_authors, 'шт.')
print('Публикаций с более, чем 5 соавторами:', more_than_five_co_authors, 'шт.')

--- Статистика по публикациям ---
Всего публикаций: 506459 шт.
Одиночных публикаций: 355787 шт.
Коллективных публикаций: 150672 шт.
--- Статистика по соавторам ---
Публикаций с 2 соавторами: 108260 шт.
Публикаций с 3 соавторами: 31528 шт.
Публикаций с 4 соавторами: 7597 шт.
Публикаций с 5 соавторами: 2033 шт.
Публикаций с более, чем 5 соавторами: 1254 шт.


In [None]:
'''
Import proxies to rotate the parser sessions
'''
proxies = pd.read_excel('/content/drive/MyDrive/Science/Datasets/proxies/proxies.xlsx')

# Example to get a proxy: print(proxies['proxy'][0])

In [72]:
'''
The template of time interval for delay before requests
'''
import random
import time

for i in range(5):
  time_interval = random.randint(1, 5)
  print('Current time interval:', time_interval, 's.')
  time.sleep(time_interval)
  print('Sleep finished')

Current time interval: 1 s.
Sleep finished
Current time interval: 3 s.
Sleep finished
Current time interval: 4 s.
Sleep finished
Current time interval: 1 s.
Sleep finished
Current time interval: 1 s.
Sleep finished


In [None]:
'''
Show HTML structure of pubclications list to find needed tags
'''
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())

In [75]:
class Article:
  def __init__(self, title, date, author, level, license):
    self.title = title
    self.date = date
    self.author = author
    self.level = level
    self.license = license

articles_html = soup.find_all('li')
articles_meta = []
articles_html

[<li>
 <a href="/article/n/metodologicheskie-problemy-sravnitelnogo-analiza-paradigm-tehnicheskoy-realnosti"><div class="title">Методологические проблемы сравнительного анализа парадигм технической реальности</div>
 <p>Представлено исследование актуальной философской проблемы разработки методологических принципов анализа современной технической реальности. Рассматриваются основные подходы к ее анализу с позиций философских, естественнонаучных и социально-гуманитарных парадигм. Отмечается качественное изменение и...</p>
 <span>2008 / Елькина Елена Евграфовна</span>
 <div class="labels">
 <div class="label vak">ВАК</div>
 <div class="label-cc"><img class="black" src="/images/tsvg/cc-label.svg"/></div></div>
 </a>
 </li>, <li>
 <a href="/article/n/politika-upravleniya-chelovecheskimi-resursami"><div class="title">Политика управления человеческими ресурсами</div>
 <p></p>
 <span>2001 / Быченко Юрий Григорьевич</span>
 <div class="labels">
 <div class="label vak">ВАК</div>
 <div class="labe

In [92]:
'''
Show paths to get exetended indicators
'''
for article in articles_html:

  # Show titles
  print(article.find('div', { 'class': 'title' }).get_text())

  # Show dates
  print(article.span.get_text().split(' / ')[0])

  # Show authors
  print(article.span.get_text().split(' / ')[1])

  # Show levels
  print(article.find('div', { 'class': 'vak' }))
  print(article.find('div', { 'class': 'scopus' }))
  print(article.find('div', { 'class': 'rsci' }))
  print(article.find('div', { 'class': 'esci' }))

  # Show licenses
  print(article.find('div', { 'class': 'label-cc' }))

  print('--- Конец информации о статье ---')

Методологические проблемы сравнительного анализа парадигм технической реальности
2008
Елькина Елена Евграфовна
<div class="label vak">ВАК</div>
None
None
None
<div class="label-cc"><img class="black" src="/images/tsvg/cc-label.svg"/></div>
--- Конец информации о статье ---
Политика управления человеческими ресурсами
2001
Быченко Юрий Григорьевич
<div class="label vak">ВАК</div>
None
None
None
<div class="label-cc"><img class="black" src="/images/tsvg/cc-label.svg"/></div>
--- Конец информации о статье ---
Уровень и качество жизни населения
2009
Чуличков Евгений Анатольевич
None
None
None
None
<div class="label-cc"><img class="black" src="/images/tsvg/cc-label.svg"/></div>
--- Конец информации о статье ---
Declaration of the Convention of Independent Sociological Centers of Russia
2009

None
<div class="label scopus">Scopus</div>
<div class="label rsci">RSCI</div>
<div class="label esci">ESCI</div>
<div class="label-cc"><img class="black" src="/images/tsvg/cc-label.svg"/></div>
--- Коне

AttributeError: ignored