<a href="https://colab.research.google.com/github/R-Mosolov/sociology-scientometric-analysis/blob/main/parse_articles_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import csv
import pandas as pd
import random
import time
import os

In [None]:
'''
Download one sample article page and show its parsed HTML
'''
# The context manager closes the connection even when reading fails,
# so the socket is never leaked
with urllib.request.urlopen("https://cyberleninka.ru/article/n/metodologicheskie-problemy-sravnitelnogo-analiza-paradigm-tehnicheskoy-realnosti") as fp:
  mybytes = fp.read()

html_doc = mybytes.decode("utf8")

# Get HTML structure to parse
soup = BeautifulSoup(html_doc, 'html.parser')
article_html = soup

# Last expression of the cell: the notebook renders the parsed document
article_html

In [None]:
# Preview the raw <span> tags that hold the article's keywords
keywords_container = article_html.body.find('i', attrs={'itemprop': 'keywords'})
keywords_container.find_all('span')

In [None]:
'''
Define the parse targets that carry useful information
'''
page_body = article_html.body

# Popularity counters shown on the article page
views = page_body.find('div', attrs={'class': 'views'}).get_text()
downloads = page_body.find('div', attrs={'class': 'downloads'}).get_text()

# The journal title and its hyperlink come from the same anchor tag
journal_anchor = page_body.find('div', attrs={'class': 'half'}).span.a
journal_title = journal_anchor.get_text()
journal_link = journal_anchor['href']

abstract = page_body.find('div', attrs={'class': 'abstract'}).p.get_text()

# Collect all keywords in lower case
key_words_html = page_body.find('i', attrs={'itemprop': 'keywords'}).find_all('span')
key_words = [keyword_tag.get_text().lower() for keyword_tag in key_words_html]

# Collect the text of every paragraph on the page
article_text_html = page_body.find_all('p')
article_text = [paragraph_tag.get_text() for paragraph_tag in article_text_html]

# Print every extracted field next to its label
print('--- Содержимое статьи ---')
for label, value in (
  ('Просмотров:', views),
  ('Скачиваний:', downloads),
  ('Название журнала:', journal_title),
  ('Гиперссылка журнала:', journal_link),
  ('Ключевые слова:', key_words),
  ('Аннотация:', abstract),
  ('Текст статьи:', article_text),
):
  print(label, value)

In [None]:
'''
Load the dataset of article links and show basic information about it
'''
articles_links_csv = '/content/drive/MyDrive/Science/Datasets/cyberleninka-sociology-articles/cyberleninka-sociology-articles__1-7-4_gen-with-article-links.csv'
df = pd.read_csv(articles_links_csv)

# Report how many rows (article links) were loaded
rows_count = len(df)
print('Датасет с ссылками на статьи содержит:', rows_count, 'строк')

# Last expression of the cell: the notebook renders the first five rows
df.head(n=5)

In [None]:
'''
Put all article links into one plain list
'''
articles_links = df['article_link']

# Iterating the column yields its values one by one, so list() copies them
only_articles_links = list(articles_links)

In [None]:
'''
The combined algorithm that parses article information
and rotates user sessions across proxies
'''
# Data structure describing one parsed article
class Article:
  '''
  Plain record that stores every value collected for a single article
  '''
  def __init__(
    self, article_number, article_link, article_views, article_downloads,
    journal_title, journal_link, key_words, abstract, article_text
  ):
    # Position in the parsing run and the article address
    self.article_number, self.article_link = article_number, article_link
    # Popularity counters
    self.article_views, self.article_downloads = article_views, article_downloads
    # Journal that published the article
    self.journal_title, self.journal_link = journal_title, journal_link
    # Content of the article itself
    self.key_words, self.abstract, self.article_text = (
      key_words, abstract, article_text
    )

# --- Parser settings ---
is_column_name = True  # True until the CSV header row has been written
proxies_quantity = 6  # how many rows of the proxies sheet take part in the rotation
result_file_name = 'cyberleninka-sociology-articles__2-1-2_articles-content.csv'
EMPTY = 'EMPTY'  # stored when a field is found but has no content
ERROR = 'ERROR'  # stored when a field (or the whole page) could not be read
REQUEST_TIMEOUT = 30  # seconds; without a timeout a dead proxy blocks the run forever
CSV_COLUMNS = [
  'article_number',
  'article_link',
  'article_views',
  'article_downloads',
  'journal_title',
  'journal_link',
  'key_words',
  'abstract',
  'article_text'
]


def _extract(getter):
  '''
  Call getter() and return its value, EMPTY for a falsy value
  or ERROR when the call raises.

  Only Exception is caught, so KeyboardInterrupt still stops the parser.
  '''
  try:
    value = getter()
  except Exception:
    return ERROR
  return value if value else EMPTY


def _parse_article_fields(article_html):
  '''
  Extract the article fields from a parsed page.

  Returns a list in the column order: views, downloads, journal title,
  journal link, key words, abstract, article text. Every field is
  extracted independently, so one missing tag does not affect the others.
  '''
  body = article_html.body
  return [
    _extract(lambda: body.find('div', attrs={ 'class': 'views' }).get_text()),
    _extract(lambda: body.find('div', attrs={ 'class': 'downloads' }).get_text()),
    _extract(lambda: body.find('div', attrs={ 'class': 'half' }).span.a.get_text()),
    _extract(lambda: body.find('div', attrs={ 'class': 'half' }).span.a['href']),
    _extract(lambda: [
      key_word.get_text().lower()
      for key_word in body.find('i', attrs={ 'itemprop': 'keywords' }).find_all('span')
    ]),
    _extract(lambda: body.find('div', attrs={ 'class': 'abstract' }).p.get_text()),
    _extract(lambda: [paragraph.get_text() for paragraph in body.find_all('p')]),
  ]


def _fetch_article_html(url, proxy_url):
  '''
  Download url through proxy_url in a fresh session and return the parsed page.

  The session is closed as soon as the response has been read. Network
  errors and timeouts propagate to the caller.
  '''
  request_proxies = { 'http': proxy_url, 'https': proxy_url }
  with requests.Session() as user_session:
    response = user_session.get(url, proxies=request_proxies, timeout=REQUEST_TIMEOUT)
  return BeautifulSoup(response.content, 'html.parser')


df = pd.read_csv('/content/drive/MyDrive/Science/Datasets/cyberleninka-sociology-articles/cyberleninka-sociology-articles__1-7-4_gen-with-article-links.csv')

# Put all articles links into one list
articles_links = df['article_link']
only_articles_links = list(articles_links)

# TODO: Delete this block after finishing the parser work
resized_df = slice(2646, len(only_articles_links))
only_articles_links = only_articles_links[resized_df]

# Initialize proxies for parsing
proxies = pd.read_excel('/content/drive/MyDrive/Science/Datasets/proxies/proxies.xlsx')

proxies_with_ports = [
  proxies['proxy_with_port'][proxy_index]
  for proxy_index in range(proxies_quantity)
]
# One login/password pair (first row of the sheet) is used for every proxy
proxy_login = proxies['login'][0]
proxy_password = proxies['password'][0]

ip_addresses = [
  f'http://{proxy_login}:{proxy_password}@{proxy_with_port}'
  for proxy_with_port in proxies_with_ports
]

# Remove the old result file; it is the same path the loop appends to below
try:
  os.remove(result_file_name)
except FileNotFoundError:
  pass

counter = 0

for counter, article_link in enumerate(only_articles_links, start=1):
  # Rotate proxies: every next article goes through the next address
  proxy_url = ip_addresses[counter % proxies_quantity]
  url = "https://cyberleninka.ru" + str(article_link)

  try:
    article_html = _fetch_article_html(url, proxy_url)
  except Exception as error:
    # Only the exception type is printed: the message may contain the proxy credentials
    print('Failed to load', url, '-', type(error).__name__)
    # The whole row, including the link, is marked as failed (original output format)
    saved_link = ERROR
    fields = [ERROR] * 7
  else:
    saved_link = article_link
    fields = _parse_article_fields(article_html)

  article = Article(counter, saved_link, *fields)

  # Append the row to the CSV file right away, so an interrupted run
  # loses at most the article being processed
  with open(result_file_name, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    if is_column_name:
      writer.writerow(CSV_COLUMNS)
      is_column_name = False
    writer.writerow([
      article.article_number,
      article.article_link,
      article.article_views,
      article.article_downloads,
      article.journal_title,
      article.journal_link,
      article.key_words,
      article.abstract,
      article.article_text
    ])

  # Set time interval between user sessions
  time_interval = random.randint(3, 5) # TODO: Change it before running the parser
  time.sleep(time_interval)
  print('Now, the following proxy has used:', proxy_url.split('@')[1])
  print('Sleep finished in', time_interval, 's.')