In [None]:
# Install the libraries needed to scrape articles from The Economist
!pip install requests
!pip install beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# NOTE(review): `page` is not assigned by any earlier cell in this notebook; it is first bound
# inside the scraping loop further down. On a fresh kernel this cell presumably raises
# NameError — confirm, then either fetch a page here or drop the cell.
soup = BeautifulSoup(page.text, features="html.parser")

In [None]:
# Search keywords that, in our view, best reflect the subject area of the corpus being built
topics = [
    'biology', 'ecology', 'medicine', 'genetics', 'virology', 'zoology',
    'immunology', 'physiology', 'cell%20biology', 'evolution',
    'microbiology', 'biochemistry',
]

# URLs of search-result pages 1..10 for every keyword, grouped by keyword in list order
all_pages = [
    f'https://www.economist.com/search?q={topic}&page={i}'
    for topic in topics
    for i in range(1, 11)
]

In [None]:
# Scrape every search-result page and then each article it links to, pausing 1 second
# before every article request
import time

REQUEST_TIMEOUT = 30  # seconds to wait on a single HTTP request before giving up on it

text_lst = []
title_lst = []
date_lst = []
link_lst = []
seen_titles = set()  # titles already stored; gives a constant-time duplicate check

for link_page in all_pages:
  try:
    page_results = requests.get(link_page, timeout=REQUEST_TIMEOUT)
  except requests.RequestException as error:
    # One unreachable result page must not abort the whole crawl
    print(link_page, error)
    continue
  soup = BeautifulSoup(page_results.text, features="html.parser")
  text_raw = soup.find_all('a', class_= '_search-result', href=True)
  for text in text_raw:
    link = text['href']
    time.sleep(1)
    try:
      with requests.get(link, stream=True, timeout=REQUEST_TIMEOUT) as page:
        soup_page = BeautifulSoup(page.text, features="html.parser")
    except requests.RequestException as error:
      print(link, error)
      continue

    # A page without <h1> is reported and skipped. The previous bare `except: ... break`
    # abandoned every remaining result of the current search page and also swallowed
    # unrelated errors such as KeyboardInterrupt.
    heading = soup_page.find('h1')
    if heading is None:
      print(link)
      continue
    title = heading.text

    # Articles without a <time> element are not stored
    date_tag = soup_page.find('time')
    if date_tag is None:
      continue
    date = date_tag.text

    # Body paragraphs live in <section data-body-id=...>; empty paragraphs and paragraphs
    # whose direct parent is <audio> are ignored
    paragraphs = [
      txt.text
      for section in soup_page.find_all('section')
      if section.has_attr('data-body-id')
      for txt in section.find_all('p')
      if txt.parent.name != 'audio' and len(txt.text) != 0
    ]
    # Join with a space so the last word of one paragraph is not fused with the first
    # word of the next one
    article = " ".join(paragraphs)

    if len(article) != 0 and title not in seen_titles:
      seen_titles.add(title)
      title_lst.append(title)
      date_lst.append(date)
      text_lst.append(article)
      link_lst.append(link)

In [None]:
# Put the scraped fields into a table, one row per stored article
df = pd.DataFrame(
    dict(text=text_lst, title=title_lst, date=date_lst, link=link_lst)
)
df.head()

Unnamed: 0,text,title,date,link
0,FOR THE past four billion years or so the only...,The promise and perils of synthetic biology,Apr 4th 2019,https://www.economist.com/leaders/2019/04/04/t...
1,IN A former leatherworks just off Euston Road ...,Will artificial intelligence help to crack bio...,Jan 7th 2017,https://www.economist.com/science-and-technolo...
2,“How many cells are there in a human being?” I...,The idea of “holobionts” represents a paradigm...,Jun 14th 2023,https://www.economist.com/science-and-technolo...
3,LIVING creatures are jolly useful. Farmers rea...,The remarkable promise of cell-free biology,May 4th 2017,https://www.economist.com/leaders/2017/05/04/t...
4,"A broken brain, hidden inside a skull, is hard...",Better brain biology will deliver better medic...,Sep 21st 2022,https://www.economist.com/technology-quarterly...


In [None]:
# Check each column for missing or unavailable values (NaN)
df.isnull().any()

In [None]:
# Summarise the DataFrame: number of rows and columns, dtypes, non-null counts per column
df.info()

In [None]:
# Look for articles whose text repeats an earlier row
is_repeat = df.duplicated(['text'])
rows = df.loc[is_repeat, ['text', 'link']].sort_values('text')
rows

In [None]:
# Save the resulting DataFrame as CSV (the row index is written as the first, unnamed column)
df.to_csv('biology.csv')

In [None]:
# Preprocess the text
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE(review): this rebinds the name `stopwords` from the imported corpus reader to the
# English stop-word list; cleantext reads the list through this same name.
stopwords = stopwords.words('english')

def cleantext(df, stop_words=None):
    """Add normalised copies of ``df['text']`` to ``df``.

    Two columns are written:

    * ``cleaned_text`` - URLs removed, lower-cased, reduced to ``a-z0-9`` tokens
      separated by single spaces;
    * ``fully_cleaned_text`` - ``cleaned_text`` with stop words removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the module-level ``stopwords``
        list is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added.
    """
    if stop_words is None:
        stop_words = stopwords
    stop_set = set(stop_words)  # set membership instead of scanning a list per word

    # URLs must go first, while "://" and "." are still present. The former pattern
    # `https.*$` ran after punctuation had been stripped and deleted everything from
    # the first "https" to the end of the article.
    cleaned = df['text'].replace(r'(?:https?://|www\.)\S+', ' ', regex=True)

    # Drop punctuation that sits inside tokens, so "don't" -> "dont" and
    # "cell-free" -> "cellfree". Newlines are deliberately NOT in this set: deleting
    # them fused the words on either side.
    cleaned = cleaned.replace(r'[\'",.?+\-/=()]', '', regex=True)

    # Lower-case
    cleaned = cleaned.str.lower()

    # Everything else that is not a letter or digit becomes a separator. This also
    # covers what the former second punctuation pattern was meant to do; that pattern
    # was malformed (its character class closed right after "[") and matched nothing
    # useful.
    cleaned = cleaned.replace(r'[^a-z0-9]', ' ', regex=True)

    # Collapse runs of whitespace. The former `replace("  ", " ")` had no `regex=True`,
    # so it only replaced cells equal to two spaces and never touched real text.
    cleaned = cleaned.replace(r'\s+', ' ', regex=True).str.strip()

    df['cleaned_text'] = cleaned

    # Remove stop words
    df['fully_cleaned_text'] = cleaned.apply(
        lambda x: ' '.join(word for word in str(x).split() if word not in stop_set)
    )

    return df

# Add the cleaned_text / fully_cleaned_text columns to the scraped table
df = cleantext(df)

In [None]:
# Show the DataFrame that now includes the preprocessed texts
df.head()

In [None]:
# Column summary after preprocessing
df.info()

In [None]:
# Save the table with the preprocessed columns (the row index is written as the first, unnamed column)
df.to_csv('biology_new.csv')

In [None]:
# Reload the preprocessed corpus. The relative path matches the one the file was saved
# under, so the cell does not depend on the working directory being Colab's /content.
df = pd.read_csv('biology_new.csv')
# Selecting the columns by name drops the unnamed index column that to_csv wrote
df = df[['text', 'title', 'date', 'link', 'cleaned_text', 'fully_cleaned_text']]
df.head()

Unnamed: 0,text,title,date,link,cleaned_text,fully_cleaned_text
0,FOR THE past four billion years or so the only...,The promise and perils of synthetic biology,Apr 4th 2019,https://www.economist.com/leaders/2019/04/04/t...,for the past four billion years or so the only...,past four billion years way life earth produce...
1,IN A former leatherworks just off Euston Road ...,Will artificial intelligence help to crack bio...,Jan 7th 2017,https://www.economist.com/science-and-technolo...,in a former leatherworks just off euston road ...,former leatherworks euston road london hopeful...
2,“How many cells are there in a human being?” I...,The idea of “holobionts” represents a paradigm...,Jun 14th 2023,https://www.economist.com/science-and-technolo...,how many cells are there in a human being it...,many cells human sounds like question nerdy pu...
3,LIVING creatures are jolly useful. Farmers rea...,The remarkable promise of cell-free biology,May 4th 2017,https://www.economist.com/leaders/2017/05/04/t...,living creatures are jolly useful farmers rear...,living creatures jolly useful farmers rear ani...
4,"A broken brain, hidden inside a skull, is hard...",Better brain biology will deliver better medic...,Sep 21st 2022,https://www.economist.com/technology-quarterly...,a broken brain hidden inside a skull is harder...,broken brain hidden inside skull harder diagno...


In [None]:
# Count the word tokens (whitespace-separated) in the raw texts of the corpus.
# Vectorised string ops replace the row-by-row iterrows() loop; int() keeps `w` a plain int.
w = int(df['text'].str.split().str.len().sum())
w

859344

In [None]:
# Number of texts in the corpus
len(df['text'])

962

Код для сбора корпуса научных статей в дальнейшем будет преобразован под нужды настоящего исследования в цикл для сбора статей в dataset.

In [None]:
# Install the library used to parse PDF files
! pip install PyPDF2



In [None]:
import PyPDF2
from google.colab import files
# Open the Colab upload dialog, then print each key of the mapping it returns
data = files.upload()
for item in data:
  print(item)

Saving BOTANY 1.pdf to BOTANY 1 (1).pdf
Saving BOTANY 2.pdf to BOTANY 2 (1).pdf
Saving BOTANY 3.pdf to BOTANY 3 (1).pdf
Saving BOTANY 4.pdf to BOTANY 4 (1).pdf
Saving BOTANY 5.pdf to BOTANY 5 (1).pdf
Saving BOTANY 6.pdf to BOTANY 6 (1).pdf
Saving BOTANY 7.pdf to BOTANY 7 (1).pdf
Saving BOTANY 8.pdf to BOTANY 8 (1).pdf
Saving BOTANY 9.pdf to BOTANY 9 (1).pdf
Saving BOTANY 10.pdf to BOTANY 10 (1).pdf
BOTANY 1 (1).pdf
BOTANY 2 (1).pdf
BOTANY 3 (1).pdf
BOTANY 4 (1).pdf
BOTANY 5 (1).pdf
BOTANY 6 (1).pdf
BOTANY 7 (1).pdf
BOTANY 8 (1).pdf
BOTANY 9 (1).pdf
BOTANY 10 (1).pdf


In [None]:
# Для предобработки текста воспользуемся регулярными выражениями
import re

In [None]:
# Callable that cuts trailing back-matter sections off an article
class DeleteChapters():
  """Remove the references and acknowledgments sections from an article's text.

  The text is cut at the first occurrence of a references heading ("REFERENCES" or
  "References"); what remains is then cut at the first acknowledgments heading.
  Both the "Acknowledgments" and "Acknowledgements" spellings are recognised, and a
  heading that is the very last thing in the text is removed as well.
  """

  _REFERENCES = re.compile(r"REFERENCES|References")
  _ACKNOWLEDGMENTS = re.compile(r"ACKNOWLEDGE?MENTS|Acknowledge?ments")

  def __init__(self):
    pass

  def __call__(self, text):
    """Return ``text`` truncated before its references / acknowledgments sections.

    Text containing neither heading is returned unchanged.
    """
    text_new = text
    for heading in (self._REFERENCES, self._ACKNOWLEDGMENTS):
      found = heading.search(text_new)
      if found:
        # Everything from the heading to the end of the text is dropped
        text_new = text_new[:found.start()]
    return text_new

In [None]:
# Use glob to collect every uploaded file that matches the pattern
import glob
import hashlib

# Sorted so the corpus is assembled in the same order on every run
files_path = sorted(glob.glob("/content/BOTANY*.pdf"))
delete_chapter = DeleteChapters()

seen_digests = set()  # content hashes of the PDFs already read
file_texts = []

for path in files_path:
  with open(path, 'rb') as file:
    # A re-upload is stored under a new name ("BOTANY 1 (1).pdf") next to the first copy
    # and still matches the pattern; identical files are skipped so that no paper enters
    # the corpus twice.
    digest = hashlib.sha256(file.read()).hexdigest()
    if digest in seen_digests:
      continue
    seen_digests.add(digest)
    file.seek(0)

    pdf = PyPDF2.PdfReader(file)
    # `or ""` guards against a page for which no text is returned
    page_texts = [(page.extract_text() or "").strip() for page in pdf.pages]
  # Pages are joined with a newline so the last word of one page is not fused with the
  # first word of the next
  file_text = "\n".join(page_texts)
  file_texts.append(delete_chapter(file_text))

# Same separator between documents
text = "\n".join(file_texts)
# Preview only; displaying the whole corpus would flood the cell output
text[:1000]

'Contents lists available at ScienceDirect\nEnvironmental and Experimental Botany\njournal homepage: www.elsevier.com/locate/envexpbot\nContrasting responses of stomatal conductance and photosynthetic capacity\nto warming and elevated CO 2in the tropical tree species Alchornea\nglandulosa under heatwave conditions\nSophie Fauseta,⁎, Lauana Oliveirab, Marcos S. Buckeridgeb, Christine H. Foyerc, David Galbraitha,\nRakesh Tiwaria, Manuel Gloora\naSchool of Geography, University of Leeds, Leeds, LS2 9JT, UK\nbInstituto de Biociências, Universidade de São Paulo, São Paulo, 05508-090, Brazil\ncCentre for Plant Sciences, University of Leeds, Leeds, LS2 9JT, UK\nARTICLE INFO\nKeywords:\nPhotosynthesisClimate change\nFactorial experiment\nTropical forest\nWarming\nCarbon dioxide\nLeaf temperature\nVcmax\nJ\nmax\nTemperature optimaOpen top chamber\nPhotosynthetic capacityABSTRACT\nFactorial experiments of combined warming and elevated CO 2are rarely performed but essential for our un-\nderstandi

In [None]:
# Count the word tokens (whitespace-separated) in the corpus
s = text
words = s.split()
num_words = len(words)
print("text:", num_words)

text: 122614


In [None]:
def save_file(name, text, encoding='utf-8'):
  """Write ``text`` to the file ``name``, replacing any existing content.

  ``encoding`` is passed to ``open``. It defaults to UTF-8 so that non-ASCII characters
  in the corpus (accented author names, en dashes, ...) are written the same way on
  every platform instead of depending on the locale's default encoding.
  """
  with open(name, 'w', encoding=encoding) as file:
    file.write(text)

def replacer(match):
  """Return the full text of ``match`` with every hyphen and en dash removed."""
  matched = match.group(0)
  return matched.translate(str.maketrans("", "", "-–"))

In [None]:
# Write the assembled corpus text to Botany.txt
save_file('Botany.txt', text)

In [None]:
t = text

# Regex-based clean-up of the corpus. Every step works on the result of the previous one.

# In-text citations in round brackets, e.g. "(Smith et al., 2010)". Only bracket groups
# that contain a four-digit year are removed, and `[^()]` keeps a match inside a single
# pair of brackets. The former greedy pattern matched from the first "(" to the last ")"
# of the whole corpus.
text_new = re.sub(r'\([^()]*\b(?:19|20)\d{2}[a-z]?\b[^()]*\)', "", t)

# In-text citations in square brackets, e.g. "[3, 5–7]". This step now continues from
# text_new; it used to restart from `t`, which discarded the step above.
text_new = re.sub(r"\[[0-9 ,– -]+\]", "", text_new)

# Web links. The former character class contained `.-/`, which is a range that excludes
# "-", so links were cut at the first hyphen.
text_new = re.sub(r"(?:https?://|www\.)\S+", "", text_new)

# DOIs, with or without a "/doi/", "article/doi/" or "doi:" prefix
text_new = re.sub(r'(?:(?:article)?/doi/|doi:?\s*)?10\.\d{4,}[^\s"/<>]*/[^\s"<>]+', '', text_new,
                  flags=re.IGNORECASE)

# Removal of proper nouns (the original author's note says this should NOT be done in the
# present project).
# NOTE(review): kept unchanged. The pattern also deletes the character in front of the
# capitalised word — confirm whether this step should run at all.
text_new = re.sub(r'[^. ][А-ЯA-Z]+[а-яa-z]+[а-яa-z]+', "", text_new)

# Line breaks: first re-join words that were hyphenated across a line break
# ("un-\nderstanding"), then turn the remaining breaks into spaces. Deleting the breaks
# outright fused the last word of a line with the first word of the next.
text_new = re.sub(r'(?<=[A-Za-zА-Яа-я])-\n(?=[a-zа-я])', '', text_new)
text_new = re.sub('\n', ' ', text_new)

# Remove hyphens and en dashes inside the matched word groups
text_new = re.sub(r"[^ ]+\w+[- –]+\w+[. ,]+", replacer, text_new)
# Preview only; displaying the whole corpus would flood the cell output
text_new[:1000]