# Предварительная обработка текстовых данных перед их передачей в модели Machine Learning.

## Первый корпус (*BIOcorpus*) базируется на материале 251 научной статьи разного типа (research article, review article), которые были опубликованы в период с 2010 по 2019 гг., из междисциплинарных и специализированных периодических журналов по 7 дисциплинарным направлениям (биохимия, биофизика, биотехнология/биоинженерия, клеточная биология, генетика, гидробиология, молекулярная биология).

* Предобработка медико-биологических текстов требует особого подхода из-за превалирования в них специальной терминологии, часто представленной цепочками число-буквенных последовательностей и/или аббревиатур.

## Для предобработки текстов BIOcorpus была использована модель scispaCy, созданная специально для обработки медико-биологических текстов (малая модель для 100 тыс. словарных ед.). Также был использован EntityLinker — компонент spaCy, который выполняет связывание с базой знаний (в нашем случае с the Unified Medical Language System, содержащей ~3M концептов). Linker выполняет поиск именованных сущностей (от char до 3grams), сравнивая их с концептами в базе знаний с помощью алгоритма поиска ближайших соседей.

https://github.com/allenai/scispacy

Neumann, M., King, D., Beltagy, I., Ammar, W. (2019). ScispaCy: Fast and Robust Models for
Biomedical Natural Language Processing, Proceedings of the 18th BioNLP Workshop and Shared Task (pp. 319–327), Florence: Association for Computational Linguistics.

In [None]:
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en-core-sci-sm
  Building wheel for en-core-sci-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-sci-sm: filename=en_core_sci_sm-0.5.4-py3-none-any.whl size=14778490 sha256=5bfbb75bc5d4f219409ae926b8e2bada3227beac940f8e478ce88dfe02a05877
  Stored in directory: /root/.cache/pip/wheels/0e/d3/6e/e03165bcf8c0fe90c7a41e8a44dd268e4c7779582c5e022707
Successfully built en-core-sci-sm
Installing collected packages: en-core-sci-sm
Successfully installed en-core-sci-sm-0.5.4


In [None]:
! pip install scispacy

Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m865.1 kB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m10.4 MB/s[0m eta [3

In [None]:
def count_words(df):
    """Count word occurrences in a corpus.

    Walks over every row of ``df`` and adds up the number of
    whitespace-separated tokens found in the row's ``'text'`` value.

    Returns the total as an int (0 when ``df`` has no rows).
    """
    return sum(len(row['text'].split()) for _, row in df.iterrows())

In [None]:
import scispacy.linking
import scispacy
import spacy

import pandas as pd
import numpy as np

import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stops = stopwords.words("english")

nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the "en_core_sci_sm" pipeline and append the scispaCy entity linker,
# configured to use the "umls" linker with abbreviation resolution enabled.
nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpe4mbq0ca


100%|██████████| 492M/492M [00:12<00:00, 39.9MiB/s]


Finished download, copying /tmp/tmpe4mbq0ca to cache at /root/.scispacy/datasets/2b79923846fb52e62d686f2db846392575c8eb5b732d9d26cd3ca9378c622d40.87bd52d0f0ee055c1e455ef54ba45149d188552f07991b765da256a1b512ca0b.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmpw_vzn1yn


100%|██████████| 724M/724M [00:33<00:00, 22.7MiB/s]


Finished download, copying /tmp/tmpw_vzn1yn to cache at /root/.scispacy/datasets/7e8e091ec80370b87b1652f461eae9d926e543a403a69c1f0968f71157322c25.6d801a1e14867953e36258b0e19a23723ae84b0abd2a723bdd3574c3e0c873b4.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmp1t8y9nay


100%|██████████| 1.32M/1.32M [00:00<00:00, 3.31MiB/s]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Finished download, copying /tmp/tmp1t8y9nay to cache at /root/.scispacy/datasets/37bc06bb7ce30de7251db5f5cbac788998e33b3984410caed2d0083187e01d38.f0994c1b61cc70d0eb96dea4947dddcb37460fb5ae60975013711228c8fe3fba.tfidf_vectorizer.joblib


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json not found in cache, downloading to /tmp/tmpqciibawn


100%|██████████| 264M/264M [00:07<00:00, 36.8MiB/s]


Finished download, copying /tmp/tmpqciibawn to cache at /root/.scispacy/datasets/6238f505f56aca33290aab44097f67dd1b88880e3be6d6dcce65e56e9255b7d4.d7f77b1629001b40f1b1bc951f3a890ff2d516fb8fbae3111b236b31b33d6dcf.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2023-04-23/umls_2022_ab_cat0129.jsonl not found in cache, downloading to /tmp/tmp_42sg0fq


100%|██████████| 628M/628M [00:55<00:00, 11.9MiB/s]


Finished download, copying /tmp/tmp_42sg0fq to cache at /root/.scispacy/datasets/d5e593bc2d8adeee7754be423cd64f5d331ebf26272074a2575616be55697632.0660f30a60ad00fffd8bbf084a18eb3f462fd192ac5563bf50940fc32a850a3c.umls_2022_ab_cat0129.jsonl
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmpp045ylzg


100%|██████████| 4.26k/4.26k [00:00<00:00, 8.76MiB/s]


Finished download, copying /tmp/tmpp045ylzg to cache at /root/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f8966c4178b892190a302b21836f.330707f4efe774134872b9f77f0e3208c1d30f50800b3b39a6b8ec21d9adf1b7.umls_semantic_type_tree.tsv


<scispacy.linking.EntityLinker at 0x7a86a2846e90>

## При обработке текста с помощью scispaCy стало очевидно, насколько значительную часть академического текста составляют термины: текст сокращался приблизительно в 4 раза, оставляя практически голый скелет из служебных слов и редких ЛЕ литературного языка. Это связано с тем, что многие так называемые общенаучные / академические ЛЕ в научных текстах приобретают особую семантику, а также входят в состав терминологических единиц, являясь "субстратом" для создания новых терминов (В. М. Лейчик). Таким образом, было решено удалять термины, оставляя при этом академические слова, даже входящие в терминологические единицы. В качестве списка академических слов был взят AVL (D. Gardner, M. Davies, 2013), состоящий из 1992 академических ЛЕММ, к которому при помощи WordNet были добавлены словообразовательные гнезда, что позволило увеличить список "оставляемых" слов до 4634 ЛЕ.

In [None]:
# Load the academic word list from AVL.txt as a set of whitespace-separated
# entries.
with open("AVL.txt", 'r', encoding="utf-8") as f:
    # str.split() without an argument splits on any run of whitespace, so
    # blank lines or a trailing newline cannot add an empty-string entry.
    words = set(f.read().split())
len(words)

1992

In [None]:
from nltk.corpus import wordnet as wn

def add_words_derivations(words):
    """Expand a word list with WordNet derivational families.

    For every entry of ``words`` the WordNet lemmas are looked up; each
    lemma's name is collected together with the names of its derivationally
    related forms (e.g. indicate -> indicating, indication).

    Returns a new set that holds only names reported by WordNet: an entry
    for which ``wn.lemmas`` yields nothing contributes nothing to the result.
    """
    expanded = set()  # a set, so repeated forms collapse automatically
    for entry in words:
        for lemma in wn.lemmas(entry):
            expanded.add(lemma.name())
            expanded.update(
                related.name()
                for related in lemma.derivationally_related_forms()
            )
    return expanded

nltk.download('wordnet')
words = add_words_derivations(words)
print(len(words))


[nltk_data] Downloading package wordnet to /root/nltk_data...


4634


In [None]:
def remove_biology_terms(text, remove_stopwords=False, max_term_len=4):
    """Drop the entities recognised by the ``nlp`` pipeline from ``text``.

    The text is whitespace-normalised and run through the module-level
    ``nlp`` pipeline. Every distinct entity string that is not itself a
    member of the module-level ``words`` set is treated as a term. Tokens are
    then scanned left to right and any run of 1..``max_term_len`` tokens
    whose space-joined text equals a term is skipped (shortest match first).

    Args:
        text: the string to process.
        remove_stopwords: when True, tokens whose lower-cased text is in the
            module-level ``stops`` list are dropped as well. Off by default.
        max_term_len: longest term, in space-separated words, that is
            removed; entities with more words are left in place.

    Returns:
        The surviving token texts joined by single spaces.
    """
    text = re.sub(r'\s', ' ', text)
    doc = nlp(text)

    # Bucket the term strings by word count; sets give O(1) membership tests
    # inside the token loop below.
    terms_by_len = {n: set() for n in range(1, max_term_len + 1)}
    for term in {ent.text for ent in doc.ents} - words:
        bucket = terms_by_len.get(len(term.split(' ')))
        if bucket is not None:
            bucket.add(term)

    stop_set = set(stops) if remove_stopwords else frozenset()
    n_tokens = len(doc)
    kept = []
    i = 0
    while i < n_tokens:
        matched = 0
        for n in range(1, max_term_len + 1):
            if i + n > n_tokens:
                break
            candidate = ' '.join(doc[j].text for j in range(i, i + n))
            if candidate in terms_by_len[n]:
                matched = n
                break
        if matched:
            i += matched
            continue

        if doc[i].text.lower() in stop_set:
            i += 1
            continue

        kept.append(doc[i].text)
        i += 1
    return " ".join(kept)

In [None]:
def sent_split(df, col):
    """Split the texts in column ``col`` of ``df`` into sentences.

    Each value of ``df[col]`` is passed to ``nltk.sent_tokenize`` and the
    resulting sentences are concatenated in row order.

    Returns:
        A new DataFrame with a single ``'text'`` column, one sentence per row.

    Raises:
        KeyError: if ``df`` has no column ``col``.
    """
    sentences = [
        sentence
        for text in df[col]
        for sentence in nltk.sent_tokenize(text)
    ]
    # Naming the column up front keeps 'text' present even when no sentences
    # were produced; renaming column 0 of an empty frame would be a no-op.
    return pd.DataFrame({'text': sentences})

In [None]:
def cleantext(df):
    """Add a normalised ``'cleaned_text'`` column derived from ``df['text']``.

    The order of the steps matters:

    1. remove single-capital-letter initials followed by a dot ("J.");
    2. lower-case the text;
    3. remove bracketed numeric citations such as "[1,2]";
    4. re-join words hyphenated across a line break;
    5. remove "/doi/10.NNNN..." fragments up to the end of the line;
    6. remove "https..." fragments up to the end of the line;
    7. replace every character that is not an ASCII letter or digit by a space;
    8. remove digits;
    9. collapse whitespace runs into a single space.

    Steps 5 and 6 must run before step 7: once '/', '.' and ':' have been
    turned into spaces the DOI pattern can no longer match, and ``https.*``
    would swallow the rest of the sentence because the line breaks are gone.

    ``df`` is modified in place (the column is added or overwritten) and the
    same object is returned.
    """
    # \b keeps the last letter of an all-caps word before a full stop
    # ("DNA." must not become "DN"). Capitals are needed here, so this runs
    # before lower-casing.
    cleaned = df['text'].replace(r'\b[A-Z]\.', "", regex=True)
    cleaned = cleaned.str.lower()

    steps = (
        (r'\[[0-9 ,– -]+\]', ""),            # in-text citations in square brackets
        (r"-\n", ""),                        # hyphenation at line breaks
        (r'(/doi/10[.][0-9]{4,}.*)', ''),    # DOI paths
        (r'https.*', ""),                    # URLs
        (r'[^a-zA-Z0-9]', " "),              # everything but ASCII letters/digits
        (r'\d+', ''),                        # digits
        (r'\s{2,}', ' '),                    # runs of whitespace
    )
    for pattern, replacement in steps:
        cleaned = cleaned.replace(pattern, replacement, regex=True)

    df['cleaned_text'] = cleaned
    return df


In [None]:
def clean_long_sent(df, col="cleaned_text", max_len=512):
    """Keep only the rows whose text in ``col`` is at most ``max_len`` long.

    Intended for preparing input for BERT models with a bounded input length.
    Note that the length is measured in characters of the string, not in
    model tokens.

    Args:
        df: frame holding the text column; a ``'len'`` column with the
            character length of each value is added to it in place.
        col: name of the column to measure.
        max_len: largest length (inclusive) that is kept.

    Returns:
        The subset of ``df`` whose ``'len'`` is ``<= max_len``.
    """
    df["len"] = df[col].apply(len)
    return df[df["len"] <= max_len]

# Биологический текст

In [None]:
bio = pd.read_csv("/content/BIOcorpus.csv")
bio

Unnamed: 0.1,Unnamed: 0,text,branch
0,0,bs_bs_banner\n\nINVITED REVIEW SERIES:\nSTEM C...,CELL BIOLOGY 18
1,1,"AIMS Bioengineering, 5(1): 1–38. \n\nDOI: 10.3...",BIOTECHNOLOGY:BIOENGINEERING 18
2,2,"Leuba et al. BMC Biophysics 2014, 7:6\nhttp://...",BIOPHYSICS 28
3,3,Hydrobiologia (2019) 836:35–47\nhttps://doi.or...,HYDROBIOLOGY 11
4,4,"Woods BMC Biophysics 2014, 7:8\nhttp://www.bio...",BIOPHYSICS 29
...,...,...,...
246,246,J. Biochem. 2019;166(1):97–106 doi:10.1093/jb/...,BIOCHEMISTRY 36
247,247,Smith et al. BMC Molecular Biol (2016) 17:22 ...,MOLECULAR BIOLOGY 2
248,248,"Protein Engineering, Design & Selection vol. 2...",BIOTECHNOLOGY:BIOENGINEERING 23
249,249,"Protein Engineering, Design & Selection vol. 2...",BIOTECHNOLOGY:BIOENGINEERING 35


## Немного разбираемся с регулярками

In [None]:
import re
# Find bracketed numeric citation markers (e.g. "[1,2]") in one document.
re.findall(r'\[[0-9 ,– -]+\]', bio["text"].iloc[1])
# [0-9 ,– -] -- the set of characters we look for (digits, space, comma, en dash, hyphen)
# [0-9 ,– -]+ -- one or more characters from that set ("abcd6 9-,1" -> "6 9-,1")
# \[[0-9 ,– -]+\] -- such a run enclosed in square brackets; \ is the escape character

['[1,2]',
 '[3]',
 '[4]',
 '[5]',
 '[6]',
 '[6]',
 '[7]',
 '[1]',
 '[8]',
 '[1]',
 '[6]',
 '[1]',
 '[6]',
 '[9]',
 '[10]',
 '[2]',
 '[11]',
 '[2]',
 '[10]',
 '[4]',
 '[4]',
 '[4]',
 '[12]',
 '[13]',
 '[14]',
 '[3]',
 '[15]',
 '[3]',
 '[16]',
 '[17]',
 '[18]',
 '[19]',
 '[20]',
 '[4]',
 '[21,22]',
 '[21]',
 '[23]',
 '[24]',
 '[24]',
 '[25]',
 '[5]',
 '[26]',
 '[27]',
 '[28]',
 '[29]',
 '[30]',
 '[31]',
 '[32]',
 '[5]',
 '[33]',
 '[34]',
 '[34]',
 '[5,35]',
 '[36,37]',
 '[38,39]',
 '[40,41]',
 '[30]',
 '[42]',
 '[43]',
 '[44]',
 '[45]',
 '[44,46]',
 '[47]',
 '[48]',
 '[49]',
 '[50]',
 '[51]',
 '[47,52]',
 '[47]',
 '[53,54]',
 '[47,55–57]',
 '[58,59]',
 '[60]',
 '[61,62]',
 '[61,62]',
 '[63,64]',
 '[57]',
 '[65]',
 '[66–68]',
 '[69,70]',
 '[71]',
 '[72]',
 '[73]',
 '[72]',
 '[74]',
 '[75,76]',
 '[63,64]',
 '[77]',
 '[78]',
 '[79]',
 '[80]',
 '[81]',
 '[63]',
 '[82]',
 '[82,83]',
 '[83–85]',
 '[85]',
 '[86,87]',
 '[88]',
 '[89]',
 '[88]',
 '[90]',
 '[90]',
 '[79]',
 '[91]',
 '[90]',
 '[90]

In [None]:
# For every document that contains at least one, print the list of
# "https..." matches.
for doc_text in bio['text']:
    url_hits = re.findall(r'https.*', doc_text)
    if url_hits:
        print(url_hits)

# https. -- "https" followed by any single character except a newline
# https.* -- "https" followed by everything up to the end of the line
# https.*$ -- the same, but the match must end at the end of the text (at the end of each line only with re.MULTILINE)


['https://doi.org/10.1007/s10750-019-3939-2 (0123456789().,-volV)(0123456789().,-volV)', 'https://doi.org/10.1007/s10750-019-3939-2) con-']
['https ://creativecommons .org /licenses /by -nc -sa /4 .0 /).', 'https://doi.org/10.1083/jcb.201707168', 'https ://doi .org /10 .1093 /molehr /gap092', 'https ://doi .org /10 .1093 /hmg /ddp483', 'https ://doi .org ', 'https ://doi .org /10 .1042 /BST20110609', 'https ://doi .org /10 .1016 /j .exger .2012 .09 .009', 'https ://doi ', 'https ://doi .org /10 .1126 /science .1065768', 'https ://doi .org /10 .1016 /', 'https ://doi .org /10 .1111 /j .1474 -9726 .2007 .00348 .x', 'https ://doi .org /10 .1016 /j .cell .2006 .01 .039', 'https ://doi ', 'https ://doi .org /10 .1126 /science .1078223', 'https ://doi .org /10 ', 'https ://doi .org /10 .18632 /', 'https ://doi ', 'https ://doi .org ', 'https ', 'https ://doi .org /10 .1016 /S0092 -8674(00)80595 ', 'https ://doi .org /10 .1126 /science .289 .5487 .2122', 'https ://doi .org /10 .1126 /science 

In [None]:
# re.findall(r'.[A-Z]+[a-z]+', bio["text"].iloc[0])
# # [A-Z]+[a-z]+ -- ищем подстроку вида "{какое-то кол-во заглавных букв}{какое-то кол-во строчных букв}"
# # .[A-Z]+[a-z]+ -- ищем подстроку вида "{любой символ кроме начала строки}{какое-то кол-во заглавных букв}{какое-то кол-во строчных букв}"

[' Do',
 ' Health',
 ' Research',
 ' Centre',
 ' Department',
 ' Pharmacology',
 ' Lung',
 ' Health',
 ' Research',
 ' Centre',
 ' University',
 ' Melbourne',
 ' Victoria',
 ' Australia',
 ' In',
 ' We',
 ' The',
 '‘Omnis',
 ' Virchow',
 ' Bertoncello',
 ' Lung',
 ' Health',
 ' Research',
 ' Department',
 ' Pharmacology',
 ' Lung',
 ' Health',
 ' Research',
 ' Level',
 ' Medical',
 ' Building',
 ' The',
 ' University',
 ' Melbourne',
 ' Australia',
 ' Email',
 ' Authors',
 ' Associate',
 ' Professor',
 ' Ivan',
 ' Bertoncello',
 ' Lung',
 ' Regeneration',
 ' Laboratory',
 ' Dr',
 ' Mc',
 ' Lung',
 ' Stem',
 ' Cells',
 ' Microenvi',
 ' Group',
 ' January',
 ' January',
 ' February',
 ' February',
 ' The',
 ' Authors',
 ' Asian',
 ' Paci',
 ' Society',
 ' Respirology',
 ' Rare',
 ' Relatively',
 ' This',
 ' In',
 ' The',
 ' Multipotent',
 ' Bertoncello',
 ' Mc',
 '(Omnis',
 ' The',
 ' The',
 ' Airway',
 ' However',
 ' Accordingly',
 '(Fig',
 ' As',
 ' Day',
 ' In',
 ' Day',
 ' The',
 ' A

In [None]:
# NOTE(review): `[article]*` is a character class (any run of the letters
# a, r, t, i, c, l, e), not the literal word "article"; `(?:article)?` would
# be needed if an optional literal prefix was intended.
re.findall(r'([article]*/doi/10[.][0-9]{4,}[^\s"/<>]*/[^\s"<>]+)', bio["text"].iloc[10])

[]

In [None]:
# For every document that contains at least one, print the list of
# "/doi/10.NNNN..." matches.
for doc_text in bio["text"]:
    doi_hits = re.findall(r'(/doi/10[.][0-9]{4,}.*)', doc_text)
    if doi_hits:
        print(doi_hits)

# [0-9]{4,} -- a run of at least four digits with no upper bound; [0-9]{4,6} would cap the run at six digits

['/doi/10.1242/dev.02732.']
['/doi/10.1242/bio.038232.supplemental']
['/doi/10.1046/j.0305-1846.2003.00525.x/abstract.']


## Продолжаем работу над текстом

In [None]:
bio['text'] = bio['text'].astype(str)

In [None]:
bio_sents = sent_split(bio, "text")
bio_sents

Unnamed: 0,text
0,bs_bs_banner\n\nINVITED REVIEW SERIES:\nSTEM C...
1,IVAN BERTONCELLO AND JONATHAN L. MCQUALTER\n\n...
2,"In this\nreview, we describe the attributes of..."
3,We describe the power and\nlimitations of expe...
4,The review summarizes\nrecent progress and obs...
...,...
121159,"Sheth RD, Jin M, Bhut BV, et al."
121160,(2014) Affinity precipitation of a monoclonal ...
121161,Biotechnol Bioeng \n111: 1595–1603.
121162,"© 2015 Rachel Chen, et al., licensee AIMS Press."


In [None]:
clean_bio = cleantext(bio_sents)
clean_bio

Unnamed: 0,text,cleaned_text
0,bs_bs_banner\n\nINVITED REVIEW SERIES:\nSTEM C...,bs bs banner invited review series stem cells ...
1,IVAN BERTONCELLO AND JONATHAN L. MCQUALTER\n\n...,ivan bertoncello and jonathan mcqualter lung h...
2,"In this\nreview, we describe the attributes of...",in this review we describe the attributes of a...
3,We describe the power and\nlimitations of expe...,we describe the power and limitations of exper...
4,The review summarizes\nrecent progress and obs...,the review summarizes recent progress and obst...
...,...,...
121159,"Sheth RD, Jin M, Bhut BV, et al.",sheth rd jin m bhut bv et al
121160,(2014) Affinity precipitation of a monoclonal ...,affinity precipitation of a monoclonal antibo...
121161,Biotechnol Bioeng \n111: 1595–1603.,biotechnol bioeng
121162,"© 2015 Rachel Chen, et al., licensee AIMS Press.",rachel chen et al licensee aims press


In [None]:
clean_bio.cleaned_text.iloc[1] #просто привели предложения в нормальный вид и выкинули пунктуацию + "непонятные" символы

'ivan bertoncello and jonathan mcqualter lung health research centre department of pharmacology lung health research centre university of melbourne melbourne victoria australia abstract recognition of the potential of stem cell based therapies for alleviating intractable lung diseases has provided the impetus for research aimed at identifying regenerative cells in the adult lung understanding how they are organized and regulated and how they could be harnessed in lung regenerative medicine '

In [None]:
remove_biology_terms(clean_bio.iloc[1].cleaned_text) # удаляем биологические термины,
# оставляем при этом академическую лексику - слова литературного языка, которые в научном контексте не только приобретают иную семантику,
# но также являются "субстратом для создания новых терминов" (Лейчик)
# НЕ удаляем / удаляем стоп-слова

'jonathan health research centre department health research centre university victoria australia potential based alleviating intractable diseases provided impetus research aimed identifying adult understanding could regenerative medicine'

In [None]:
clean_bio["removed_terms_text"] = clean_bio['cleaned_text'].apply(remove_biology_terms)
clean_bio

Unnamed: 0,text,cleaned_text,removed_terms_text
0,bs_bs_banner\n\nINVITED REVIEW SERIES:\nSTEM C...,bs bs banner invited review series stem cells ...,bs bs banner invited review series stem cells ...
1,IVAN BERTONCELLO AND JONATHAN L. MCQUALTER\n\n...,ivan bertoncello and jonathan mcqualter lung h...,and jonathan health research centre department...
2,"In this\nreview, we describe the attributes of...",in this review we describe the attributes of a...,in this review we describe the of adult stem a...
3,We describe the power and\nlimitations of expe...,we describe the power and limitations of exper...,we describe the power and of strategies and us...
4,The review summarizes\nrecent progress and obs...,the review summarizes recent progress and obst...,the review summarizes recent progress and in d...
...,...,...,...
121159,"Sheth RD, Jin M, Bhut BV, et al.",sheth rd jin m bhut bv et al,rd bv et al
121160,(2014) Affinity precipitation of a monoclonal ...,affinity precipitation of a monoclonal antibo...,affinity precipitation of a from an harvest ...
121161,Biotechnol Bioeng \n111: 1595–1603.,biotechnol bioeng,
121162,"© 2015 Rachel Chen, et al., licensee AIMS Press.",rachel chen et al licensee aims press,chen et al aims


In [None]:
prep_bio = clean_long_sent(clean_bio)
prep_bio

Unnamed: 0,text,cleaned_text,removed_terms_text,len
0,bs_bs_banner\n\nINVITED REVIEW SERIES:\nSTEM C...,bs bs banner invited review series stem cells ...,bs bs banner invited review series stem cells ...,140
1,IVAN BERTONCELLO AND JONATHAN L. MCQUALTER\n\n...,ivan bertoncello and jonathan mcqualter lung h...,and jonathan health research centre department...,494
2,"In this\nreview, we describe the attributes of...",in this review we describe the attributes of a...,in this review we describe the of adult stem a...,191
3,We describe the power and\nlimitations of expe...,we describe the power and limitations of exper...,we describe the power and of strategies and us...,214
4,The review summarizes\nrecent progress and obs...,the review summarizes recent progress and obst...,the review summarizes recent progress and in d...,161
...,...,...,...,...
121159,"Sheth RD, Jin M, Bhut BV, et al.",sheth rd jin m bhut bv et al,rd bv et al,29
121160,(2014) Affinity precipitation of a monoclonal ...,affinity precipitation of a monoclonal antibo...,affinity precipitation of a from an harvest ...,131
121161,Biotechnol Bioeng \n111: 1595–1603.,biotechnol bioeng,,18
121162,"© 2015 Rachel Chen, et al., licensee AIMS Press.",rachel chen et al licensee aims press,chen et al aims,39


In [None]:
prep_bio.to_csv('prep_SW_BIO.csv')

# KJV BIBLE
King James Bible, 1611, канонический перевод Библии, обычно называемый Авторизованной версией (the Authorized Version).

Отказались от использования этой версии перевода Библии (несмотря на все еще лидирующую популярность King James Version), так как она содержит большое количество архаизмов, встретить которые в современных научно-популярных текстах мы не ожидаем.

In [None]:
import pandas as pd
import numpy as np

import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stops = stopwords.words("english")

nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
bible = pd.read_csv("/content/KJV.csv")
bible

Unnamed: 0.1,Unnamed: 0,citation,book,chapter,verse,text,book_code,testament,testament_code
0,0,Genesis 1:1,Genesis,1,1,In the beginning God created the heaven and th...,1,Old Testament,0
1,1,Genesis 1:2,Genesis,1,2,"And the earth was without form, and void; and ...",1,Old Testament,0
2,2,Genesis 1:3,Genesis,1,3,"And God said, Let there be light: and there wa...",1,Old Testament,0
3,3,Genesis 1:4,Genesis,1,4,"And God saw the light, that it was good: and G...",1,Old Testament,0
4,4,Genesis 1:5,Genesis,1,5,"And God called the light Day, and the darkness...",1,Old Testament,0
...,...,...,...,...,...,...,...,...,...
31097,31097,Revelation 22:17,Revelation,22,17,"And the Spirit and the bride say, Come. And le...",66,New Testament,1
31098,31098,Revelation 22:18,Revelation,22,18,For I testify unto every man that heareth the ...,66,New Testament,1
31099,31099,Revelation 22:19,Revelation,22,19,And if any man shall take away from the words ...,66,New Testament,1
31100,31100,Revelation 22:20,Revelation,22,20,"He which testifieth these things saith, Surely...",66,New Testament,1


### Включаем в "библейский" корпус только New Testament. В самом начале исследования была идея использовать только NT, однако в процессе мы от этого отказались, включив в "библейский" корпус оба Завета. Кроме того, решили использовать American Standard Version.

In [None]:
bible = bible[bible.testament_code == 1]
bible.head()

Unnamed: 0.1,Unnamed: 0,citation,book,chapter,verse,text,book_code,testament,testament_code
23145,23145,Matthew 1:1,Matthew,1,1,"The book of the generation of Jesus Christ, th...",40,New Testament,1
23146,23146,Matthew 1:2,Matthew,1,2,Abraham begat Isaac; and Isaac begat Jacob; an...,40,New Testament,1
23147,23147,Matthew 1:3,Matthew,1,3,And Judas begat Phares and Zara of Thamar; and...,40,New Testament,1
23148,23148,Matthew 1:4,Matthew,1,4,And Aram begat Aminadab; and Aminadab begat Na...,40,New Testament,1
23149,23149,Matthew 1:5,Matthew,1,5,And Salmon begat Booz of Rachab; and Booz bega...,40,New Testament,1


In [None]:
def sent_split(df, col="text"):
    """Split the texts in ``df[col]`` into sentences (nothing else is done).

    Each value of the column is passed to ``nltk.sent_tokenize`` and the
    resulting sentences are concatenated in row order.

    Args:
        df: frame holding the texts.
        col: name of the text column; defaults to ``'text'``.

    Returns:
        A new DataFrame with a single ``'text'`` column, one sentence per row.

    Raises:
        KeyError: if ``df`` has no column ``col``.
    """
    sentences = [
        sentence
        for text in df[col]
        for sentence in nltk.sent_tokenize(text)
    ]
    # Naming the column up front keeps 'text' present even when no sentences
    # were produced; renaming column 0 of an empty frame would be a no-op.
    return pd.DataFrame({'text': sentences})

In [None]:
bible_sents = sent_split(bible)

In [None]:
def clean_bible_text(df):
    """Add a normalised ``'cleaned_text'`` column derived from ``df['text']``.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, and digit runs are then removed. Whitespace
    is not collapsed.

    ``df`` is modified in place and the same object is returned.
    """
    lowered = df['text'].str.lower()
    alnum_only = lowered.replace(r'[^a-zA-Z0-9]', " ", regex=True)
    df['cleaned_text'] = alnum_only.replace(r'\d+', '', regex=True)
    return df

In [None]:
prep_bible = clean_bible_text(bible_sents)

In [None]:
prep_bible

Unnamed: 0,text,cleaned_text
0,"The book of the generation of Jesus Christ, th...",the book of the generation of jesus christ th...
1,Abraham begat Isaac; and Isaac begat Jacob; an...,abraham begat isaac and isaac begat jacob an...
2,And Judas begat Phares and Zara of Thamar; and...,and judas begat phares and zara of thamar and...
3,And Aram begat Aminadab; and Aminadab begat Na...,and aram begat aminadab and aminadab begat na...
4,And Salmon begat Booz of Rachab; and Booz bega...,and salmon begat booz of rachab and booz bega...
...,...,...
9025,"He which testifieth these things saith, Surely...",he which testifieth these things saith surely...
9026,Amen.,amen
9027,"Even so, come, Lord Jesus.",even so come lord jesus
9028,The grace of our Lord Jesus Christ be with you...,the grace of our lord jesus christ be with you...


In [None]:
import string

def stopwords_remover(words):
    """Drop stop words from a space-separated string.

    ``words`` is split on single spaces. A piece is discarded when it is in
    the module-level ``stops`` list or is a substring of
    ``string.punctuation`` (which also discards the empty pieces produced by
    consecutive spaces).

    Returns the surviving pieces, each followed by one space, so a non-empty
    result always ends with a trailing space.
    """
    kept = [
        token
        for token in words.split(" ")
        if not (token in stops or token in string.punctuation)
    ]
    return "".join(token + " " for token in kept)

In [None]:
prep_bible['prep_text'] = prep_bible['cleaned_text'].apply(stopwords_remover)


In [None]:
prep_bible

Unnamed: 0,text,cleaned_text,prep_text
0,"The book of the generation of Jesus Christ, th...",the book of the generation of jesus christ th...,book generation jesus christ son david son abr...
1,Abraham begat Isaac; and Isaac begat Jacob; an...,abraham begat isaac and isaac begat jacob an...,abraham begat isaac isaac begat jacob jacob be...
2,And Judas begat Phares and Zara of Thamar; and...,and judas begat phares and zara of thamar and...,judas begat phares zara thamar phares begat es...
3,And Aram begat Aminadab; and Aminadab begat Na...,and aram begat aminadab and aminadab begat na...,aram begat aminadab aminadab begat naasson naa...
4,And Salmon begat Booz of Rachab; and Booz bega...,and salmon begat booz of rachab and booz bega...,salmon begat booz rachab booz begat obed ruth ...
...,...,...,...
9025,"He which testifieth these things saith, Surely...",he which testifieth these things saith surely...,testifieth things saith surely come quickly
9026,Amen.,amen,amen
9027,"Even so, come, Lord Jesus.",even so come lord jesus,even come lord jesus
9028,The grace of our Lord Jesus Christ be with you...,the grace of our lord jesus christ be with you...,grace lord jesus christ


In [None]:
def clean_long_sent(df, col="prep_text", max_len=512):
    """Keep only the rows whose text in ``col`` is at most ``max_len`` long.

    The length is measured in characters of the string.

    Args:
        df: frame holding the text column; a ``'len'`` column with the
            character length of each value is added to it in place.
        col: name of the column to measure.
        max_len: largest length (inclusive) that is kept.

    Returns:
        The subset of ``df`` whose ``'len'`` is ``<= max_len``.
    """
    df["len"] = df[col].apply(len)
    return df[df["len"] <= max_len]

In [None]:
prep_bible = clean_long_sent(prep_bible)
prep_bible

Unnamed: 0,text,cleaned_text,prep_text,len
0,"The book of the generation of Jesus Christ, th...",the book of the generation of jesus christ th...,book generation jesus christ son david son abr...,51
1,Abraham begat Isaac; and Isaac begat Jacob; an...,abraham begat isaac and isaac begat jacob an...,abraham begat isaac isaac begat jacob jacob be...,65
2,And Judas begat Phares and Zara of Thamar; and...,and judas begat phares and zara of thamar and...,judas begat phares zara thamar phares begat es...,67
3,And Aram begat Aminadab; and Aminadab begat Na...,and aram begat aminadab and aminadab begat na...,aram begat aminadab aminadab begat naasson naa...,64
4,And Salmon begat Booz of Rachab; and Booz bega...,and salmon begat booz of rachab and booz bega...,salmon begat booz rachab booz begat obed ruth ...,63
...,...,...,...,...
9025,"He which testifieth these things saith, Surely...",he which testifieth these things saith surely...,testifieth things saith surely come quickly,44
9026,Amen.,amen,amen,5
9027,"Even so, come, Lord Jesus.",even so come lord jesus,even come lord jesus,21
9028,The grace of our Lord Jesus Christ be with you...,the grace of our lord jesus christ be with you...,grace lord jesus christ,24


In [None]:
# NOTE(review): the filtered frame is only displayed here; it is not assigned,
# so `prep_bible` itself still contains the rows whose prep_text is 'amen '.
prep_bible[prep_bible.prep_text != 'amen ']

Unnamed: 0,text,cleaned_text,prep_text,len
0,"The book of the generation of Jesus Christ, th...",the book of the generation of jesus christ th...,book generation jesus christ son david son abr...,51
1,Abraham begat Isaac; and Isaac begat Jacob; an...,abraham begat isaac and isaac begat jacob an...,abraham begat isaac isaac begat jacob jacob be...,65
2,And Judas begat Phares and Zara of Thamar; and...,and judas begat phares and zara of thamar and...,judas begat phares zara thamar phares begat es...,67
3,And Aram begat Aminadab; and Aminadab begat Na...,and aram begat aminadab and aminadab begat na...,aram begat aminadab aminadab begat naasson naa...,64
4,And Salmon begat Booz of Rachab; and Booz bega...,and salmon begat booz of rachab and booz bega...,salmon begat booz rachab booz begat obed ruth ...,63
...,...,...,...,...
9023,For I testify unto every man that heareth the ...,for i testify unto every man that heareth the ...,testify unto every man heareth words prophecy ...,117
9024,And if any man shall take away from the words ...,and if any man shall take away from the words ...,man shall take away words book prophecy god sh...,105
9025,"He which testifieth these things saith, Surely...",he which testifieth these things saith surely...,testifieth things saith surely come quickly,44
9027,"Even so, come, Lord Jesus.",even so come lord jesus,even come lord jesus,21


In [None]:
prep_bible.to_csv('prep_KJV.csv')

# American Standard Version (asb - в нашем исследовании)
* Пересмотр the Authorised Version (King James Bible) произошел благодаря сделанным с XVII в. многочисленным открытиям в области древних манускриптов. Основные принципы нового перевода состояли в следующем: как можно меньше изменять текст the Authorised Version, максимально ограничить воздействие таких изменений на язык the Authorised Version и более ранних версий; адаптировать текст к современному (конец XIX – начало XX вв.) состоянию языка. Обновленная редакция вышла в Англии в 1881 г. (Новый Завет) и 1885 г. (Ветхий Завет). В Америке этот же перевод появился в 1901 году под названием Американская стандартная версия (American Standard Version).

Форостенко, А В. Топология современных переводов Библии на английский язык. Дисс. ... канд. филол. наук. М., 2003.

In [1]:
import pandas as pd
import numpy as np

import re
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and keep the English list in `stops`.
nltk.download("stopwords")
stops = stopwords.words("english")

# Fetch the Punkt tokenizer data (presumably what nltk.sent_tokenize, used in
# sent_split, relies on — confirm for the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load the verse table from an absolute path and display it.
# NOTE(review): '/content/...' looks like a Colab-specific location — confirm
# before running elsewhere.
asb = pd.read_csv('/content/asb.csv')
asb

Unnamed: 0,Verse ID,Book Name,Book Number,Chapter,Verse,Text
0,1,Genesis,1,1,1,In the beginning God created the heavens and t...
1,2,Genesis,1,1,2,And the earth was waste and void; and darkness...
2,3,Genesis,1,1,3,"And God said, Let there be light: and there wa..."
3,4,Genesis,1,1,4,"And God saw the light, that it was good: and G..."
4,5,Genesis,1,1,5,"And God called the light Day, and the darkness..."
...,...,...,...,...,...,...
31097,31098,Revelation,66,22,17,"And the Spirit and the bride say, Come. And he..."
31098,31099,Revelation,66,22,18,I testify unto every man that heareth the word...
31099,31100,Revelation,66,22,19,and if any man shall take away from the words ...
31100,31101,Revelation,66,22,20,"He who testifieth these things saith, Yea: I c..."


In [6]:
asb.iloc[29]['Text']

'and to every beast of the earth, and to every bird of the heavens, and to everything that creepeth upon the earth, wherein there is life, [I have given] every green herb for food: and it was so.'

In [8]:
def sent_split(df, col):
    """Split every text in ``df[col]`` into sentences, one sentence per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the raw texts.
    col : str
        Name of the column whose values are split with ``nltk.sent_tokenize``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a single ``'Text'`` column and a fresh 0..n-1 index.
        Sentences keep the order of the source rows. The ``'Text'`` column is
        present even when no sentence was produced.
    """
    # Missing values are skipped: they carry no text to tokenize.
    texts = df[col].dropna()
    sentences = [
        sentence
        for text in texts
        for sentence in nltk.sent_tokenize(text)
    ]
    # Naming the column up front (instead of renaming column 0 afterwards)
    # keeps the 'Text' column when the list of sentences is empty.
    return pd.DataFrame({'Text': sentences})

In [10]:
# Cast every verse to str before it is handed to sent_split
# (presumably to guard against non-string cells — confirm against the CSV).
asb['Text'] = asb['Text'].astype(str)

In [11]:
# Re-segment the verses into individual sentences (one row per sentence).
asb_sents = sent_split(asb, "Text")

In [12]:
def clean_bible_text(df):
    """Add a ``'cleaned_text'`` column derived from ``df['Text']``.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a single space, and digit runs are then removed.
    Whitespace is not collapsed or trimmed.

    The column is written into ``df`` itself, and that same frame is returned.
    """
    lowered = df['Text'].str.lower()

    # Replace punctuation and other symbols with spaces.
    symbols_blanked = lowered.replace(r'[^a-zA-Z0-9]', " ", regex=True)

    # Drop digit runs.
    df['cleaned_text'] = symbols_blanked.replace(r'\d+', '', regex=True)
    return df

In [13]:
# Adds the 'cleaned_text' column; prep_bible is the same object as asb_sents
# because clean_bible_text returns the frame it was given.
prep_bible = clean_bible_text(asb_sents)

In [14]:
prep_bible

Unnamed: 0,Text,cleaned_text
0,In the beginning God created the heavens and t...,in the beginning god created the heavens and t...
1,And the earth was waste and void; and darkness...,and the earth was waste and void and darkness...
2,"And God said, Let there be light: and there wa...",and god said let there be light and there wa...
3,"And God saw the light, that it was good: and G...",and god saw the light that it was good and g...
4,"And God called the light Day, and the darkness...",and god called the light day and the darkness...
...,...,...
35345,and if any man shall take away from the words ...,and if any man shall take away from the words ...
35346,"He who testifieth these things saith, Yea: I c...",he who testifieth these things saith yea i c...
35347,"Amen: come, Lord Jesus.",amen come lord jesus
35348,The grace of the Lord Jesus be with the saints.,the grace of the lord jesus be with the saints


In [None]:
# We decided not to remove stop words; the helper is kept only as a disabled
# string literal.
"""
import string

def stopwords_remover(words):

  without_stopwords = ""
  for word in words.split(" "):
    if word in stops or word in string.punctuation:
      continue
    else:
      without_stopwords += word + " "
  return without_stopwords
  """

In [None]:
# prep_bible['prep_text'] = prep_bible['cleaned_text'].apply(stopwords_remover)


In [15]:
# Stop words are kept, so 'prep_text' is just the cleaned text; the column is
# still needed because clean_long_sent measures 'prep_text'.
prep_bible['prep_text'] = prep_bible['cleaned_text']

In [16]:
prep_bible

Unnamed: 0,Text,cleaned_text,prep_text
0,In the beginning God created the heavens and t...,in the beginning god created the heavens and t...,in the beginning god created the heavens and t...
1,And the earth was waste and void; and darkness...,and the earth was waste and void and darkness...,and the earth was waste and void and darkness...
2,"And God said, Let there be light: and there wa...",and god said let there be light and there wa...,and god said let there be light and there wa...
3,"And God saw the light, that it was good: and G...",and god saw the light that it was good and g...,and god saw the light that it was good and g...
4,"And God called the light Day, and the darkness...",and god called the light day and the darkness...,and god called the light day and the darkness...
...,...,...,...
35345,and if any man shall take away from the words ...,and if any man shall take away from the words ...,and if any man shall take away from the words ...
35346,"He who testifieth these things saith, Yea: I c...",he who testifieth these things saith yea i c...,he who testifieth these things saith yea i c...
35347,"Amen: come, Lord Jesus.",amen come lord jesus,amen come lord jesus
35348,The grace of the Lord Jesus be with the saints.,the grace of the lord jesus be with the saints,the grace of the lord jesus be with the saints


In [17]:
def clean_long_sent(df, col="prep_text", max_len=512):
    """Drop rows whose text is longer than ``max_len`` characters.

    The character count of ``df[col]`` is stored in a ``'len'`` column that is
    added to ``df`` itself (the input frame is modified in place). The rows
    that fit the limit are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the text column.
    col : str, default "prep_text"
        Name of the column whose length is measured.
    max_len : int, default 512
        Largest length (inclusive) a row may have to be kept.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows with ``len <= max_len``. The original
        index labels are preserved.
    """
    df["len"] = df[col].apply(len)
    # .copy() hands the caller an independent frame, so later column
    # assignments on the result are not treated as writes to a slice of df.
    return df[df["len"] <= max_len].copy()

In [18]:
# Keep only sentences of at most 512 characters (the limit set in
# clean_long_sent) and display the result.
prep_bible = clean_long_sent(prep_bible)
prep_bible

Unnamed: 0,Text,cleaned_text,prep_text,len
0,In the beginning God created the heavens and t...,in the beginning god created the heavens and t...,in the beginning god created the heavens and t...,55
1,And the earth was waste and void; and darkness...,and the earth was waste and void and darkness...,and the earth was waste and void and darkness...,134
2,"And God said, Let there be light: and there wa...",and god said let there be light and there wa...,and god said let there be light and there wa...,54
3,"And God saw the light, that it was good: and G...",and god saw the light that it was good and g...,and god saw the light that it was good and g...,85
4,"And God called the light Day, and the darkness...",and god called the light day and the darkness...,and god called the light day and the darkness...,63
...,...,...,...,...
35345,and if any man shall take away from the words ...,and if any man shall take away from the words ...,and if any man shall take away from the words ...,185
35346,"He who testifieth these things saith, Yea: I c...",he who testifieth these things saith yea i c...,he who testifieth these things saith yea i c...,58
35347,"Amen: come, Lord Jesus.",amen come lord jesus,amen come lord jesus,23
35348,The grace of the Lord Jesus be with the saints.,the grace of the lord jesus be with the saints,the grace of the lord jesus be with the saints,47


In [None]:
# NOTE(review): the filtered frame is only displayed, not assigned, so any
# rows whose prep_text is exactly 'amen ' stay in prep_bible and are saved.
prep_bible[prep_bible.prep_text != 'amen ']

Unnamed: 0,Text,cleaned_text,prep_text,len
0,"The book of the genealogy of Jesus Christ, the...",the book of the genealogy of jesus christ the...,book genealogy jesus christ son david son abra...,50
1,Abraham became the father of Isaac.,abraham became the father of isaac,abraham became father isaac,28
2,Isaac became the father of Jacob.,isaac became the father of jacob,isaac became father jacob,26
3,Jacob became the father of Judah and his broth...,jacob became the father of judah and his broth...,jacob became father judah brothers,35
4,Judah became the father of Perez and Zerah by ...,judah became the father of perez and zerah by ...,judah became father perez zerah tamar,38
...,...,...,...,...
10590,I testify to everyone who hears the words of t...,i testify to everyone who hears the words of t...,testify everyone hears words prophecy book any...,88
10591,If anyone takes away from the words of the boo...,if anyone takes away from the words of the boo...,anyone takes away words book prophecy may god ...,94
10592,"He who testifies these things says, ""Yes, I co...",he who testifies these things says yes i co...,testifies things says yes come quickly,39
10594,"Yes, come, Lord Jesus.",yes come lord jesus,yes come lord jesus,20


In [19]:
# NOTE(review): to_csv also writes the index as an unnamed first column by
# default — confirm that downstream readers expect it.
prep_bible.to_csv('prep_asb_SW.csv') # American Standard Version (asb), stop words kept

# Корпус статей из журнала "The Economist"

In [20]:
# Load the Economist article corpus and display it.
# NOTE(review): the 'Unnamed: 0*' columns in the output look like indexes
# written by earlier to_csv calls — confirm and consider dropping them.
df = pd.read_csv("/content/bio_Economist.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,title,date,link
0,0,0,FOR THE past four billion years or so the only...,The promise and perils of synthetic biology,Apr 4th 2019,https://www.economist.com/leaders/2019/04/04/t...
1,1,1,IN A former leatherworks just off Euston Road ...,Will artificial intelligence help to crack bio...,Jan 7th 2017,https://www.economist.com/science-and-technolo...
2,2,2,“How many cells are there in a human being?” I...,The idea of “holobionts” represents a paradigm...,Jun 14th 2023,https://www.economist.com/science-and-technolo...
3,3,3,LIVING creatures are jolly useful. Farmers rea...,The remarkable promise of cell-free biology,May 4th 2017,https://www.economist.com/leaders/2017/05/04/t...
4,4,4,"A broken brain, hidden inside a skull, is hard...",Better brain biology will deliver better medic...,Sep 21st 2022,https://www.economist.com/technology-quarterly...
...,...,...,...,...,...,...
957,957,957,Science can be a little scary. Its potential t...,Towards immortality,Nov 16th 2006,https://www.economist.com/news/2006/11/16/towa...
958,958,958,IT MAY challenge the spirit to think of caulif...,The king of cauliflowers,Feb 12th 2015,https://www.economist.com/middle-east-and-afri...
959,959,959,"MAJOR DEPRESSION is a serious illness, but als...",A blood test may help the diagnosis and treatm...,May 6th 2021,https://www.economist.com/science-and-technolo...
960,960,960,Holt StudiosA beautiful crop of plasticAt the ...,Meet industrial biotech,Nov 20th 2003,https://www.economist.com/news/2003/11/20/meet...


In [21]:
# Split every article body into sentences (one row per sentence).
df_sents = sent_split(df, 'text')

In [22]:
df_sents

Unnamed: 0,Text
0,FOR THE past four billion years or so the only...
1,Sometimes the gene would be damaged or scrambl...
2,From that raw material arose the glories of na...
3,"But beneath it all, gene begat gene.That is no..."
4,Now genes can be written from scratch and edit...
...,...
35972,"He found that, after 15 days, parasite loads i..."
35973,Dr Kumar plans to try—though he will use genet...
35974,"But many, probably most, doctors are suspiciou..."
35975,"Dr Kumar’s findings are, nevertheless, interes..."


In [23]:
def cleantext(df):
    """Add a normalised ``'cleaned_text'`` column derived from ``df['Text']``.

    Steps, in order:

    1. remove single-letter initials followed by a period ("J. Smith");
    2. lower-case the text;
    3. remove bracketed numeric citations such as ``[12, 13]``;
    4. re-join words hyphenated across a line break;
    5. remove URLs and ``/doi/10.xxxx...`` fragments;
    6. replace every character that is not an ASCII letter or digit by a space;
    7. remove digit runs;
    8. collapse runs of whitespace into one space (a single leading or
       trailing space is kept).

    The column is written into ``df`` itself, and that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Text'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The input frame, now carrying ``'cleaned_text'``.
    """
    # The word boundary restricts the match to stand-alone initials, so a
    # sentence-final acronym such as "DNA." keeps its last letter.
    text = df['Text'].replace(r'\b[A-Z]\.', "", regex=True)
    text = text.str.lower()

    substitutions = [
        # In-text citations in square brackets.
        (r'\[[0-9 ,– -]+\]', ""),
        # Hyphen at a line break.
        (r"-\n", ""),
        # URLs and DOI paths must be removed while their punctuation is still
        # present; once symbols are blanked these patterns can no longer
        # delimit the link and would either never match or swallow the rest
        # of the sentence.
        (r'https?://\S+', ""),
        (r'/doi/10[.][0-9]{4,}\S*', ''),
        # Everything except ASCII letters and digits becomes a space.
        (r'[^a-zA-Z0-9]', " "),
        # Digits.
        (r'\d+', ''),
        # Two or more consecutive whitespace characters.
        (r'\s{2,}', ' '),
    ]
    for pattern, replacement in substitutions:
        text = text.replace(pattern, replacement, regex=True)

    df['cleaned_text'] = text
    return df


In [24]:
# Adds the 'cleaned_text' column; cleaned_df is the same object as df_sents
# because cleantext returns the frame it was given.
cleaned_df = cleantext(df_sents)

In [26]:
cleaned_df

Unnamed: 0,Text,cleaned_text
0,FOR THE past four billion years or so the only...,for the past four billion years or so the only...
1,Sometimes the gene would be damaged or scrambl...,sometimes the gene would be damaged or scrambl...
2,From that raw material arose the glories of na...,from that raw material arose the glories of na...
3,"But beneath it all, gene begat gene.That is no...",but beneath it all gene begat gene that is no ...
4,Now genes can be written from scratch and edit...,now genes can be written from scratch and edit...
...,...,...
35972,"He found that, after 15 days, parasite loads i...",he found that after days parasite loads in ani...
35973,Dr Kumar plans to try—though he will use genet...,dr kumar plans to try though he will use genet...
35974,"But many, probably most, doctors are suspiciou...",but many probably most doctors are suspicious ...
35975,"Dr Kumar’s findings are, nevertheless, interes...",dr kumar s findings are nevertheless interesting


In [27]:
def clean_long_sent(df, col="cleaned_text", max_len=512):
    """Drop rows whose text is longer than ``max_len`` characters.

    The character count of ``df[col]`` is stored in a ``'len'`` column that is
    added to ``df`` itself (the input frame is modified in place). The rows
    that fit the limit are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the text column.
    col : str, default "cleaned_text"
        Name of the column whose length is measured.
    max_len : int, default 512
        Largest length (inclusive) a row may have to be kept.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows with ``len <= max_len``. The original
        index labels are preserved.
    """
    df["len"] = df[col].apply(len)
    # .copy() hands the caller an independent frame, so later column
    # assignments on the result are not treated as writes to a slice of df.
    return df[df["len"] <= max_len].copy()

# Keep only sentences whose cleaned text is at most 512 characters long.
cleaned_df = clean_long_sent(cleaned_df)

In [28]:
cleaned_df

Unnamed: 0,Text,cleaned_text,len
0,FOR THE past four billion years or so the only...,for the past four billion years or so the only...,154
1,Sometimes the gene would be damaged or scrambl...,sometimes the gene would be damaged or scrambl...,96
2,From that raw material arose the glories of na...,from that raw material arose the glories of na...,62
3,"But beneath it all, gene begat gene.That is no...",but beneath it all gene begat gene that is no ...,58
4,Now genes can be written from scratch and edit...,now genes can be written from scratch and edit...,90
...,...,...,...
35972,"He found that, after 15 days, parasite loads i...",he found that after days parasite loads in ani...,269
35973,Dr Kumar plans to try—though he will use genet...,dr kumar plans to try though he will use genet...,137
35974,"But many, probably most, doctors are suspiciou...",but many probably most doctors are suspicious ...,165
35975,"Dr Kumar’s findings are, nevertheless, interes...",dr kumar s findings are nevertheless interesting,49


In [None]:
"""
import string

def stopwords_remover(words):

  without_stopwords = ""
  for word in words.split(" "):
    if word in stops or word in string.punctuation:
      continue
    else:
      without_stopwords += word + " "
  return without_stopwords
"""

In [29]:
# Cast the cleaned sentences to str; .loc is used for the assignment because
# cleaned_df was produced by filtering another frame.
cleaned_df.loc[:, 'cleaned_text'] = cleaned_df['cleaned_text'].astype(str)

In [None]:
#cleaned_df.loc[:, 'cleaned_text'] = cleaned_df['cleaned_text'].apply(stopwords_remover)


In [30]:
cleaned_df

Unnamed: 0,Text,cleaned_text,len
0,FOR THE past four billion years or so the only...,for the past four billion years or so the only...,154
1,Sometimes the gene would be damaged or scrambl...,sometimes the gene would be damaged or scrambl...,96
2,From that raw material arose the glories of na...,from that raw material arose the glories of na...,62
3,"But beneath it all, gene begat gene.That is no...",but beneath it all gene begat gene that is no ...,58
4,Now genes can be written from scratch and edit...,now genes can be written from scratch and edit...,90
...,...,...,...
35972,"He found that, after 15 days, parasite loads i...",he found that after days parasite loads in ani...,269
35973,Dr Kumar plans to try—though he will use genet...,dr kumar plans to try though he will use genet...,137
35974,"But many, probably most, doctors are suspiciou...",but many probably most doctors are suspicious ...,165
35975,"Dr Kumar’s findings are, nevertheless, interes...",dr kumar s findings are nevertheless interesting,49


In [31]:
# Save the cleaned Economist sentences (stop words kept).
# NOTE(review): to_csv also writes the index as an unnamed first column by
# default — confirm that downstream readers expect it.
cleaned_df.to_csv('prep_bio_Economist_SW.csv')