<a href="https://colab.research.google.com/github/Vladislav-GitHub/DL-and-NLP-ITMO-course/blob/hw_1/NLP_1_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Загрузка данных

In [None]:
!pip install corus
!pip install razdel
!pip install contractions
!pip install num2words
!python3 -m spacy download ru_core_news_sm
!pip install natasha
!pip install pymystem3
!wget https://dumps.wikimedia.org/ruwiki/latest/ruwiki-latest-pages-articles.xml.bz2

In [3]:
import re
import nltk
import contractions
import numpy as np
import unicodedata as ucd
import spacy
from corus import load_wiki
from nltk.corpus import stopwords
from num2words import num2words
from razdel import tokenize, sentenize
from pymystem3 import Mystem
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize
from spacy import displacy
from gensim.utils import tokenize
from nltk import word_tokenize
from natasha import Doc, Segmenter, NewsEmbedding, NewsMorphTagger

In [None]:
# Seed NumPy's global random generator.
np.random.seed(42)

# NLTK resources used by this notebook. 'punkt_tab' is the resource that recent
# NLTK releases look up for Punkt tokenization (word_tokenize / sent_tokenize);
# 'punkt' is kept for older releases. nltk.download() reports an unknown
# resource and returns False rather than raising, so requesting both is safe.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Russian stop words as a set, used by remove_stopwords().
stop_words = set(stopwords.words("russian"))

In [None]:
# Location of the Wikipedia dump archive.
# NOTE(review): presumably /content is the Colab working directory that the
# wget call saved the file into - confirm when running elsewhere.
path = '/content/ruwiki-latest-pages-articles.xml.bz2'
# load_wiki() returns an iterator of records (it is consumed with next() below).
records = load_wiki(path)
# Peek at the first record. This also advances the iterator, so the dataset
# built in the next cell starts from the second record of the dump.
next(records)

In [None]:
# Pull the next 1000 records off the iterator and keep their lower-cased text.
dataset = [
    next(records).text.lower()
    for _article_no in range(1000)
]
print(dataset[0])

# Sentences: split the first article and keep the text of every sentence span.
sentences = [span.text for span in sentenize(dataset[0])]
print(sentences[0])

# Удаление стоп-слов

In [7]:
def remove_stopwords(words):
    """
    Return the tokens of ``words`` that are not in the module-level
    ``stop_words`` set, preserving their order.
    """
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Нормализация

In [None]:
def replace_contractions(text):
    """
    Return ``text`` after passing it through ``contractions.fix``.
    """
    expanded = contractions.fix(text)
    return expanded

# Run the first article through the contraction expander and show the result.
sample = replace_contractions(dataset[0])
print(sample)
# Tokenize the expanded text with NLTK (Russian) and show the token list.
words = nltk.word_tokenize(sample, language="russian")
print(words)

In [None]:
def remove_non_ASCII(words):
    """
    Unicode-normalize every token in a list of tokenized words.

    Despite its historical name, this function does not delete non-ASCII
    characters (that would erase Cyrillic text). It applies NFKC
    normalization: compatibility characters such as full-width digits are
    folded to their plain form, and letters are kept composed.

    NFKC is used instead of NFKD because NFKD splits letters such as
    U+0439 and U+0451 into a base letter plus a combining mark. A later
    punctuation-stripping step would then drop the mark and change the word.

    :param words: iterable of string tokens
    :return: list of normalized tokens, same length and order as the input
    """
    return [ucd.normalize('NFKC', word) for word in words]

def to_lowercase(words):
    """
    Return a new list with every token of ``words`` converted to lower case.
    """
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """
    Strip punctuation from a list of tokenized words.

    Every character that is neither a word character nor whitespace is
    removed from each token. Tokens that end up empty (for example tokens
    made only of punctuation) are dropped from the result.

    :param words: iterable of string tokens
    :return: list of non-empty tokens with punctuation removed
    """
    # Filter AFTER the substitution: a token such as "," is not empty before
    # stripping but is empty afterwards.
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [word for word in stripped if word != '']

def replace_numbers(words):
    """
    Replace every purely numeric token with its Russian textual form.

    A token is treated as a number only when ``str.isdecimal()`` is true for
    it. Unlike ``str.isdigit()``, that excludes characters such as
    superscript digits, which cannot be parsed as an integer. All other
    tokens are returned unchanged.

    :param words: iterable of string tokens
    :return: list of tokens with numeric tokens spelled out via ``num2words``
    """
    return [
        # Convert to int explicitly so num2words receives a number, not text.
        num2words(int(word), lang='ru') if word.isdecimal() else word
        for word in words
    ]

def remove_eng_words(words):
    """
    Remove Latin letters from the tokens and tidy up the resulting text.

    The tokens are joined with single spaces, every ASCII Latin letter is
    replaced by a space, a dot that directly follows a number is dropped,
    and runs of whitespace are collapsed.

    :param words: iterable of string tokens
    :return: a single string (not a list) with the cleaned tokens separated
        by single spaces and no leading or trailing whitespace
    """
    text = ' '.join(words)
    text = re.sub(r"[a-zA-Z]", " ", text)
    # NOTE(review): the original replacement was the non-raw "\1", which is
    # the control character U+0001 rather than a backreference. It is read
    # here as "drop the dot after a number" (decimals like 3.14 are left
    # alone) - confirm the intent.
    text = re.sub(r"(\d+)\.(?!\d)", r"\1", text)
    # Collapse whitespace runs to ONE space; deleting them would glue the
    # neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize(words):
    """
    Run a token list through the full normalization pipeline.

    The stages are applied in order: Unicode normalization, lower-casing,
    punctuation removal, number spelling, stop-word removal and Latin-letter
    removal. The value returned is whatever the last stage returns;
    ``remove_eng_words`` joins the tokens, so the result is a string.
    """
    pipeline = (
        remove_non_ASCII,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
        remove_eng_words,
    )
    for stage in pipeline:
        words = stage(words)
    return words

# Re-tokenize the sample so this cell does not depend on the current value of `words`.
words = nltk.word_tokenize(sample, language="russian")
# normalize() ends with remove_eng_words(), which joins the tokens, so from
# here on `words` is a single string rather than a list.
words = normalize(words)
print(words)

# Лемматизация/Стемминг

In [None]:
def stem_words(words):
    """
    Return the Russian Snowball stem of every token in ``words``.

    A new ``SnowballStemmer`` for Russian is created on each call.
    """
    russian_stemmer = SnowballStemmer(language="russian")
    return [russian_stemmer.stem(token) for token in words]

# `words` is the normalized text (a string), so tokenize it again before stemming.
last_words = nltk.word_tokenize(words, language="russian")
stems = stem_words(last_words)
# Show the stemmed tokens.
print('Исходные лексемы:\n', stems)

In [None]:
# Split the normalized text into sentences (Russian).
sentences = sent_tokenize(words, language='russian')
# Parse the same text with the small Russian spaCy pipeline.
nlp = spacy.load("ru_core_news_sm")
doc = nlp(words)

# Print text, dependency label and syntactic head for the first ten tokens.
# Slicing the document replaces the manual counter and is safe when the
# document has fewer than ten tokens.
for token in doc[:10]:
    print(token.text, token.dep_, token.head)

In [17]:
# Draw the dependency tree ('dep' style) for tokens 2..12 of the parsed
# document, rendered inline in the notebook (jupyter=True).
displacy.render(doc[2:13], style='dep', jupyter=True)

In [None]:
# gensim tokens. `tokenize` here is gensim.utils.tokenize: that import comes
# after the razdel one and therefore shadows it.
gen_tokens = list(tokenize(words))
print(f'gensim: {gen_tokens[25:35]}')

# Razdel + Natasha: segment the text, then tag morphology.
# NOTE: this rebinds `doc`, which previously held the spaCy document.
segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
doc = Doc(words)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
print(f'\nRazdel + Natasha: {doc.tokens[:5]}')
print(doc.sents[:5])

# pymystem3: lemmatize the text, then show a slice of the analysis output.
m = Mystem()
lemmas = m.lemmatize(words)
print(f'\npymystem3: {lemmas[5:15]}')
print(m.analyze(words)[5:10])