In [1]:
import gzip
import re

from dataclasses import dataclass 
from typing import Iterator

import nltk
import numpy as np
import pandas as pd
import pymorphy3

from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

### 1.

In [2]:
# NLTK resources needed below: tokenizer models for word_tokenize and the
# stop-word lists. Recent NLTK releases (3.9+) load 'punkt_tab' instead of
# the pickled 'punkt' models, so both are fetched to keep the notebook
# runnable on old and new installations alike.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sergei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sergei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
@dataclass
class Text:
    """One record of the news corpus: its label, title and text."""
    label: str
    title: str
    text: str

def read_texts(fn: str) -> Iterator[Text]:
    """Lazily yield ``Text`` records from a gzip-compressed, tab-separated file.

    Every non-blank line is expected to look like ``label<TAB>title<TAB>text``.

    Args:
        fn: path to the gzip file, decoded as UTF-8.

    Yields:
        One ``Text`` per non-blank line. Leading whitespace of the label and
        trailing whitespace of the text are removed.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Remove only the line terminator: a plain strip() would also eat
            # a trailing tab and silently drop an empty text field.
            record = line.rstrip('\r\n')
            if not record.strip():
                continue  # blank line, nothing to parse

            # maxsplit=2 keeps any tab characters inside the text itself.
            fields = record.split('\t', 2)
            if len(fields) != 3:
                raise ValueError(
                    f'{fn}:{line_no}: expected 3 tab-separated fields, got {len(fields)}'
                )

            label, title, text = fields
            yield Text(label.lstrip(), title, text.rstrip())

# Read the whole corpus into a DataFrame; later cells access its 'label' and 'text' columns.
news = pd.DataFrame(read_texts('data/news.txt.gz'))

In [4]:
# Lower-case Russian alphabet; used to keep only purely Russian tokens.
russian_letters = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
# NLTK returns a list; a set makes the per-token membership test O(1).
stop_words = set(stopwords.words('russian'))
# Single morphological analyzer instance shared by the tokenizer.
morph = pymorphy3.MorphAnalyzer()

def only_russian_letters(word):
    """Tell whether ``word`` is made up solely of lower-case Russian letters.

    Returns True for an empty string, because it contains no offending
    character.
    """
    return all(char in russian_letters for char in word)

def tokenize_text(text):
    """Convert raw text into a list of lemmas of Russian content words.

    The text is lower-cased and split with NLTK's Russian word tokenizer.
    A token is kept only if it consists entirely of Russian letters and is
    not a stop word; every kept token is replaced by the normal form of its
    first pymorphy parse.

    Tokens containing any other character (digits, Latin letters,
    punctuation, hyphens as in "кто-то") are discarded as a whole.

    Args:
        text: raw document text.

    Returns:
        List of lemma strings in document order.
    """
    lemmas = []
    for token in word_tokenize(text.lower(), language='russian'):
        # No separate punctuation-stripping pass is needed: a token with any
        # non-letter character fails this check with or without it.
        if not token or not only_russian_letters(token):
            continue
        # Stop words are matched on the surface form, before lemmatization.
        if token in stop_words:
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return lemmas

In [5]:
# Lemmatized token list for every article, in the same order as the rows of `news`.
text_tokens = list(map(tokenize_text, news['text']))

In [6]:
# Train word embeddings on the tokenized corpus. The dimensionality is spelled
# out (it equals the library default) because the document-embedding code
# below builds 100-dimensional vectors.
w2v = Word2Vec(sentences=text_tokens, vector_size=100)

In [7]:
# Sanity check of the embeddings: the ten nearest neighbours of one word.
w2v.wv.most_similar('мужчина', topn=10)

[('женщина', 0.926418662071228),
 ('двое', 0.9001068472862244),
 ('девочка', 0.8997275829315186),
 ('преступник', 0.8876768946647644),
 ('полицейский', 0.8742073178291321),
 ('больница', 0.8724750876426697),
 ('улица', 0.8644284009933472),
 ('квартира', 0.8622693419456482),
 ('врач', 0.8492453098297119),
 ('животное', 0.849097728729248)]

### 2.

In [8]:
def get_text_tokens_embedding(tokens, wv=None):
    """Average the word vectors of ``tokens`` into one document embedding.

    Args:
        tokens: iterable of token strings; tokens missing from the
            vocabulary are ignored.
        wv: keyed-vectors object exposing ``vector_size``, ``key_to_index``
            and ``get_vector``. Defaults to the notebook-level ``w2v.wv``.

    Returns:
        1-D float64 numpy array of length ``wv.vector_size``. It is all
        zeros when none of the tokens is in the vocabulary.
    """
    if wv is None:
        wv = w2v.wv

    # Size the accumulator from the model instead of hard-coding 100, so the
    # function keeps working if the embeddings are retrained with another size.
    embedding = np.zeros(wv.vector_size)
    count = 0

    for token in tokens:
        if token in wv.key_to_index:
            embedding += wv.get_vector(token)
            count += 1

    # Dividing by at least 1 leaves the zero vector untouched for texts
    # without any known token.
    return embedding / max(count, 1)

In [9]:
# One averaged word-vector embedding per article, aligned with `text_tokens`.
text_embeddings = list(map(get_text_tokens_embedding, text_tokens))

In [10]:
X = text_embeddings

# Integer class id for every topic label.
topics = {'science': 0, 'style': 1, 'culture': 2, 'life': 3, 'economics': 4, 'business': 5, 'travel': 6, 'forces': 7, 'media': 8, 'sport': 9}
y = np.array([topics[label] for label in news['label']])

# Fixed seed so the split, and the ROC AUC reported below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 3.

In [11]:
# probability=True enables predict_proba; the probability estimates involve
# randomized data shuffling, so the seed is pinned for reproducible scores.
clf = svm.SVC(probability=True, random_state=42)
clf.fit(X_train, y_train)

In [12]:
# One-vs-rest ROC AUC on the held-out part, computed from class probabilities.
test_probabilities = clf.predict_proba(X_test)
roc_auc_score(y_test, test_probabilities, multi_class='ovr')

0.9726308276830743

### 4.

Оценим степень важности каждого слова для каждого документа при помощи TF-IDF. Выберем только 25 самых значимых слов для текста и на их основе будем вычислять вектор, соответствующий тексту.

In [13]:
# TfidfVectorizer expects strings, so every token list is glued back into a
# space-separated document before fitting.
joined_documents = [' '.join(tokens) for tokens in text_tokens]
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(joined_documents)
# Array of terms; position i corresponds to column i of `tf_idf`.
vocabluary = vectorizer.get_feature_names_out()

In [14]:
def _top_tfidf_terms(row, limit=25):
    """Return up to ``limit`` terms of one sparse TF-IDF row, highest weight first."""
    strongest = np.argsort(row.data)[::-1][:limit]
    return vocabluary[row.indices[strongest]]

# Embed each article using only its highest-weighted TF-IDF terms.
new_text_embeddings = [get_text_tokens_embedding(_top_tfidf_terms(row)) for row in tf_idf]

In [15]:
X = new_text_embeddings
# Fixed seed so the split, and the ROC AUC reported below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# probability=True enables predict_proba; the probability estimates involve
# randomized data shuffling, so the seed is pinned for reproducible scores.
clf = svm.SVC(probability=True, random_state=42)
clf.fit(X_train, y_train)

In [17]:
# One-vs-rest ROC AUC for the classifier trained on the TF-IDF-filtered embeddings.
tfidf_test_probabilities = clf.predict_proba(X_test)
roc_auc_score(y_test, tfidf_test_probabilities, multi_class='ovr')

0.9605483115716342