In [42]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import FastText
import re
import nltk

# Для обработки текста
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
from stop_words import get_stop_words

# Для векторизации
from sklearn.feature_extraction.text import TfidfVectorizer

# Для моделирования
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Инициализация компонентов natasha
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [6]:
nltk.download('punkt')
nltk.download('stopwords')
russian_stopwords = set(nltk.corpus.stopwords.words("russian"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bende\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bende\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Функция предобработки текста
def preprocess_text_to_tokens(text):
    russian_stopwords = set(get_stop_words("russian"))
    
    # Создаем документ natasha
    doc = Doc(text.lower())
    
    # Токенизация
    doc.segment(segmenter)
    
    # Морфологический анализ
    doc.tag_morph(morph_tagger)
    
    tokens = []
    for token in doc.tokens:
        if token.text.isalpha():
            token.lemmatize(morph_vocab)
            lemma = token.lemma
            if lemma not in russian_stopwords:
                tokens.append(token.text)
    return tokens

In [18]:
df = pd.read_csv('train.csv')

In [20]:
df['tokens'] = df['text'].apply(preprocess_text_to_tokens)

In [22]:
df.to_csv('train_prepared_fasttext.csv')

In [23]:
df = pd.read_csv('train_prepared_fasttext.csv')

In [24]:
all_tokens = df['tokens'].tolist()

In [25]:
# Задаем параметры модели
fasttext_model = FastText(
    vector_size=100,    # размерность векторов
    window=5,           # размер окна
    min_count=5,        # минимальная частота слова для включения в словарь
    workers=4,          # количество потоков
    sg=1                # используем skip-gram (sg=1), для cbow используйте sg=0
)

# Строим словарь
fasttext_model.build_vocab(all_tokens)

# Обучаем модель
fasttext_model.train(
    corpus_iterable=all_tokens,
    total_examples=len(all_tokens),
    epochs=10
)

(126765745, 625009070)

In [26]:
def get_text_embedding(tokens, model):
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])
    if embeddings:
        text_embedding = np.mean(embeddings, axis=0)
    else:
        # Если текст пустой или не содержит известных слов
        text_embedding = np.zeros(model.vector_size)
    return text_embedding

In [27]:
df['embedding'] = df['tokens'].apply(lambda tokens: get_text_embedding(tokens, fasttext_model))

In [28]:
# Преобразуем список эмбеддингов в массив numpy
X = np.array(df['embedding'].tolist())
y = df['target'].values

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
# Обучение линейной модели - Логистическая регрессия
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Точность Логистической регрессии: {accuracy_logistic:.4f}')

Точность Логистической регрессии: 0.2841


In [43]:
eval_dataset = Pool(X_test,
                    y_test)

In [46]:
# Обучение CatBoost классификатора
catboost_model = CatBoostClassifier(learning_rate=0.01, iterations=10000, task_type="GPU", devices='0')
catboost_model.fit(X_train, y_train, eval_set=eval_dataset)
y_pred_catboost = catboost_model.predict(X_test)
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f'Точность CatBoost классификатора: {accuracy_catboost:.4f}')

0:	learn: 2.1862338	test: 2.1863348	best: 2.1863348 (0)	total: 9.41ms	remaining: 1m 34s
1:	learn: 2.1752994	test: 2.1755156	best: 2.1755156 (1)	total: 17.8ms	remaining: 1m 29s
2:	learn: 2.1648315	test: 2.1651138	best: 2.1651138 (2)	total: 26.1ms	remaining: 1m 27s
3:	learn: 2.1548245	test: 2.1552202	best: 2.1552202 (3)	total: 34.7ms	remaining: 1m 26s
4:	learn: 2.1451813	test: 2.1456766	best: 2.1456766 (4)	total: 43.3ms	remaining: 1m 26s
5:	learn: 2.1358406	test: 2.1364567	best: 2.1364567 (5)	total: 54.5ms	remaining: 1m 30s
6:	learn: 2.1266263	test: 2.1272931	best: 2.1272931 (6)	total: 62.8ms	remaining: 1m 29s
7:	learn: 2.1175880	test: 2.1183708	best: 2.1183708 (7)	total: 71ms	remaining: 1m 28s
8:	learn: 2.1089582	test: 2.1098434	best: 2.1098434 (8)	total: 79.2ms	remaining: 1m 27s
9:	learn: 2.1005743	test: 2.1014829	best: 2.1014829 (9)	total: 87.5ms	remaining: 1m 27s
10:	learn: 2.0923247	test: 2.0933658	best: 2.0933658 (10)	total: 96.1ms	remaining: 1m 27s
11:	learn: 2.0843226	test: 2.085

Крч полная фигня))