In [1]:
import pandas as pd
import numpy as np
import re

# Для обработки текста
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
from stop_words import get_stop_words

# Для векторизации
from sklearn.feature_extraction.text import TfidfVectorizer

# Для моделирования
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Инициализация компонентов natasha
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [3]:
df = pd.read_csv('train.csv')

In [3]:
# Функция предобработки текста
def preprocess_text(text):
    russian_stopwords = set(get_stop_words("russian"))
    
    # Создаем документ natasha
    doc = Doc(text.lower())
    
    # Токенизация
    doc.segment(segmenter)
    
    # Морфологический анализ
    doc.tag_morph(morph_tagger)
    
    tokens = []
    for token in doc.tokens:
        if token.text.isalpha():
            token.lemmatize(morph_vocab)
            lemma = token.lemma
            if lemma not in russian_stopwords:
                tokens.append(lemma)
    
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [5]:
df['clean_text'] = df['text'].apply(preprocess_text)

In [6]:
df.to_csv('train_prepared.csv')

In [4]:
df = pd.read_csv('train_prepared.csv')

In [5]:
X = df['clean_text']
y = df['target']

In [6]:
# Векторизация с использованием TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

In [7]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [8]:
# Обучение линейной модели - Логистическая регрессия
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Точность Логистической регрессии: {accuracy_logistic:.4f}')

Точность Логистической регрессии: 0.9285


In [8]:
# Обучение CatBoost классификатора
catboost_model = CatBoostClassifier(learning_rate=0.1, iterations=5000, task_type="GPU", devices='0')
catboost_model.fit(X_train, y_train)
y_pred_catboost = catboost_model.predict(X_test)
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f'Точность CatBoost классификатора: {accuracy_catboost:.4f}')

0:	learn: 1.9575907	total: 571ms	remaining: 47m 33s
1:	learn: 1.8202526	total: 1.02s	remaining: 42m 29s
2:	learn: 1.7288745	total: 1.51s	remaining: 42m 1s
3:	learn: 1.6498442	total: 1.99s	remaining: 41m 22s
4:	learn: 1.5840649	total: 2.54s	remaining: 42m 15s
5:	learn: 1.5288670	total: 2.97s	remaining: 41m 15s
6:	learn: 1.4772764	total: 3.42s	remaining: 40m 41s
7:	learn: 1.4338200	total: 3.92s	remaining: 40m 44s
8:	learn: 1.3973025	total: 4.38s	remaining: 40m 32s
9:	learn: 1.3575582	total: 4.93s	remaining: 40m 59s
10:	learn: 1.3265578	total: 5.39s	remaining: 40m 46s
11:	learn: 1.2990587	total: 5.8s	remaining: 40m 11s
12:	learn: 1.2714449	total: 6.26s	remaining: 40m 1s
13:	learn: 1.2448279	total: 6.72s	remaining: 39m 52s
14:	learn: 1.2206282	total: 7.14s	remaining: 39m 33s
15:	learn: 1.1986738	total: 7.63s	remaining: 39m 37s
16:	learn: 1.1808646	total: 8.04s	remaining: 39m 17s
17:	learn: 1.1595088	total: 8.6s	remaining: 39m 39s
18:	learn: 1.1429414	total: 8.99s	remaining: 39m 17s
19:	lea

In [None]:
# Эксперимент с биграммами в TF-IDF
vectorizer_bi = TfidfVectorizer(ngram_range=(1,2))
X_tfidf_bi = vectorizer_bi.fit_transform(X)
X_train_bi, X_test_bi, y_train_bi, y_test_bi = train_test_split(X_tfidf_bi, y, test_size=0.2, random_state=42)

In [9]:
logistic_model_bi = LogisticRegression(max_iter=1000)
logistic_model_bi.fit(X_train_bi, y_train_bi)
y_pred_logistic_bi = logistic_model_bi.predict(X_test_bi)
accuracy_logistic_bi = accuracy_score(y_test_bi, y_pred_logistic_bi)
print(f'Точность Логистической регрессии с биграммами: {accuracy_logistic_bi:.4f}')

Точность Логистической регрессии с биграммами: 0.9224


Тестирование

In [9]:
df_test = pd.read_csv('test_news.csv')

In [10]:
df_test['clean_text'] = df_test['content'].apply(preprocess_text)

In [11]:
X_kaggle_test = vectorizer.transform(df_test['clean_text'])

y_pred_logistic = logistic_model.predict(X_kaggle_test)
# y_pred_catboost = catboost_model.predict(X_kaggle_test)

In [13]:
pd.DataFrame({'topic': y_pred_logistic}).reset_index().to_csv('logistic_new_preprocess_answer.csv', index=False)

Уровень 0.749 по скору нуууу такое, скорее всего проблема в данных, но ща попробуем еще другие подходы