## Парсинг как в примере бейзлайна

In [1]:
# Импорт библиотек
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from time import sleep

In [26]:
class lentaRu_parser:
    def __init__(self):
        pass

    def _get_url(self, param_dict: dict) -> str:
        """
        Возвращает URL для запроса json таблицы со статьями

        url = 'https://lenta.ru/search/v2/process?'\
        + 'from=0&'\                       # Смещение
        + 'size=1000&'\                    # Кол-во статей
        + 'sort=2&'\                       # Сортировка по дате (2), по релевантности (1)
        + 'title_only=0&'\                 # Точная фраза в заголовке
        + 'domain=1&'\                     # ??
        + 'modified%2Cformat=yyyy-MM-dd&'\ # Формат даты
        + 'type=1&'\                       # Материалы. Все материалы (0). Новость (1)
        + 'bloc=4&'\                       # Рубрика. Экономика (4). Все рубрики (0)
        + 'modified%2Cfrom=2020-01-01&'\
        + 'modified%2Cto=2020-11-01&'\
        + 'query='                         # Поисковой запрос
        """
        hasType = int(param_dict['type']) != 0
        hasBloc = int(param_dict['bloc']) != 0

        url = 'https://lenta.ru/search/v2/process?'\
        + 'from={}&'.format(param_dict['from'])\
        + 'size={}&'.format(param_dict['size'])\
        + 'sort={}&'.format(param_dict['sort'])\
        + 'title_only={}&'.format(param_dict['title_only'])\
        + 'domain={}&'.format(param_dict['domain'])\
        + 'modified%2Cformat=yyyy-MM-dd&'\
        + 'type={}&'.format(param_dict['type']) * hasType\
        + 'bloc={}&'.format(param_dict['bloc']) * hasBloc\
        + 'modified%2Cfrom={}&'.format(param_dict['dateFrom'])\
        + 'modified%2Cto={}&'.format(param_dict['dateTo'])\
        + 'query={}'.format(param_dict['query'])

        return url


    def _get_search_table(self, param_dict: dict) -> pd.DataFrame:
        """
        Возвращает pd.DataFrame со списком статей
        """
        url = self._get_url(param_dict)
        with rq.get(url) as r:
         search_table = pd.DataFrame(r.json()['matches'])
        sleep(1)
        return search_table


    def get_articles(self,
                     param_dict,
                     time_step = 37,
                     save_every = 5,
                     save_excel = True) -> pd.DataFrame:
        """
        Функция для скачивания статей интервалами через каждые time_step дней
        Делает сохранение таблицы через каждые save_every * time_step дней

        param_dict: dict
        ### Параметры запроса
        ###### project - раздел поиска, например, rbcnews
        ###### category - категория поиска, например, TopRbcRu_economics
        ###### dateFrom - с даты
        ###### dateTo - по дату
        ###### offset - смещение поисковой выдачи
        ###### limit - лимит статей, максимум 100
        ###### query - поисковой запрос (ключевое слово), например, РБК

        """
        param_copy = param_dict.copy()
        time_step = timedelta(days=time_step)
        dateFrom = datetime.strptime(param_copy['dateFrom'], '%Y-%m-%d')
        dateTo = datetime.strptime(param_copy['dateTo'], '%Y-%m-%d')
        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')

        out = pd.DataFrame()
        save_counter = 0
        
        out = pd.read_csv('tmp/checkpoint_table.csv')

        while dateFrom <= dateTo:
            param_copy['dateTo'] = (dateFrom + time_step).strftime('%Y-%m-%d')
            if dateFrom + time_step > dateTo:
                param_copy['dateTo'] = dateTo.strftime('%Y-%m-%d')
            print('Parsing articles from '\
                  + param_copy['dateFrom'] +  ' to ' + param_copy['dateTo'])
            out = pd.concat([out, pd.DataFrame(self._get_search_table(param_copy))], ignore_index=True)
            dateFrom += time_step + timedelta(days=1)
            param_copy['dateFrom'] = dateFrom.strftime('%Y-%m-%d')
            save_counter += 1
            if save_counter == save_every:
                # display.clear_output(wait=True)
                out.to_csv("tmp/checkpoint_table.csv")
                print('Checkpoint saved!')
                save_counter = 0

        if save_excel:
            out.to_csv("lenta_{}_{}.csv".format(
                param_dict['dateFrom'],
                param_dict['dateTo']))
        print('Finish')

        return out

  """


In [31]:
# Задаем тут параметры
query = ''
offset = 0
size = 700
sort = "3"
title_only = "0"
domain = "1"
material = "0"
bloc = "0" # topic = тематика новости
dateFrom = '2024-09-15'
dateTo = "2024-12-15"

param_dict = {'query'     : query,
              'from'      : str(offset),
              'size'      : str(size),
              'dateFrom'  : dateFrom,
              'dateTo'    : dateTo,
              'sort'      : sort,
              'title_only': title_only,
              'type'      : material,
              'bloc'      : bloc,
              'domain'    : domain}

print("param_dict:", param_dict)

param_dict: {'query': '', 'from': '0', 'size': '700', 'dateFrom': '2024-09-15', 'dateTo': '2024-12-15', 'sort': '3', 'title_only': '0', 'type': '0', 'bloc': '0', 'domain': '1'}


In [33]:
# Тоже будем собирать итеративно, правда можно ставить time_step побольше, т.к.
# больше лимит на запрос статей. И Работает быстрее :)

parser = lentaRu_parser()

tbl = parser.get_articles(param_dict=param_dict,
                         time_step = 5,
                         save_every = 5,
                         save_excel = True)
print(len(tbl.index))
tbl.head()

Parsing articles from 2024-09-15 to 2024-09-20
Parsing articles from 2024-09-21 to 2024-09-26
Parsing articles from 2024-09-27 to 2024-10-02
Parsing articles from 2024-10-03 to 2024-10-08
Parsing articles from 2024-10-09 to 2024-10-14
Checkpoint saved!
Parsing articles from 2024-10-15 to 2024-10-20
Parsing articles from 2024-10-21 to 2024-10-26
Parsing articles from 2024-10-27 to 2024-11-01
Parsing articles from 2024-11-02 to 2024-11-07
Parsing articles from 2024-11-08 to 2024-11-13
Checkpoint saved!
Parsing articles from 2024-11-14 to 2024-11-19
Parsing articles from 2024-11-20 to 2024-11-25
Parsing articles from 2024-11-26 to 2024-12-01
Parsing articles from 2024-12-02 to 2024-12-07
Parsing articles from 2024-12-08 to 2024-12-13
Checkpoint saved!
Parsing articles from 2024-12-14 to 2024-12-15
Finish
80990


Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,docid,url,title,modified,...,domain,status,part,bloc,tags,image_url,pubdate,text,rightcol,snippet
0,0.0,0.0,0.0,0.0,0.0,0.0,1363803,https://lenta.ru/news/2023/01/01/exponenta/,Ким Чен Ын пообещал нарастить производство яде...,1672531825,...,1,0,0,2,[1],https://icdn.lenta.ru/images/2023/01/01/03/202...,1672531825,Фото: ЦТАК / Reuters Марина Совина Лидер КНДР ...,Ким Чен Ын пообещал нарастить производство яде...,Фото: ЦТАК / Reuters Марина Совина Лидер ... я...
1,1.0,1.0,1.0,1.0,1.0,1.0,1363805,https://lenta.ru/news/2023/01/01/yaroslavl/,В российском городе пропал ребенок,1672532081,...,1,0,0,1,[4],https://icdn.lenta.ru/images/2023/01/01/03/202...,1672532081,Фото: Кирилл Шипицин / РИА Новости Марина Сови...,В российском городе пропал ребенок,Фото: Кирилл Шипицин / РИА Новости ... мальчик...
2,2.0,2.0,2.0,2.0,2.0,2.0,1363807,https://lenta.ru/news/2023/01/01/alco/,Россиянам рассказали о влиянии алкоголя на сон,1672533004,...,1,0,0,1,[2],https://icdn.lenta.ru/images/2023/01/01/03/202...,1672533004,Фото: Pixabay Марина Совина Терапевт Ирина Анд...,Россиянам рассказали о влиянии алкоголя на сон,Фото: Pixabay Марина Совина Терапевт ... употр...
3,3.0,3.0,3.0,3.0,3.0,3.0,1363808,https://lenta.ru/news/2023/01/01/ded_moroz/,Подсчитана пенсия Деда Мороза,1672533251,...,1,0,1,1,[2],https://icdn.lenta.ru/images/2023/01/01/03/202...,1672533251,Фото: Илья Наймушин/ РИА Новости Марина Совина...,Подсчитана пенсия Деда Мороза,Фото: Илья Наймушин/ РИА Новости Марина ... Ги...
4,4.0,4.0,4.0,4.0,4.0,4.0,1363806,https://lenta.ru/news/2023/01/01/anomalia_/,Климатолог предупредил о возможных погодных ан...,1672533531,...,1,0,0,12,[281],https://icdn.lenta.ru/images/2023/01/01/03/202...,1672533531,Фото: Komsomolskaya Pravda / Global Look Press...,Климатолог предупредил о возможных погодных ан...,Фото: Komsomolskaya Pravda / Global Look ... К...


In [34]:
tbl['bloc'].value_counts(normalize=True)

bloc
1     0.183912
2     0.173960
3     0.119669
4     0.095222
37    0.079590
8     0.060662
5     0.042746
7     0.038869
47    0.036066
48    0.033473
6     0.030140
9     0.027448
12    0.026682
87    0.026571
0     0.012816
86    0.009297
49    0.002679
11    0.000099
40    0.000086
35    0.000012
Name: proportion, dtype: float64

In [112]:
tbl[tbl['bloc'] == 49].sample(1).iloc[0].title

'Кремль ответил на обновленную стратегию США по Арктике'

In [113]:
tbl = tbl[tbl.bloc.isin([1, 37, 3, 4, 5, 8, 48, 87])]

TagsMap = {1: 0, 
           3: 3, 
           4: 1, 
           5: 8, 
           8: 4, 
           37: 2, 
           48: 7, 
           87: 5}

tbl['topic'] = tbl['bloc'].map(TagsMap)

In [114]:
tbl.shape

(51983, 23)

In [115]:
tbl['topic'].value_counts(normalize=True) # можно сверить с распределением меток классов в соревновании

topic
0    0.286536
3    0.186446
1    0.148356
2    0.124002
4    0.094512
8    0.066599
7    0.052152
5    0.041398
Name: proportion, dtype: float64

In [116]:
tbl_new = tbl[~tbl.text.isna()]
print(len(tbl), len(tbl_new))

51983 50642


In [117]:
tbl_new.to_csv('final_dataset.csv')

## Эксперименты

In [7]:
import pandas as pd
import numpy as np
import re

# Для обработки текста
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
from stop_words import get_stop_words

# Для векторизации
from sklearn.feature_extraction.text import TfidfVectorizer

# Для моделирования
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Инициализация компонентов natasha
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [19]:
# Функция предобработки текста
def preprocess_text(text):
    russian_stopwords = set(get_stop_words("russian"))

    if not isinstance(text, str):
        return ''
    # Создаем документ natasha
    doc = Doc(text.lower())
    
    # Токенизация
    doc.segment(segmenter)
    
    # Морфологический анализ
    doc.tag_morph(morph_tagger)
    
    tokens = []
    for token in doc.tokens:
        if token.text.isalpha():
            token.lemmatize(morph_vocab)
            lemma = token.lemma
            if lemma not in russian_stopwords:
                tokens.append(lemma)
    
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [10]:
tbl_new = pd.read_csv('final_dataset.csv')

In [20]:
tbl_new['clean_text'] = tbl_new['text'].apply(preprocess_text)

In [21]:
X = tbl_new['clean_text']
y = tbl_new['topic']

Сразу выбрал tf-idf как достаточно сильный алгоритм векторизации текста

In [22]:
# Векторизация с использованием TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [128]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Точность Логистической регрессии: {accuracy_logistic:.4f}')

Точность Логистической регрессии: 0.9225


Неплохое качество у логрега, но нужно понимать, что у нас тут дизбаланс классов, так что accuracy не честный

In [135]:
catboost_model = CatBoostClassifier(learning_rate=0.1, iterations=1000, task_type="GPU", devices='0')
catboost_model.fit(X_train, y_train)
y_pred_catboost = catboost_model.predict(X_test)
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f'Точность CatBoost классификатора: {accuracy_catboost:.4f}')

0:	learn: 1.8996964	total: 733ms	remaining: 12m 12s
1:	learn: 1.7773059	total: 1.15s	remaining: 9m 32s
2:	learn: 1.6799303	total: 1.56s	remaining: 8m 39s
3:	learn: 1.6079916	total: 2s	remaining: 8m 18s
4:	learn: 1.5477794	total: 2.43s	remaining: 8m 3s
5:	learn: 1.4916134	total: 2.82s	remaining: 7m 47s
6:	learn: 1.4447258	total: 3.23s	remaining: 7m 37s
7:	learn: 1.3939668	total: 3.68s	remaining: 7m 35s
8:	learn: 1.3543440	total: 4.08s	remaining: 7m 29s
9:	learn: 1.3221197	total: 4.5s	remaining: 7m 25s
10:	learn: 1.2859584	total: 4.94s	remaining: 7m 24s
11:	learn: 1.2491479	total: 5.47s	remaining: 7m 30s
12:	learn: 1.2154729	total: 5.86s	remaining: 7m 24s
13:	learn: 1.1889237	total: 6.21s	remaining: 7m 17s
14:	learn: 1.1650622	total: 6.59s	remaining: 7m 13s
15:	learn: 1.1423478	total: 6.98s	remaining: 7m 9s
16:	learn: 1.1205714	total: 7.36s	remaining: 7m 5s
17:	learn: 1.0974730	total: 7.74s	remaining: 7m 2s
18:	learn: 1.0779804	total: 8.18s	remaining: 7m 2s
19:	learn: 1.0605629	total: 8.

Катбуст показал качество сравнимое, так что я думаю, что плохие данные, так как деревья должны быть сильнее)))

## Тестим

In [26]:
df_test = pd.read_csv('..//first_attempt/test_news.csv')

In [28]:
df_test['clean_text'] = df_test['content'].apply(preprocess_text)

In [136]:
X_kaggle_test = vectorizer.transform(df_test['clean_text'])

y_pred_logistic = logistic_model.predict(X_kaggle_test)
y_pred_catboost = catboost_model.predict(X_kaggle_test)

In [137]:
pd.DataFrame({'topic': y_pred_catboost[:, 0]}).reset_index().to_csv('logistic__sec_answer.csv', index=False)

Так как тут нет 6 класса (Строительство), то я его подтяну из прошлых распаршенных новостных статей (там правда класс недвижимость)

In [None]:
tbl_new = pd.read_csv('after_clean.csv')
old_df = pd.read_csv('../first_attempt/train_prepared.csv')

In [149]:
new_df = pd.concat([tbl_new[['clean_text', 'target']], old_df[old_df['target'] == 6][['clean_text', 'target']]])

In [9]:
new_df.dropna(inplace=True)

In [10]:
X = new_df['clean_text']
y = new_df['target']

In [11]:
# Векторизация с использованием TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [163]:
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Точность Логистической регрессии: {accuracy_logistic:.4f}')

Точность Логистической регрессии: 0.9131


In [24]:
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
 
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

In [25]:
catboost_model = CatBoostClassifier(loss_function='MultiClassOneVsAll', class_weights=class_weights, 
                                    learning_rate=0.1, iterations=1000, task_type="GPU", devices='0')
catboost_model.fit(X_train, y_train)
y_pred_catboost = catboost_model.predict(X_test)
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f'Точность CatBoost классификатора: {accuracy_catboost:.4f}')

0:	learn: 0.6287343	total: 708ms	remaining: 11m 47s
1:	learn: 0.5757322	total: 1.18s	remaining: 9m 49s
2:	learn: 0.5314248	total: 1.61s	remaining: 8m 56s
3:	learn: 0.4938894	total: 2.09s	remaining: 8m 39s
4:	learn: 0.4621009	total: 2.57s	remaining: 8m 30s
5:	learn: 0.4344229	total: 3.03s	remaining: 8m 22s
6:	learn: 0.4101353	total: 3.79s	remaining: 8m 57s
7:	learn: 0.3889288	total: 4.2s	remaining: 8m 40s
8:	learn: 0.3702063	total: 4.64s	remaining: 8m 30s
9:	learn: 0.3536890	total: 5.13s	remaining: 8m 27s
10:	learn: 0.3390677	total: 5.59s	remaining: 8m 22s
11:	learn: 0.3259814	total: 6.07s	remaining: 8m 20s
12:	learn: 0.3144146	total: 6.52s	remaining: 8m 15s
13:	learn: 0.3039755	total: 7s	remaining: 8m 12s
14:	learn: 0.2942692	total: 7.49s	remaining: 8m 11s
15:	learn: 0.2857425	total: 7.96s	remaining: 8m 9s
16:	learn: 0.2770846	total: 8.66s	remaining: 8m 20s
17:	learn: 0.2699198	total: 9.06s	remaining: 8m 14s
18:	learn: 0.2631742	total: 9.54s	remaining: 8m 12s
19:	learn: 0.2566644	total

Фигня какая-то

In [29]:
X_kaggle_test = vectorizer.transform(df_test['clean_text'])

# y_pred_logistic = logistic_model.predict(X_kaggle_test)
y_pred_catboost = catboost_model.predict(X_kaggle_test)

In [31]:
pd.DataFrame({'topic': y_pred_catboost[:, 0]}).reset_index().to_csv('weighted_answer.csv', index=False)

Результат по лидерборду чуть лучше: с 0.809 до 0.813, но все равно слабо