# GoogleDrive

In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


# Main

In [2]:
import ast
import logging
import multiprocessing

import pandas as pd
import numpy as np
import seaborn as sns

import requests
import datetime
from datetime import datetime, date, timedelta

In [3]:
# logging.root.level = logging.ERROR # disable word2vec eblans warnings
logging.root.level = logging.DEBUG

random_state = 42

BASE_PATH = 'drive/MyDrive/Colab Notebooks/studcamp/' # for google drive
# BASE_PATH = '' # for local

data_path = BASE_PATH + 'data.csv'
data_pickle_path = BASE_PATH + 'data.pkl'

date_format = '%Y/%m/%d'

In [4]:
def save_df(df: pd.DataFrame):
    df.to_csv(data_path, index=False)


def read_df() -> pd.DataFrame:
    return pd.read_csv(data_path)


def save_df_pickle(df: pd.DataFrame):
    df.to_pickle(data_pickle_path)


def read_df_pickle() -> pd.DataFrame:
    return pd.read_pickle(data_pickle_path)

In [5]:
labels = {
    'Общество': 0,
    'Россия': 0,

    'Экономика': 1,
    'Силовые структуры': 2,
    'Бывший СССР': 3,
    'Спорт': 4,
    'Забота о себе': 5,
    'Строительство': 6,

    'Туризм': 7,
    'Путешествия': 7,

    'Наука и техника': 8,
}

labels_set = set(labels.keys())
labels_set

{'Бывший СССР',
 'Забота о себе',
 'Наука и техника',
 'Общество',
 'Путешествия',
 'Россия',
 'Силовые структуры',
 'Спорт',
 'Строительство',
 'Туризм',
 'Экономика'}

In [13]:
# df = pd.DataFrame(columns=['topic', 'article'])

# df = read_df()

In [None]:
save_df(df)

In [6]:
df = read_df_pickle()

In [78]:
save_df_pickle(df)

In [7]:
len(df)

6661

In [79]:
df

Unnamed: 0,topic,article,topic_id,article_preprocessed,vector
0,Силовые структуры,«Я возил им оружие из Махачкалы» Что удалось у...,2,"[возить, оружие, махачкала, удаваться, узнават...","[0.60661364, -0.24199635, -0.0969362, -0.34536..."
1,Экономика,Россияне массово скупают золото. Почему в него...,1,"[россиянин, массово, скупать, золото, почему, ...","[-0.25460634, -0.36730328, -0.2412432, -0.8133..."
2,Экономика,Запретная упаковка. В России хотят отменить че...,1,"[запретный, упаковка, россия, хотеть, отменять...","[-0.07687777, -0.34758478, 0.25802106, -0.9753..."
3,Наука и техника,«Порочные методы следствия» После смерти Стали...,8,"[порочный, метод, следствие, смерть, сталин, с...","[0.23195523, -0.26264107, 0.31117004, 0.044719..."
4,Наука и техника,Уникальный гаджет поступил в продажу в России....,8,"[уникальный, гаджет, поступать, продажа, росси...","[0.034427293, 0.015680952, -0.21674196, -0.415..."
...,...,...,...,...,...
6656,Экономика,Прогноз по году Оценки роста ВВП в 2010 году в...,1,"[прогноз, год, оценка, рост, ввп, год, вновь, ...","[0.14905669, -0.33300966, 0.11172507, -0.28476..."
6657,Бывший СССР,Кому она нужна? Безделье в украинской Раде обр...,3,"[нужный, безделье, украинский, рада, обретать,...","[-0.22620815, -0.5205046, 0.22890382, 0.072475..."
6658,Наука и техника,Фантастическая четверка Микки-Маусов Disney до...,8,"[фантастический, четверка, микки, маусов, дого...","[0.3764503, 0.08406858, -0.07813962, -0.684656..."
6659,Экономика,Неинтересно Популярность негосударственных пен...,1,"[неинтересно, популярность, негосударственный,...","[-0.14288117, -0.44632474, -0.3792301, -1.0764..."


# Parser

In [None]:
!pip install loguru fake_useragent

In [None]:
from loguru import logger
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [None]:
LOGGING = 'logs.log'
logger.add(LOGGING, level='DEBUG')

1

In [None]:
def add_day(d: date) -> date:
    return d + timedelta(days=1)


def get_date_as_str(d: date):
    return d.strftime(date_format)


def get_url_from_date(d: date) -> str:
    return f'https://lenta.ru/articles/{get_date_as_str(d)}/'


def get_url_from_href(s: str) -> str:
    return f'https://lenta.ru/{s}'


def get_page_tree_by_date(response) -> BeautifulSoup:
    tree = BeautifulSoup(response.content, 'html.parser')
    return tree


def get_href_from_new(new) -> str:
    return new.a.get('href')


def requests_get(url: str):
    response = requests.get(url, headers={'User-Agent': UserAgent().chrome})
    return response

In [None]:
def parse_lentaru(d: date, save_every_min: int = 5) -> None:
    global df

    if save_every_min <= 0:
        save_every_min = 5

    d_1 = datetime.now()

    while True:
        url = get_url_from_date(d)

        response = requests_get(url)
        if response.status_code != 200:
            logger.error(response.status_code)
            save_df(df)
            logger.warning('Checkpoint saved!')
            return

        tree = get_page_tree_by_date(response)
        news_li = tree.find_all('li', {'class': 'archive-page__item _article'})

        result_pages = 0
        for new in news_li:
            full_article_url = get_href_from_new(new)
            full_article_url = get_url_from_href(full_article_url) # https://lenta.ru//articles/2022/12/02/prison/

            response = requests_get(full_article_url)
            if response.status_code != 200:
                logger.error(response.status_code)
                return

            full_article_tree = get_page_tree_by_date(response)

            topic = full_article_tree.find('a', {'class': 'topic-header__item topic-header__rubric'})
            if not topic:
                continue

            topic = topic.text
            if topic not in labels_set:
                continue

            article = full_article_tree.find('div', {'class': 'topic-body _article'}).text
            if not article:
                continue

            df = pd.concat([df, pd.DataFrame({'topic': [topic], 'article': [article]})], ignore_index=True)
            result_pages += 1

        d -= timedelta(days=1)
        logger.info(f'{get_date_as_str(d)} - {result_pages} pages')

        d_2 = datetime.now()
        if (d_2 - d_1).seconds // 60 >= save_every_min:
            d_1 = d_2
            save_df(df)
            logger.warning('Checkpoint saved!')

In [None]:
df.groupby('topic', group_keys=False)['article'].count()

In [None]:
labels_counts = df.groupby('topic_id', group_keys=False)['article'].count()

In [None]:
sns.barplot(labels_counts)

In [None]:
df = df[df['topic'].isin(labels_set)]

In [None]:
save_df(df)

In [None]:
# start = date.today()

start = datetime.strptime('2009/08/28', date_format) - timedelta(days=1)

parse_lentaru(start, save_every_min=5)

# LabelEncoder

In [46]:
df['topic_id'] = df['topic'].map(labels)

In [None]:
df['topic_id'].value_counts(normalize=True)

# Process & Models

In [134]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import PCA

from gensim.models import Word2Vec

from tqdm import tqdm


tqdm.pandas()

## preprocess text

In [10]:
import re

import nltk
from pymystem3 import Mystem
from string import punctuation

nltk.download("stopwords")
russian_stopwords = nltk.corpus.stopwords.words("russian")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Create lemmatizer and stopwords list
mystem = Mystem()

# Preprocess function
def preprocess_text(text):
    text = re.sub('[^а-яёА-ЯЁ]', ' ', text) # оставляем только кириллицу
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords \
              and token != " " \
              and token.strip() not in punctuation]

    return text

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [None]:
# df['article_preprocessed'] = df['article'].progress_apply(preprocess_text)

100%|██████████| 6661/6661 [00:01<00:00, 4948.24it/s]


In [50]:
# Article_preprocessed and Vector are lists

# df['article_preprocessed'] = df['article_preprocessed'].progress_apply(ast.literal_eval)

100%|██████████| 6661/6661 [00:20<00:00, 327.14it/s]


## Vectorization

In [76]:
def vectorize(sentence: list):
    Sum = 0
    Count = 0

    for w in sentence:
        if w in w2v.wv:
            Sum += w2v.wv[w]
            Count += 1

    if Count == 0:
        return 0

    return Sum / Count

In [None]:
w2v = Word2Vec(
    workers=multiprocessing.cpu_count()
)

w2v.build_vocab(df['article_preprocessed'], progress_per=10000)

w2v.train(
    df['article_preprocessed'], total_examples=w2v.corpus_count, epochs=30, report_delay=1
)

In [75]:
# w2v.save(BASE_PATH + "word2vec.model")
# w2v = Word2Vec.load(BASE_PATH + "word2vec.model")

INFO:gensim.utils:Word2Vec lifecycle event {'fname_or_handle': 'drive/MyDrive/Colab Notebooks/studcamp/word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-04-09T15:15:19.900194', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.1.58+-x86_64-with-glibc2.35', 'event': 'saving'}
INFO:gensim.utils:not storing attribute cum_table
DEBUG:smart_open.smart_open_lib:{'uri': 'drive/MyDrive/Colab Notebooks/studcamp/word2vec.model', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:saved drive/MyDrive/Colab Notebooks/studcamp/word2vec.model


In [77]:
df['vector'] = df['article_preprocessed'].progress_apply(vectorize)

100%|██████████| 6661/6661 [00:19<00:00, 333.44it/s]


In [39]:
# pca = PCA(n_components=30)
# pca.fit(X)

## Split dataset

In [80]:
X, y = df['vector'].to_numpy(), df['topic_id'].to_numpy(dtype=int)

In [81]:
X = np.concatenate(X).reshape(len(X), -1) # if arr of arrays
# X = X.reshape(-1, 1) # if arr of numbers

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
X_train.shape, X_test.shape

((5328, 100), (1333, 100))

# Train

In [35]:
def print_scores(accuracy):
  print('accuracy: ', f'train {accuracy[0]}, test {accuracy[1]}')


def get_vectorizer_Xtrain_Xtest(vectorizer, X_train, X_test):
  vectorizer.fit(X_train)
  Xtrain = vectorizer.transform(X_train)
  Xtest = vectorizer.transform(X_test)

  print(f"{Xtrain.shape = }", f"{Xtest.shape = }")
  return vectorizer, Xtrain, Xtest


def get_predictions(model, Xtrain, Xtest, y_train, y_test):
  model.fit(Xtrain, y_train)

  train_pred = model.predict(Xtrain)
  test_pred = model.predict(Xtest)

  class_report = classification_report(y_test, test_pred)
  return model, train_pred, test_pred, class_report


def get_compute_metrics(y_train, y_test, train_pred, test_pred):
  accuracy = accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)
  return accuracy

## LogisticRegression

In [83]:
from sklearn.linear_model import LogisticRegression

In [None]:
# vec, Xtrain, Xtest = get_vectorizer_Xtrain_Xtest(CountVectorizer(), X_train, X_test)

In [126]:
model_logreg, train_pred, test_pred, class_report = \
          get_predictions(LogisticRegression(max_iter=1000, random_state=random_state), X_train, X_test, y_train, y_test)

In [None]:
# model = LogisticRegression(max_iter=200, random_state=random_state)
# model.fit(X_train, y_train)

In [127]:
print(class_report)

              precision    recall  f1-score   support

           0       0.83      0.84      0.84       386
           1       0.87      0.92      0.89       266
           2       0.68      0.63      0.65        81
           3       0.91      0.84      0.87       151
           4       0.99      0.97      0.98       180
           5       0.84      0.88      0.86        24
           7       0.92      0.85      0.88        54
           8       0.86      0.86      0.86       191

    accuracy                           0.87      1333
   macro avg       0.86      0.85      0.86      1333
weighted avg       0.87      0.87      0.87      1333



In [128]:
accuracy = get_compute_metrics(y_train, y_test, train_pred, test_pred)
print_scores(accuracy)

accuracy:  train 0.8888888888888888, test 0.8664666166541636


## MultinomialNB

In [None]:
labels

In [None]:
labels_counts

In [42]:
from sklearn.naive_bayes import MultinomialNB

In [125]:
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train)
# X_test_counts = count_vect.transform(X_test)

# model_MultinomialNB = MultinomialNB().fit(X_train_counts, y_train)
# model_MultinomialNB

# predicted_proba = model_MultinomialNB.predict_proba(X_test_counts)
# predicted = model_MultinomialNB.predict(X_test_counts)

# accuracy_score(y_test, predicted)

model_MultinomialNB = MultinomialNB().fit(X_train, y_train)
predicted = model_MultinomialNB.predict(X_test)
accuracy_score(y_test, predicted)

ValueError: Negative values in data passed to MultinomialNB (input X)

## LinearSVC

In [87]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)

In [88]:
model = LinearSVC()
# model.fit(X_train_transformed, y)

model.fit(X_train, y_train)



In [89]:
calibrated_svc = CalibratedClassifierCV(model, cv="prefit")

In [90]:
# calibrated_svc.fit(X_train_transformed, y)
calibrated_svc.fit(X_train, y_train)

In [91]:
predicted = calibrated_svc.predict(X_test)
accuracy_score(y_test, predicted)

0.8612153038259565

## CatBoost

In [None]:
!pip install catboost

In [107]:
from catboost import CatBoostClassifier
from catboost.text_processing import Tokenizer

In [115]:
def fit_model(X_train, y_train):
    model = CatBoostClassifier(
        random_seed=random_state,
        iterations=1000,
        # learning_rate=0.05,
        eval_metric='Accuracy',
        task_type='CPU',
        verbose=True,

    )
    # model = make_pipeline(CountVectorizer(), cat)

    return model.fit(X_train, y_train)

In [None]:
model_catboost = fit_model(X_train, y_train)

In [118]:
 model_catboost.score(X_test, y_test)

0.8717179294823706

## TfidfVectorizer + MultinomialNB

In [None]:
tfidf_vect = TfidfVectorizer()
X_train_counts = tfidf_vect.fit_transform(X_train)
X_test_counts = tfidf_vect.transform(X_test)

NameError: name 'TfidfVectorizer' is not defined

In [None]:
model_MultinomialNB = MultinomialNB().fit(X_train_counts, y_train)
model_MultinomialNB

predicted_proba = model_MultinomialNB.predict_proba(X_test_counts)
predicted = model_MultinomialNB.predict(X_test_counts)

accuracy_score(y_test, predicted)

## RandomForest

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
model = RandomForestClassifier()
model = model.fit(X_train, y_train)

In [51]:
predicted = model.predict(X_test)
accuracy_score(y_test, predicted)

0.8102025506376594

## MLPClassifier

In [92]:
from sklearn.neural_network import MLPClassifier

In [124]:
model = MLPClassifier(
  solver='lbfgs', alpha=1e-5,
  hidden_layer_sizes=(100, 20, 10), random_state=random_state
)

In [95]:
model = model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [96]:
predicted = model.predict(X_test)
accuracy_score(y_test, predicted)

0.8619654913728432

## KNN

In [129]:
from sklearn.neighbors import KNeighborsClassifier

In [135]:
k_range = list(range(1,100))
weight_options = ["uniform", "distance"]
param_grid = dict(n_neighbors=k_range, weights=weight_options)

model = RandomizedSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy', verbose=1)

In [136]:
model.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [139]:
model.best_params_ # {'weights': 'uniform', 'n_neighbors': 6}

{'weights': 'uniform', 'n_neighbors': 6}


# Kaggle solving

## preprocessing

In [97]:
base_submission_news_path = BASE_PATH + 'news-topics-2024/base_submission_news.csv'
test_news_path = BASE_PATH + 'news-topics-2024/test_news.csv'
test_news_pickle_path = BASE_PATH + 'news-topics-2024/test_news.pkl'

In [61]:
base_submission_news = pd.read_csv(base_submission_news_path, sep=',', index_col='index')
# test_news = pd.read_csv(test_news_path)
test_news = pd.read_pickle(test_news_pickle_path)

In [98]:
test_news.head()

Unnamed: 0,content,content_preprocessed,vector
0,Фото: «Фонтанка.ру»ПоделитьсяЭкс-министру обор...,"[фото, фонтанка, ру, поделитьсяэкс, министр, о...","[-0.07582188, 0.0044638994, 0.09999462, 0.1718..."
1,В начале февраля 2023 года в Пушкинском районе...,"[начало, февраль, год, пушкинский, район, санк...","[-0.3711374, -0.043105647, 0.06335098, -0.2911..."
2,Фото: Andy Bao / Getty Images Анастасия Борисо...,"[фото, анастасия, борисов, международный, феде...","[-0.6973815, 0.5147773, 0.13142203, 0.25320646..."
3,"Если вы хотели, но так и не съездили на море л...","[хотеть, съездить, море, лето, читать, далеко,...","[-0.12116184, 0.12488917, -0.033703387, 0.2104..."
4,Сергей Пиняев Фото: Алексей Филиппов / РИА Нов...,"[сергей, пиняев, фото, алексей, филиппов, риа,...","[-0.3308527, 0.63478416, -0.009168504, 0.23864..."


In [100]:
# test_news['content_preprocessed'] = test_news['content'].progress_apply(preprocess_text)
# test_news['content_preprocessed'] = test_news['content_preprocessed'].progress_apply(ast.literal_eval)
# test_news['vector'] = test_news['content_preprocessed'].progress_apply(vectorize)

# test_news.to_csv(BASE_PATH + 'news-topics-2024/test_news.csv', index=False)
# test_news.to_pickle(BASE_PATH + 'news-topics-2024/test_news.pkl')

In [None]:
base_submission_news

In [None]:
base_submission_news.value_counts(normalize=True)

## preds

In [140]:
# Define current model
clf = model
clf

In [None]:
# bow_test_news = count_vect.transform(test_news.content_preprocessed)

In [141]:
X_kaggle = np.concatenate(test_news.vector).reshape(len(test_news.vector), -1) # if arr of arrays

kaggle_pred = clf.predict(X_kaggle)
# kaggle_predproba = clf.predict_proba(bow_test_news)

# kaggle_pred = clf.predict(test_news.content) # for catboost

In [142]:
kaggle_pred

array([0, 1, 4, ..., 0, 3, 3])

In [None]:
kaggle_predproba

In [None]:
kaggle_predproba[0]

In [144]:
base_submission_news['topic'] = kaggle_pred
base_submission_news

Unnamed: 0_level_0,topic
index,Unnamed: 1_level_1
0,0
1,1
2,4
3,7
4,4
...,...
26270,0
26271,0
26272,0
26273,3


In [None]:
base_submission_news['topic'].value_counts(normalize=True)

In [145]:
base_submission_news.to_csv('drive/MyDrive/Colab Notebooks/studcamp/phil_result.csv', sep=',')

# Kaggle score

**LogisticRegression**: 0.61151

**MultinomialNB**: 0.72819

**LinearSVC**: 0.63663

**CatBoost**: 0.70971

**KNeighborsClassifier**: 0.67377