In [None]:
import pandas as pd
import re
import ast
import json
from collections import Counter
from dateutil import parser
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
import xgboost as xgb
from lightgbm import LGBMClassifier
import random

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.options.mode.chained_assignment = "warn"

from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('drive/MyDrive/Colab Notebooks/news_before_processing.csv',  sep=';')

### Processing Data


In [None]:
# Basic cleanup: remove exact duplicate rows, rows whose tag list is empty ('[]'),
# and rows with any missing value.
df = df.drop_duplicates()
has_tags = df['tag'] != '[]'
df = df[has_tags]
df = df.dropna()

def get_tags(tag_string):
    """Normalise the raw ``tag`` cell of one article.

    The cell is parsed as a Python literal:

    * a one-element list yields that single element;
    * a list of any other length yields the original string unchanged
      (it is resolved later, once tag frequencies are known);
    * a quoted string (e.g. '"Болотное дело"') yields the unquoted text;
    * anything that is not a valid literal, or is a literal of another type,
      yields the original string unchanged.

    Parameters
    ----------
    tag_string : str
        Raw value of the ``tag`` column.

    Returns
    -------
    The single tag, or ``tag_string`` itself when it cannot be reduced to one tag.
    """
    try:
        parsed = ast.literal_eval(tag_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Plain tag text such as "Происшествия" is not a Python literal.
        return tag_string

    if isinstance(parsed, list):
        return parsed[0] if len(parsed) == 1 else tag_string
    if isinstance(parsed, str):
        # Covers the formerly hard-coded '"Болотное дело"' case and any other quoted tag,
        # which previously fell through and returned None.
        return parsed
    return tag_string


# Initial tag processing: run every raw 'tag' cell through get_tags
df['tags_list'] = df['tag'].apply(get_tags)

def clean_text(text):
    """Return ``text`` converted to ``str`` with leading/trailing whitespace removed.

    ``str.strip()`` already removes every leading and trailing whitespace
    character (including ``\\r`` and ``\\n``), so the extra regex pass the
    function used to run first was redundant work on every row.
    """
    return str(text).strip()

df['tags_list'] = df['tags_list'].map(clean_text)

# Count how often every individual tag occurs
tags_list = []

for cell in df['tags_list']:
    if '[' not in cell:
        # Single tag stored as plain text
        tags_list.append(cell.strip())
        continue
    # Multi-tag cell: drop the outer brackets, split on commas, remove apostrophes
    for raw_tag in cell[1:-1].split(','):
        tags_list.append(raw_tag.strip().replace("'", "").strip())

tag_counter = Counter(tags_list)

# Second tag pass: when a cell lists several tags, keep the most frequent one
def select_most_popular_tag(text, counter=None):
    """Reduce a multi-tag cell to its most frequent tag.

    Parameters
    ----------
    text : str
        Either a plain tag or a list rendered as text, e.g. "['a', 'b']".
    counter : mapping of tag -> count, optional
        Frequency table used for ranking. Defaults to the notebook-level
        ``tag_counter``.

    Returns
    -------
    str
        ``text`` unchanged when it is not a quoted list; otherwise the listed
        tag with the highest count. If none of the listed tags is present in
        the counter, the first listed tag is returned (or ``text`` itself when
        the list holds no non-empty tag) instead of raising ``ValueError``
        from ``max()`` on an empty sequence.
    """
    if counter is None:
        counter = tag_counter

    if not re.match(r"\['.*'\]$", text):
        return text

    tags = [tag.strip() for tag in re.findall(r"'([^']+)'", text)]
    valid_tags = [tag for tag in tags if counter.get(tag)]
    if not valid_tags:
        return tags[0] if tags else text
    return max(valid_tags, key=counter.get)

# Store the chosen tag and discard the intermediate helper column
most_popular = df['tags_list'].apply(select_most_popular_tag)
df = df.assign(most_popular_tag=most_popular).drop(columns=['tags_list'])

In [None]:
# Drop rows whose tag is very rare (fewer than 100 articles)
most_popular_tag_counter = Counter(df['most_popular_tag'])
tags_to_remove = []
for tag, count in most_popular_tag_counter.items():
    if count < 100:
        tags_to_remove.append(tag)
has_rare_tag = df['most_popular_tag'].isin(tags_to_remove)
df = df[~has_rare_tag]

In [None]:
# Date conversion
# Drop rows carrying the placeholder timestamp. .copy() gives an independent frame so the
# column assignments below never act on a slice of the previous frame.
df = df[df['date and time'] != '01.01.0001, 00:00'].copy()

# Russian (genitive) month names -> two-digit month numbers
month_dict = {
    'января': '01','февраля': '02','марта': '03','апреля': '04','мая': '05',
    'июня': '06','июля': '07','августа': '08','сентября': '09','октября': '10',
    'ноября': '11','декабря': '12'
}

# Assign the result back instead of calling replace(..., inplace=True) on a column
# selection: the in-place form works on an intermediate Series and is not guaranteed
# to update df (and does not under pandas copy-on-write).
df['date and time'] = df['date and time'].replace(month_dict, regex=True)

# NOTE(review): dates are presumably written day-first ("15 ноября 2006", "dd.mm.yyyy") —
# confirm against the raw data. Without dayfirst=True dateutil reads ambiguous values such
# as "05 11 2006" month-first (May 11 instead of November 5).
df['date and time'] = df['date and time'].apply(
    lambda x: parser.parse(x, fuzzy=True, dayfirst=True) if pd.notna(x) else x
)

df['date and time'] = pd.to_datetime(df['date and time'], errors='coerce')

print("Количество неправильных дат:", df['date and time'].isna().sum())

# Keep only articles dated 2001 or later
df = df[df['date and time'].dt.year >= 2001]

Количество неправильных дат: 0


In [None]:
# Clean the title and text columns
def clean_text(text):
    """Return ``text`` converted to ``str`` with leading/trailing whitespace removed.

    ``str.strip()`` already removes every leading and trailing whitespace
    character (including ``\\r`` and ``\\n``), so the extra regex pass the
    function used to run first was redundant work on every row.
    """
    return str(text).strip()

# Strip surrounding whitespace in both free-text columns
text_columns = ['title', 'text']
for col in tqdm(text_columns):
    df[col] = df[col].map(clean_text)

# Discard articles whose body is empty after cleaning
body_is_present = df['text'].str.strip().ne('')
df = df[body_is_present]

100%|██████████| 2/2 [01:50<00:00, 55.31s/it]


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1271195 entries, 0 to 1460901
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   url               1271195 non-null  object        
 1   date and time     1271195 non-null  datetime64[ns]
 2   tag               1271195 non-null  object        
 3   title             1271195 non-null  object        
 4   text              1271195 non-null  object        
 5   most_popular_tag  1271195 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 67.9+ MB


In [None]:
# Persist the cleaned dataset (semicolon-separated, no index column).
# NOTE(review): this writes to the current working directory, while the next section reads
# 'drive/MyDrive/Colab Notebooks/news_after_processing.csv' — confirm the paths are meant to differ.
df.to_csv('news_after_processing.csv', index=False, sep=';')

### Processing text and Base Modelling

In [None]:
df = pd.read_csv('drive/MyDrive/Colab Notebooks/news_after_processing.csv',  sep=';')

In [None]:
# Keep only articles tagged with one of the 50 most frequent tags, then draw a
# reproducible sample of 50,000 rows.
# The membership test uses isin() directly, so df is no longer given a temporary 'flag'
# column and the builtin filter() is no longer shadowed by an unused DataFrame.
top = df['most_popular_tag'].value_counts().head(50).index.tolist()
filter_df = df[df['most_popular_tag'].isin(top)]
df_sample = filter_df.sample(n=50000, random_state=42)

In [None]:
df = pd.read_csv('drive/MyDrive/Colab Notebooks/data_50k_preproc.csv',  sep=',')

In [None]:
df=df.drop('Unnamed: 0', axis=1)

In [None]:
df.columns

Index(['url', 'date and time', 'tag', 'title', 'text', 'most_popular_tag',
       'preprocessed_text'],
      dtype='object')

In [None]:
from pymystem3 import Mystem

mystem = Mystem()

# Lemmatize Russian text with Mystem
def lemmatize_text_russian(text):
    """Return the space-joined Mystem lemmas of ``text``.

    The value is converted to ``str``, stripped of surrounding whitespace and
    of square brackets, lower-cased and lemmatized; whitespace-only tokens
    returned by Mystem are dropped.
    """
    cleaned = re.sub(r'^[\r\n\s]+|[\r\n\s]+$', '', str(text))
    cleaned = re.sub(r'\[|\]', '', cleaned)
    lemmas = []
    for token in mystem.lemmatize(cleaned.lower()):
        if token.strip():
            lemmas.append(token)
    return " ".join(lemmas)

# Lemmatize the text of every sampled article (tqdm shows progress)
df_sample_processed_russian = [lemmatize_text_russian(text) for text in tqdm(df_sample['text'])]

100%|██████████| 50000/50000 [08:16<00:00, 100.65it/s]


In [None]:
len(df_sample_processed_russian)

50000

In [None]:
df_sample['preprocessed_text'] = df_sample_processed_russian

In [None]:
df[2:3]

Unnamed: 0,url,date and time,tag,title,text,most_popular_tag,preprocessed_text
2,https://www.kommersant.ru/doc/995727,2006-11-15 18:59:00,Происшествия,"От ""Пулково"" требуют более 32 млн рублей","Пострадавшие в результате катастрофы российского пассажирского самолета Ту-154 под Донецком подали иск в Тушинский суд Москвы к авиакомпании ""Пулково"".",Происшествия,"пострадать в результат катастрофа российский пассажирский самолет тот - 154 под донецк подавать иск в тушинский суд москва к авиакомпания "" пулково "" ."


In [None]:
# Save the lemmatized sample. The index is written too (default index=True), which is why
# the file shows an 'Unnamed: 0' column when it is read back.
# NOTE(review): written to the working directory, but read from
# 'drive/MyDrive/Colab Notebooks/data_50k_preproc.csv' elsewhere in the notebook — confirm.
df_sample.to_csv('data_50k_preproc.csv')

In [None]:
df = pd.read_csv('drive/MyDrive/Colab Notebooks/preprocessed_text_50000.csv',  sep=';')

In [None]:
df

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   processed_texts   50000 non-null  object
 1   most_popular_tag  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
import warnings

# Hide FutureWarning noise and let pandas display full cell text
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_colwidth', None)

# 80/20 train/test split with a fixed seed
split_parts = train_test_split(
    df.processed_texts,
    df.most_popular_tag,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

# Empty results table; one row is added per evaluated model
data = pd.DataFrame(columns=['Parameters', 'Accuracy', 'Precision', 'Recall', 'F1-score'])

In [None]:
# Encode the string tags as integer class ids (encoder fitted on the training labels)
label_encoder = LabelEncoder()
y_train_mlb = label_encoder.fit_transform(y_train)
y_test_mlb = label_encoder.transform(y_test)

# Baseline: TF-IDF uni+bigrams -> one-vs-rest logistic regression
pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_features=15000)),
        ('clf', LogisticRegression(max_iter = 10000, multi_class='ovr', solver='liblinear'))
    ])

# Pipeline.fit() refits the vectorizer and the classifier from scratch on every call, so
# fitting chunk by chunk left a model trained on the last chunk only. Fit once on the
# whole training split instead.
pipeline.fit(x_train, y_train_mlb)

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F

# Encode the string tags as integer class ids
label_encoder = LabelEncoder()
y_train_mlb = label_encoder.fit_transform(y_train)

# NOTE(review): this rebinds x_train / y_train to the inner training split, and y_train now
# holds encoded integers — the cell is not idempotent and later cells that encode y_train
# again will be fitting on integers. Confirm this is intended.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train_mlb, test_size=0.2, random_state=42)

# NOTE(review): 'bert-base-uncased' looks like an English checkpoint while the texts in this
# notebook are Russian — confirm the model choice.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

ValueError: module functions cannot set METH_CLASS or METH_STATIC

In [None]:
# Encode the string tags as integer class ids (encoder fitted on the training labels)
label_encoder = LabelEncoder()
y_train_mlb = label_encoder.fit_transform(y_train)
y_test_mlb = label_encoder.transform(y_test)

# Baseline: TF-IDF uni+bigrams -> one-vs-rest logistic regression
pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_features=15000)),
        ('clf', LogisticRegression(max_iter = 10000, multi_class='ovr', solver='liblinear'))
    ])

# Pipeline.fit() refits the vectorizer and the classifier from scratch on every call, so
# fitting chunk by chunk left a model trained on the last chunk only. Fit once on the
# whole training split instead.
pipeline.fit(x_train, y_train_mlb)

 33%|███▎      | 1/3 [00:22<00:45, 22.86s/it]


KeyboardInterrupt: 

In [None]:
# joblib is otherwise imported only in a later cell; import it here so this cell also
# runs on a fresh kernel executed top to bottom.
import joblib

joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [None]:
pipeline

In [None]:
import joblib

joblib.dump(pipeline, 'base_logreg_model.pkl')

['base_logreg_model.pkl']

In [None]:
from pymystem3 import Mystem

mystem = Mystem()

# Lemmatize Russian text with Mystem
def lemmatize_text_russian(text):
    """Return the space-joined Mystem lemmas of the lower-cased ``text``.

    Whitespace-only tokens returned by Mystem are dropped.
    """
    kept = []
    for token in mystem.lemmatize(text.lower()):
        if token.strip():
            kept.append(token)
    return " ".join(kept)

# Lemmatize the train and test texts with lemmatize_text_russian
x_train_processed_russian = [lemmatize_text_russian(text) for text in x_train]
x_test_processed_russian = [lemmatize_text_russian(text) for text in x_test]

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [None]:
# Fit the label encoder on the training tags, then encode both splits with it
label_encoder = LabelEncoder().fit(y_train)
y_train_mlb = label_encoder.transform(y_train)
y_test_mlb = label_encoder.transform(y_test)

def pipline_mod(vect, clasif, data):
    """Train a vectorizer + classifier pipeline and add its test metrics to ``data``.

    Reads the notebook-level ``x_train_processed_russian``, ``x_test_processed_russian``,
    ``y_train_mlb`` and ``y_test_mlb``.

    Parameters
    ----------
    vect : text vectorizer used as the 'vectorizer' pipeline step.
    clasif : classifier used as the 'clf' pipeline step.
    data : pandas.DataFrame
        Results table with columns Parameters / Accuracy / Precision / Recall / F1-score.

    Returns
    -------
    pandas.DataFrame
        A new table: ``data`` plus one row for this run (index reset).
    """
    pipeline = Pipeline([
        ('vectorizer', vect),
        ('clf', clasif)
    ])

    # Pipeline.fit() refits every step from scratch on each call, so the former
    # chunk-by-chunk loop produced a model trained on the last chunk only.
    # Fit once on the full training set.
    pipeline.fit(x_train_processed_russian, y_train_mlb)

    predicted_labels = pipeline.predict(x_test_processed_russian)
    accuracy = accuracy_score(y_test_mlb, predicted_labels)
    precision = precision_score(y_test_mlb, predicted_labels, average='weighted')
    recall = recall_score(y_test_mlb, predicted_labels, average='weighted')
    f1 = f1_score(y_test_mlb, predicted_labels, average='weighted')

    params = str(pipeline.named_steps['clf']) + ' ' + str(pipeline.named_steps['clf'].get_params()) + ' ' + str(pipeline.named_steps['vectorizer'].get_params())

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    row = pd.DataFrame([{'Parameters': params, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-score': f1}])
    return pd.concat([data, row], ignore_index=True)

In [None]:
# SGD classifiers (logistic and squared-error loss) on TF-IDF uni+bigrams
vect = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)
clfs = [
    SGDClassifier(max_iter=100000, loss=loss_name, random_state=42)
    for loss_name in ('log_loss', 'squared_error')
]
for clf in clfs:
    data = pipline_mod(vect, clf, data)

100%|██████████| 3/3 [00:58<00:00, 19.60s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [01:09<00:00, 23.02s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Random forest on TF-IDF unigrams
vect = TfidfVectorizer(max_features=15000, ngram_range=(1, 1))
data = pipline_mod(
    vect,
    RandomForestClassifier(random_state=42, n_estimators=100),
    data,
)

100%|██████████| 3/3 [02:27<00:00, 49.11s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# SVC with default settings on TF-IDF unigrams
vect = TfidfVectorizer(max_features=15000, ngram_range=(1, 1))
data = pipline_mod(
    vect,
    SVC(random_state=42),
    data,
)

100%|██████████| 3/3 [32:22<00:00, 647.48s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Linear models on TF-IDF unigrams: two SGD losses, then liblinear logistic regression
# with two iteration limits
vect = TfidfVectorizer(ngram_range=(1, 1), max_features=15000)
clfs = [
    SGDClassifier(max_iter=100000, loss=loss_name, random_state=42)
    for loss_name in ('log_loss', 'squared_error')
]
clfs += [
    LogisticRegression(max_iter=iteration_limit, multi_class='ovr', solver='liblinear')
    for iteration_limit in (100000, 10000)
]
for clf in clfs:
    data = pipline_mod(vect, clf, data)

100%|██████████| 3/3 [00:30<00:00, 10.19s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [00:44<00:00, 14.87s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [01:19<00:00, 26.34s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [01:20<00:00, 26.80s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# SVC on TF-IDF unigrams.
# NOTE(review): same vectorizer and classifier settings as an earlier SVC cell in this
# notebook, so running it adds a second row for the same configuration — confirm it is needed.
vect = TfidfVectorizer(ngram_range=(1, 1), max_features=15000)
data = pipline_mod(vect, SVC(random_state=42), data)

In [None]:
# XGBoost and liblinear logistic regression on TF-IDF uni+bigrams
vect = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)
clfs = []
clfs.append(xgb.XGBClassifier(random_state=42))
clfs.append(LogisticRegression(max_iter = 10000, multi_class='ovr', solver='liblinear'))
for clf in clfs:
    data = pipline_mod(vect, clf, data)

100%|██████████| 3/3 [1:00:57<00:00, 1219.01s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [01:42<00:00, 34.12s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# SVC with three TF-IDF configurations: (ngram_range, max_features)
vectorizer_settings = [((1, 1), 25000), ((1, 2), 15000), ((2, 2), 15000)]
vects = [
    TfidfVectorizer(ngram_range=ngrams, max_features=feature_limit)
    for ngrams, feature_limit in vectorizer_settings
]

for vect in vects:
    data = pipline_mod(vect, SVC(random_state=42), data)

100%|██████████| 3/3 [34:10<00:00, 683.55s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [35:15<00:00, 705.15s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [14:09<00:00, 283.26s/it]
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# LightGBM on TF-IDF uni+bigrams
vect = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)
clfs = [LGBMClassifier(random_state=42)]
clf = clfs[0]
data = pipline_mod(vect, clf, data)

In [None]:
data

Unnamed: 0,Parameters,Accuracy,Precision,Recall,F1-score
0,"RandomForestClassifier(random_state=42) {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.5141,0.565953,0.5141,0.454871
1,"SVC(random_state=42) {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.4448,0.507645,0.4448,0.388758
2,"RandomForestClassifier(random_state=42) {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.5816,0.609529,0.5816,0.537953
3,"SGDClassifier(loss='log_loss', max_iter=100000, random_state=42) {'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'log_loss', 'max_iter': 100000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.608,0.604146,0.608,0.557627
4,"SGDClassifier(loss='squared_error', max_iter=100000, random_state=42) {'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'squared_error', 'max_iter': 100000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.6349,0.600102,0.6349,0.595787
5,"SGDClassifier(loss='log_loss', max_iter=100000, random_state=42) {'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'log_loss', 'max_iter': 100000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.6057,0.583553,0.6057,0.553933
6,"SGDClassifier(loss='squared_error', max_iter=100000, random_state=42) {'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'squared_error', 'max_iter': 100000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.6231,0.590221,0.6231,0.58514
7,"LogisticRegression(max_iter=100000, multi_class='ovr', solver='liblinear') {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100000, 'multi_class': 'ovr', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.6181,0.61088,0.6181,0.569433
8,"LogisticRegression(max_iter=10000, multi_class='ovr', solver='liblinear') {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'ovr', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.6181,0.61088,0.6181,0.569433
9,"SVC(random_state=42) {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False} {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': 15000, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}",0.6186,0.611694,0.6186,0.570238


In [None]:
data.to_excel('data.xlsx')

## Others

In [None]:
# SGDClassifier on fastText features
label_encoder = LabelEncoder()
y_train_mlb = label_encoder.fit_transform(y_train)
y_test_mlb = label_encoder.transform(y_test)

# NOTE(review): FastTextVectorizer is not defined or imported anywhere in the visible
# notebook — confirm where it comes from before running this cell.
pipeline = Pipeline([
    ('fasttext', FastTextVectorizer()),
    # 'log_loss' is the logistic-loss name used by the other SGD experiments in this
    # notebook; the old alias 'log' is no longer accepted by recent scikit-learn.
    ('clf', SGDClassifier(max_iter=1000, loss='log_loss'))
])

# Pipeline.fit() refits from scratch on every call, so fitting on successive chunks kept
# only the last chunk's model. Fit once on the full training split.
pipeline.fit(np.array(x_train), y_train_mlb)

predicted_labels = pipeline.predict(x_test)
accuracy = accuracy_score(y_test_mlb, predicted_labels)
precision = precision_score(y_test_mlb, predicted_labels, average='weighted')
recall = recall_score(y_test_mlb, predicted_labels, average='weighted')
f1 = f1_score(y_test_mlb, predicted_labels, average='weighted')

# The vectorizer step is registered as 'fasttext'; looking up 'tfidf' raised KeyError.
params = str(pipeline.named_steps['fasttext'].get_params()) + str(pipeline.named_steps['clf'].get_params())

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat(
    [data, pd.DataFrame([{'Parameters': params, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-score': f1}])],
    ignore_index=True,
)

In [None]:
# Alternative text preprocessing
from functools import lru_cache


@lru_cache(maxsize=1)
def _russian_stop_words():
    """Load the NLTK Russian stop-word list once and cache it for later calls."""
    return frozenset(stopwords.words('russian'))


def preprocess_text(text):
    """Lower-case ``text``, keep only Latin/Cyrillic letters and whitespace,
    tokenize, drop Russian stop words, lemmatize and re-join with spaces.

    The stop-word list is loaded once (cached) instead of on every call.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ\s]', '', text)
    stop_words = _russian_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Lemmatization
    # NOTE(review): WordNetLemmatizer is presumably English-only, so Russian tokens likely
    # pass through unchanged — confirm, or use the Mystem lemmatizer used elsewhere here.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)


df['preprocessed_text'] = df['text'].apply(preprocess_text)

## DL


In [None]:
df = pd.read_csv('drive/MyDrive/Colab Notebooks/preprocessed_text_50000.csv',  sep=';')

In [None]:
import warnings

# Suppress FutureWarning messages; never truncate text when showing frames
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_colwidth', None)

# Hold out 20% of the rows as the test set (seeded for reproducibility)
texts, tags = df.processed_texts, df.most_popular_tag
x_train, x_test, y_train, y_test = train_test_split(texts, tags, test_size=0.2, random_state=42)

# Results table, initially empty
metric_columns = ['Parameters', 'Accuracy', 'Precision', 'Recall', 'F1-score']
data = pd.DataFrame(columns=metric_columns)

In [None]:
# !pip install fasttext

import fasttext
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import os

In [None]:
# Encode the string tags as integer class ids
label_encoder = LabelEncoder()
y_train_mlb = label_encoder.fit_transform(y_train)

# Carve a validation set out of the training split.
# NOTE(review): x_train / y_train are rebound here and y_train becomes the encoded integers,
# so re-running this cell would fit the encoder on integers — confirm this is acceptable.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train_mlb, test_size=0.2, random_state=42)

def prepare_fasttext_data(x, y, file_path):
    """Write (text, label) pairs to ``file_path`` in fastText supervised format.

    Each pair becomes one line: ``__label__<label> <text>``.

    Parameters
    ----------
    x : iterable of texts.
    y : iterable of labels, aligned with ``x``.
    file_path : str
        Destination file; overwritten if it exists.
    """
    # Explicit UTF-8 so Cyrillic text is written identically on every platform.
    with open(file_path, 'w', encoding='utf-8') as f:
        for text, label in zip(x, y):
            # fastText expects one example per line: fold embedded line breaks into spaces.
            single_line = ' '.join(str(text).splitlines())
            f.write(f"__label__{label} {single_line}\n")

prepare_fasttext_data(x_train, y_train, 'train.txt')
prepare_fasttext_data(x_val, y_val, 'val.txt')

In [None]:
def evaluate_fasttext(model, x, y):
    """Predict a label for every text in ``x`` and score against ``y``.

    The top predicted label of each text has its ``__label__`` prefix removed
    and is converted to ``int``.

    Returns
    -------
    tuple
        (accuracy, weighted F1 score)
    """
    y_pred = [
        int(model.predict(text)[0][0].replace("__label__", ""))
        for text in x
    ]
    return accuracy_score(y, y_pred), f1_score(y, y_pred, average='weighted')

num_epochs = 10

# Train a fresh model for each epoch budget 1..num_epochs and report validation metrics;
# the model from the last iteration stays bound to `model`.
for epoch in range(num_epochs):
    train_options = dict(input='train.txt', epoch=epoch+1, lr=1.0, wordNgrams=2, verbose=2, minCount=1)
    model = fasttext.train_supervised(**train_options)

    validation_scores = evaluate_fasttext(model, x_val, y_val)
    val_accuracy, val_f1 = validation_scores
    print(f"Epoch: {epoch+1}, Validation Accuracy: {val_accuracy}, Validation F1 Score: {val_f1}")

Epoch: 1, Validation Accuracy: 0.5785, Validation F1 Score: 0.5153227861303956
Epoch: 2, Validation Accuracy: 0.67475, Validation F1 Score: 0.6523073784227519
Epoch: 3, Validation Accuracy: 0.69375, Validation F1 Score: 0.6783723746787979
Epoch: 4, Validation Accuracy: 0.695875, Validation F1 Score: 0.6837017569681205
Epoch: 5, Validation Accuracy: 0.696375, Validation F1 Score: 0.6866987780013901
Epoch: 6, Validation Accuracy: 0.697125, Validation F1 Score: 0.689524265457651
Epoch: 7, Validation Accuracy: 0.696375, Validation F1 Score: 0.6899482694895099
Epoch: 8, Validation Accuracy: 0.6955, Validation F1 Score: 0.6898448313304136
Epoch: 9, Validation Accuracy: 0.696125, Validation F1 Score: 0.690593486374898
Epoch: 10, Validation Accuracy: 0.696, Validation F1 Score: 0.6905138847266203


In [None]:
# Encode the held-out tags, dump the test split in fastText format and score the model on it
y_test_mlb = label_encoder.transform(y_test)
prepare_fasttext_data(x_test, y_test_mlb, 'test.txt')
test_scores = evaluate_fasttext(model, x_test, y_test_mlb)
test_accuracy, test_f1 = test_scores
for message in (f"Final Test Accuracy: {test_accuracy}", f"Final Test F1 Score: {test_f1}"):
    print(message)

Final Test Accuracy: 0.6994
Final Test F1 Score: 0.6935512125268926


In [None]:
model.save_model("fasttext_model.bin")

In [None]:
import joblib

joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']