# Загрузка и работа с данными

In [1]:
# Seed passed as random_state to both stratified splits in split_data().
RANDOM_STATE = 1337

In [6]:
import pandas as pd
import spacy
import re
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
    
class TextProcessing:
    """Lemmatize texts with a spaCy pipeline, one batch per thread-pool task.

    Attributes:
        batch_size: Number of texts per submitted task; also forwarded to
            ``nlp.pipe`` as its ``batch_size``.
        n_threads: ``max_workers`` of the thread pool used by ``process_texts``.
        nlp: The loaded spaCy pipeline, or ``None`` when ``model_init`` is false.
    """

    def __init__(self, batch_size=100, n_threads=8, model_init=True):
        """Store the batching settings and optionally load ``ru_core_news_sm``.

        Args:
            batch_size: Texts per batch.
            n_threads: Worker threads for ``process_texts``.
            model_init: When false no model is loaded and ``nlp`` is left as
                ``None`` (TextProcessingUpgrade uses this to install its own).
        """
        self.batch_size = batch_size
        self.n_threads = n_threads
        self.nlp = (
            spacy.load("ru_core_news_sm", disable=["parser", "ner"])
            if model_init
            else None
        )

    def _clean_text(self, text):
        """Replace every character that is neither a word character nor whitespace with a space."""
        # Only punctuation/symbols are blanked here: digits and underscores are
        # word characters for the regex and survive this step. Non-alphabetic
        # tokens are dropped later by the is_alpha filter in _process_batch.
        return re.sub(r'[^\w\s]', ' ', text)

    def _process_batch(self, texts):
        """Clean the texts, run them through the pipeline and return one lemma string per text.

        Stop words, punctuation and non-alphabetic tokens are skipped; the
        remaining lemmas are joined with single spaces.
        """
        prepared = [self._clean_text(raw) for raw in texts]
        return [
            ' '.join(
                token.lemma_
                for token in doc
                if not (token.is_stop or token.is_punct) and token.is_alpha
            )
            for doc in self.nlp.pipe(prepared, batch_size=self.batch_size)
        ]

    def process_texts(self, text_series: pd.Series) -> pd.Series:
        """Process a whole Series in batches and return the results under the same index.

        Args:
            text_series: Series of raw texts.

        Returns:
            A Series of processed strings aligned with ``text_series.index``.
        """
        all_texts = text_series.tolist()
        n_texts = len(all_texts)
        step = self.batch_size

        # Pre-sized so that batches finishing out of order land in their own slots.
        output = [None] * n_texts

        def run_chunk(lo, hi):
            # Return the offset with the result so the consumer knows where it belongs.
            return lo, self._process_batch(all_texts[lo:hi])

        with ThreadPoolExecutor(max_workers=self.n_threads) as pool:
            pending = [
                pool.submit(run_chunk, lo, min(lo + step, n_texts))
                for lo in range(0, n_texts, step)
            ]

            with tqdm(total=len(pending), desc="Processing batches", leave=True) as bar:
                for finished in as_completed(pending):
                    lo, chunk = finished.result()
                    output[lo:lo + len(chunk)] = chunk
                    bar.update(1)

        return pd.Series(output, index=text_series.index)
    

class TextProcessingUpgrade(TextProcessing):
    """TextProcessing variant using ``ru_core_news_lg`` and a Cyrillic-only cleaner.

    The parent constructor is called with ``model_init=False`` so that only
    the large model is loaded; batching and threading are inherited unchanged.
    """

    def __init__(self, batch_size=100, n_threads=8):
        """Report whether spaCy could switch to the GPU, then load ``ru_core_news_lg``.

        Args:
            batch_size: Texts per batch (forwarded to the parent).
            n_threads: Worker threads (forwarded to the parent).
        """
        super().__init__(batch_size, n_threads, model_init=False)
        if spacy.prefer_gpu():
            print("✅ Используется GPU")
        else:
            print("⚠️ GPU не доступен, используется CPU")
        self.nlp = spacy.load("ru_core_news_lg", disable=["parser", "ner"])
        # self.nlp.enable_pipe("senter")
        # self.nlp.select_pipes(enable=["tok2vec", "tagger", "lemmatizer"])

    def _clean_text(self, text):
        """Lower-case the text and keep only Cyrillic words longer than one letter.

        Args:
            text: Raw text.

        Returns:
            The cleaned text with words separated by single spaces and no
            leading or trailing whitespace.
        """
        text = text.lower()

        # Split hyphenated compounds into their parts.
        text = re.sub(r'(?<=[а-яё])(-)(?=[а-яё])', ' ', text)

        # Blank out everything except lower-case Cyrillic letters and whitespace
        # (this also removes digits and Latin letters).
        text = re.sub(r'[^а-яё\s]', ' ', text)

        # Drop one-letter words. Lookarounds are used instead of matching the
        # surrounding whitespace: a whitespace-consuming pattern misses single
        # letters at the start/end of the string and every second letter in a
        # run such as "а б в", because neighbouring matches would share a space.
        text = re.sub(r'(?<![а-яё])[а-яё](?![а-яё])', ' ', text)

        # Collapse whitespace runs and trim the ends.
        text = re.sub(r'\s+', ' ', text).strip()

        return text
    
    
class TextProcessingRusvect(TextProcessingUpgrade):
    """Produce ``word_POS`` tokens from texts using the inherited spaCy pipeline.

    Unlike the parent classes, ``_process_batch`` feeds the texts to the
    pipeline as they are (``_clean_text`` is not applied) and emits the
    token's surface form, not its lemma.
    NOTE(review): if the target vocabulary is keyed by lemma + POS suffix,
    ``token.lemma_`` may have been intended here — confirm against the model.
    """

    # POS tag reported by the pipeline -> suffix appended to the token.
    # Tags missing from this table fall back to '_UNKN' in _process_batch.
    __pos_suffix_map = {
        'ADJ': '_ADJ',
        'NOUN': '_NOUN',
        'VERB': '_VERB',
        'NUM': '_NUM',
        'ADV': '_ADV',
        'PROPN': '_NOUN',
        'PRON': '_NOUN',
        'ADP': '_ADP',
        'CCONJ': '_CONJ',
        'PART': '_PART',
        'INTJ': '_INTJ',
    }

    def __init__(self, batch_size=100, n_threads=8):
        """Forward the batching settings to TextProcessingUpgrade."""
        super().__init__(batch_size, n_threads)

    def _process_batch(self, texts):
        """Return one space-joined string of ``text + POS suffix`` tokens per input text.

        Stop words, punctuation and non-alphabetic tokens are skipped.
        """
        suffix_for = self.__pos_suffix_map.get
        tagged_texts = []

        # The whole batch goes through the pipeline in a single pipe() call.
        for doc in self.nlp.pipe(texts):
            tagged_words = (
                f"{token.text}{suffix_for(token.pos_, '_UNKN')}"
                for token in doc
                if not (token.is_stop or token.is_punct) and token.is_alpha
            )
            tagged_texts.append(' '.join(tagged_words))

        return tagged_texts
    
    
from typing import Optional
import pandas as pd
import random
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api

class VectorizerModel:
    """
    Wrap a Word2Vec model and turn texts into averaged word vectors.

    Instances are normally built through one of the alternate constructors:
    ``create`` (train and save), ``load`` (read a saved model),
    ``load_navec`` and ``load_rusvectores`` (import pre-trained vectors).
    """
    # Training parameters used by create() when the caller supplies none.
    __params = {
        "vector_size": 300,
        "window": 5,
        "min_count": 5,
        "workers": 6,
        "sg": 1,
        "epochs": 10,
    }

    def __init__(self, model):
        """Keep a reference to an already built model.

        Args:
            model: Object exposing ``wv`` (word -> vector lookup supporting
                ``in``) and ``vector_size``.
        """
        self.model: Word2Vec = model

    @classmethod
    def create(cls, file_name, df: pd.DataFrame, w2v_params: Optional[dict] = None):
        """Train a Word2Vec model on ``df['processed_full_text']`` and save it.

        Args:
            file_name: Base name; the model is written to ``<file_name>.wordvectors``.
            df: Frame whose ``processed_full_text`` column holds
                whitespace-separated tokens.
            w2v_params: Keyword arguments for ``Word2Vec``. When omitted or
                ``None`` the class defaults are used. (The default used to be
                the typing object ``Optional[dict]`` itself, so omitting the
                argument crashed instead of falling back to the defaults.)

        Returns:
            A new instance wrapping the trained model.
        """
        params = cls.__params if w2v_params is None else w2v_params
        sentences = [sentence.split() for sentence in df['processed_full_text'].tolist()]
        new_model = Word2Vec(sentences, **params)
        new_model.save(f"{file_name}.wordvectors")
        return cls(new_model)

    @classmethod
    def load(cls, file_name):
        """Load the model previously saved as ``<file_name>.wordvectors``."""
        return cls(Word2Vec.load(f"{file_name}.wordvectors"))

    def vectorize(self, text):
        """Return the mean vector of the words of ``text`` known to the model.

        Args:
            text: Whitespace-separated tokens.

        Returns:
            The element-wise mean of the known words' vectors, or a zero
            vector of length ``model.vector_size`` when no word is known.
        """
        words = text.split()
        vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        if len(vectors) == 0:
            return np.zeros(self.model.vector_size)
        return np.mean(vectors, axis=0)

    def transform(self, series: pd.Series):
        """Vectorize every text of ``series`` and stack the results into one array."""
        return np.array(series.apply(self.vectorize).tolist())

    @classmethod
    def load_navec(cls, navec_model):
        """Copy the vectors of a navec model into a fresh Word2Vec container.

        Args:
            navec_model: Object providing ``pq.dim``, ``pq.unpack()`` and
                ``vocab.words``.
        """
        new_model = Word2Vec(vector_size=navec_model.pq.dim)
        weights = navec_model.pq.unpack()
        new_model.wv.add_vectors(
            list(navec_model.vocab.words),
            list(weights)
        )
        return cls(new_model)

    @classmethod
    def load_rusvectores(cls, model_path: str, binary: bool):
        """Read vectors stored in word2vec format and wrap them.

        Args:
            model_path: Path to the vectors file.
            binary: Passed to ``KeyedVectors.load_word2vec_format`` to select
                the binary or the text variant of the format.
        """
        kv = KeyedVectors.load_word2vec_format(model_path, binary=binary)
        new_model = Word2Vec(vector_size=kv.vector_size)
        new_model.wv.add_vectors(kv.index_to_key, kv.vectors)
        return cls(new_model)
    
    
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


class TestVectorizer:
    """Score a vectorizer by the accuracy of a logistic regression trained on its vectors.

    On construction the frame is split with ``split_data`` and the
    ``processed_full_text`` column of every part is vectorized once. Calling
    the instance fits the classifier on the training part and prints the
    accuracy on the test and validation parts.
    """

    def __init__(self, vectorizer: VectorizerModel, df: pd.DataFrame):
        """Create the classifier and prepare the features and labels from ``df``.

        Args:
            vectorizer: Object whose ``transform`` maps a text Series to a feature array.
            df: Frame with ``processed_full_text`` and ``topic`` columns.
        """
        self.classifier = LogisticRegression(n_jobs=-1, max_iter=10000,)
        self.vectorizer = vectorizer
        # Placeholders only; set_data() below fills all six.
        self.X_train_vectors = self.X_val_vectors = self.X_test_vectors = None
        self.y_train = self.y_val = self.y_test = None
        self.set_data(df)

    @staticmethod
    def _labels(frame):
        """Return the ``topic`` column of ``frame`` as a NumPy array."""
        return np.array(frame["topic"].tolist())

    def set_data(self, df):
        """Split ``df`` and store the vectorized texts and topic labels of each part."""
        train_df, val_df, test_df = split_data(df)
        embed = self.vectorizer.transform

        self.X_train_vectors = embed(train_df['processed_full_text'])
        self.X_val_vectors = embed(val_df['processed_full_text'])
        self.X_test_vectors = embed(test_df['processed_full_text'])

        self.y_train = self._labels(train_df)
        self.y_val = self._labels(val_df)
        self.y_test = self._labels(test_df)

    def __call__(self, *args, **kwargs):
        """Fit on the training part, then print test and validation accuracy."""
        model = self.classifier
        model.fit(self.X_train_vectors, self.y_train)

        acc_test = accuracy_score(self.y_test, model.predict(self.X_test_vectors))
        print(f"Accuracy на тесте: {acc_test:.4f}")

        # Same measurement on the validation part.
        acc_val = accuracy_score(self.y_val, model.predict(self.X_val_vectors))
        print(f"Accuracy на валидации: {acc_val:.4f}")
        
from sklearn.model_selection import train_test_split

def split_data(df: pd.DataFrame):
    """Drop rare topics and split the remainder 60/20/20, stratified by topic.

    Args:
        df: Frame with a ``topic`` column.

    Returns:
        ``(train_df, val_df, test_df)``. Topics with fewer than 5 rows are
        excluded from all three parts.
    """
    min_samples = 5

    def stratified_cut(frame, share):
        # One stratified split with the module-wide seed; ``share`` goes to the second part.
        return train_test_split(
            frame, test_size=share, stratify=frame['topic'], random_state=RANDOM_STATE
        )

    per_topic = df['topic'].value_counts()
    frequent = per_topic[per_topic >= min_samples]
    kept = df[df['topic'].isin(frequent.index)]

    # 60% train; the remaining 40% is halved into validation and test.
    train_df, holdout_df = stratified_cut(kept, 0.4)
    val_df, test_df = stratified_cut(holdout_df, 0.5)
    return train_df, val_df, test_df
      
# train_df, val_df, test_df = split_data(split_df)  
        
    
    

In [7]:
import os.path
import pandas as pd
from corus import load_lenta
from sklearn.model_selection import train_test_split


# Load the two preprocessed frames: look in the parent directory first, then
# in the current working directory. Both files must be present in the same place.
if os.path.exists("../processed_balanced_text.csv") and os.path.exists("../processed_split_df.csv"):
    balanced_df = pd.read_csv("../processed_balanced_text.csv", encoding='utf-8')
    split_df = pd.read_csv("../processed_split_df.csv", encoding='utf-8')
elif os.path.exists("processed_balanced_text.csv") and os.path.exists("processed_split_df.csv"): 
    balanced_df = pd.read_csv("processed_balanced_text.csv", encoding='utf-8')
    split_df = pd.read_csv("processed_split_df.csv", encoding='utf-8')
else:
    # A bare string cannot be raised in Python 3 (it fails with an unrelated
    # TypeError), so raise a real exception carrying the same message.
    raise FileNotFoundError("Фреймов нет :(")
 
    
    

In [46]:
# Display the loaded split_df frame.
split_df

Unnamed: 0,topic,processed_full_text
0,Россия,премьер министр владимир путин открыть первое ...
1,Спорт,вица премьер виталий мутко добровольно покинут...
2,Россия,россия подвести предварительный итог праймериз...
3,Спорт,семикратный чемпион формула михаэль шумахер сп...
4,Интернет и СМИ,среда вечером состояться заседание совет дирек...
...,...,...
99995,Бизнес,санкт петербург июнь состояться второй российс...
99996,Ценности,американский актриса предприниматель джессика ...
99997,Интернет и СМИ,сервис микроблог снять ограничение количество ...
99998,Экономика,правительство израиль компания нашли способ до...


In [3]:
import pandas as pd
# Read both preprocessed frames from the parent directory, in the same order as before.
balanced_df, split_df = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ("../processed_balanced_text.csv", "../processed_split_df.csv")
)

# df_balanced_cleaned = balanced_df.loc[~balanced_df['topic'].isin(['EMPTY', 'Библиотека', "Легпром"])].copy()
# df_split_cleaned = split_df.loc[~split_df['topic'].isin(['EMPTY', 'Библиотека', "Легпром"])].copy()


In [None]:
# Log output, enable when needed :D
import logging

# INFO level with "LEVEL - HH:MM:SS: message" lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

Теперь приступим к подбору параметров:

`sample` Этот параметр задаёт порог частоты, выше которого слова подвергаются downsampling (то есть случайным образом удаляются из контекста с некоторой вероятностью). Чем меньше значение sample, тем агрессивнее downsampling частых слов. Я буду использовать следующие значения.
* `1e-5` (0.00001) - самый агрессивный downsampling из трёх. Подходит для очень больших датасетов, где частые слова являются шумом.
* `6e-5` (0.00006) - промежуточный вариант. Подходит для датасетов с большим количеством стоп-слов.
* `1e-3` (0.001) - значение по умолчанию в gensim. Умеренный downsampling. Подходит для датасетов, где частые слова несут смысловую нагрузку.

`window` Параметр window определяет, сколько слов вокруг целевого слова учитывается как контекст.
* Если выбрать слишком маленькое значение (например, 2), модель может не улавливать широкие семантические связи.

* Если выбрать слишком большое значение (например, 10), модель может начать учитывать слишком далекие слова, что ухудшит качество.

`vector_size` Этот параметр определяет размерность вектора слова.
* Размерность 300 — это стандартный выбор для большинства задач. Вектора такого размера достаточно, чтобы улавливать семантические связи между словами.

`negative` Параметр определяет количество негативных примеров для негативного сэмплирования. Негативное сэмплирование ускоряет обучение и улучшает качество модели, особенно для больших датасетов.
* Значение 10 — это хороший баланс между качеством и скоростью обучения.
* Если выбрать слишком маленькое значение (например, 5), модель может хуже улавливать семантические связи.
* Если выбрать слишком большое значение (например, 20), это увеличит вычислительную сложность.

`alpha` и `min_alpha`
* alpha — это начальная скорость обучения.
* min_alpha — это минимальная скорость обучения.
* Значение `0.025` для `alpha` — это стандартный выбор для Word2Vec. Оно позволяет модели быстро обучаться на начальных этапах.
* Значение `0.0001` для `min_alpha` гарантирует, что скорость обучения не станет слишком маленькой, что может замедлить обучение.

`sg` Параметр sg определяет, какой алгоритм использовать: Skip-Gram (`1`) или CBOW (`0`).
* Skip-Gram лучше работает на больших датасетах и лучше учитывает редкие слова.
* CBOW быстрее, но хуже работает с редкими словами.
* Для новостных данных, где важно учитывать редкие термины, Skip-Gram предпочтительнее.



In [89]:
# Parameters of the first training run: narrow context window, 20 epochs.
w2v_params = dict(
    min_count=10,
    window=2,
    vector_size=300,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    sample=6e-5,
    sg=1,
    workers=6,
    epochs=20,
)
# Trains on split_df and saves the model as "1.wordvectors".
VectorizerModel.create("1", split_df, w2v_params)

INFO - 15:57:51: collecting all words and their counts
INFO - 15:57:51: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:57:52: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 15:57:52: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 15:57:52: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 15:57:52: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 15:57:52: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 15:57:52: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 15:57:52: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 15:57:53: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 15:57:53: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

In [96]:
# w2v_params = {
#     'min_count': 10,
#     'window': 6,
#     'vector_size': 300,
#     'negative': 10,
#     'alpha': 0.03,
#     'min_alpha': 0.0007,
#     'sample': 6e-5,
#     'sg': 1,
#     'workers': 6,
#     'epochs': 20
# }
# VectorizerModel.create("2", split_df, w2v_params)

INFO - 16:05:50: collecting all words and their counts
INFO - 16:05:50: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:05:50: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 16:05:50: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 16:05:50: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 16:05:51: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 16:05:51: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 16:05:51: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 16:05:51: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 16:05:51: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 16:05:51: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

In [105]:
# w2v_params = {
#     'min_count': 10,
#     'window': 10,
#     'vector_size': 300,
#     'negative': 10,
#     'alpha': 0.03,
#     'min_alpha': 0.0007,
#     'sample': 6e-5,
#     'sg': 1,
#     'workers': 6,
#     'epochs': 20
# }
# VectorizerModel.create("2_w_10_e_20", split_df, w2v_params)

INFO - 16:38:58: collecting all words and their counts
INFO - 16:38:58: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:38:58: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 16:38:58: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 16:38:58: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 16:38:58: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 16:38:58: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 16:38:58: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 16:38:59: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 16:38:59: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 16:38:59: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f4e8197bb20>

In [106]:
# w2v_params = {
#     'min_count': 10,
#     'window': 10,
#     'vector_size': 300,
#     'negative': 10,
#     'alpha': 0.03,
#     'min_alpha': 0.0007,
#     'sample': 6e-5,
#     'sg': 1,
#     'workers': 6,
#     'epochs': 10
# }
# VectorizerModel.create("2_w_10_e_10", split_df, w2v_params)

INFO - 16:52:49: collecting all words and their counts
INFO - 16:52:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:52:49: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 16:52:49: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 16:52:49: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 16:52:50: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 16:52:50: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 16:52:50: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 16:52:50: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 16:52:50: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 16:52:50: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f4ec8d4a1a0>

In [99]:
# w2v_params = {
#     'min_count': 10,
#     'window': 6,
#     'vector_size': 300,
#     'negative': 10,
#     'alpha': 0.03,
#     'min_alpha': 0.0007,
#     'sample': 6e-5,
#     'sg': 1,
#     'workers': 6,
#     'epochs': 10
# }
# VectorizerModel.create("2_w_6_e_10", split_df, w2v_params)

INFO - 16:16:28: collecting all words and their counts
INFO - 16:16:28: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:16:28: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 16:16:28: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 16:16:28: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 16:16:28: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 16:16:29: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 16:16:29: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 16:16:29: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 16:16:29: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 16:16:29: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f4eabe56050>

In [110]:
# w2v_params = {
#     'min_count': 10,          
#     'window': 5,              
#     'vector_size': 300,       
#     'negative': 10,           
#     'alpha': 0.025,           
#     'min_alpha': 0.0001,      
#     'sample': 1e-3,           
#     'sg': 1,                  
#     'workers': 6,             
#     'epochs': 10              
# }
# 
# VectorizerModel.create("3_s_1e-3_e_10", split_df, w2v_params)

INFO - 17:03:02: collecting all words and their counts
INFO - 17:03:02: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:03:02: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 17:03:02: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 17:03:03: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 17:03:03: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 17:03:03: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 17:03:03: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 17:03:03: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 17:03:03: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 17:03:03: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f4ec8d48be0>

In [101]:
# w2v_params = {
#     'min_count': 10,          
#     'window': 5,              
#     'vector_size': 300,       
#     'negative': 10,           
#     'alpha': 0.025,           
#     'min_alpha': 0.0001,      
#     'sample': 1e-5,           
#     'sg': 1,                  
#     'workers': 6,             
#     'epochs': 10              
# }
# 
# VectorizerModel.create("3", split_df, w2v_params)

INFO - 16:22:41: collecting all words and their counts
INFO - 16:22:41: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:22:41: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 16:22:41: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 16:22:41: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 16:22:41: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 16:22:42: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 16:22:42: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 16:22:42: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 16:22:42: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 16:22:42: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f4ef2600250>

In [103]:
# w2v_params = {
#     'min_count': 10,          
#     'window': 5,              
#     'vector_size': 300,       
#     'negative': 10,           
#     'alpha': 0.025,           
#     'min_alpha': 0.0001,      
#     'sample': 6e-5,           
#     'sg': 1,                  
#     'workers': 6,             
#     'epochs': 10              
# }
# 
# VectorizerModel.create("3_sample_6e-5", split_df, w2v_params)

INFO - 16:27:29: collecting all words and their counts
INFO - 16:27:29: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:27:29: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 16:27:30: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 16:27:30: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 16:27:30: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 16:27:30: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 16:27:30: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 16:27:30: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 16:27:31: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 16:27:31: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f4eabe34520>

In [None]:
# Load the model saved as "1.wordvectors" (load() interpolates its argument into the file name).
word2vec = VectorizerModel.load(1)

In [91]:
# Print the first 100 keys of the loaded model's vocabulary.
print(list(word2vec.model.wv.key_to_index.keys())[:100])

['год', 'россия', 'сообщать', 'человек', 'заявить', 'компания', 'российский', 'процент', 'время', 'президент', 'страна', 'слово', 'доллар', 'тысяча', 'сша', 'два', 'миллион', 'москва', 'получить', 'рубль', 'сообщить', 'представитель', 'глава', 'место', 'украина', 'результат', 'власть', 'являться', 'данным', 'находиться', 'город', 'работа', 'миллиард', 'министр', 'решение', 'отметить', 'агентство', 'американский', 'информация', 'день', 'сообщаться', 'суд', 'военный', 'лет', 'сайт', 'новость', 'дело', 'правительство', 'три', 'произойти', 'сотрудник', 'число', 'последний', 'служба', 'группа', 'проект', 'организация', 'первый', 'газета', 'область', 'новый', 'риа', 'рф', 'пресс', 'акция', 'задержать', 'сентябрь', 'провести', 'московский', 'территория', 'издание', 'полиция', 'летний', 'декабрь', 'против', 'самолёт', 'ссылка', 'связь', 'ноябрь', 'октябрь', 'центр', 'матч', 'сказать', 'январь', 'отношение', 'участие', 'лидер', 'партия', 'апрель', 'сторона', 'конец', 'система', 'июль', 'июнь', 

Обученная модель успешно нашла слова, которые семантически связаны с понятием "аэропорт". В списке присутствуют названия известных аэропортов (например, "внуково", "шереметьево", "домодедово"), а также слова, связанные с авиацией и воздушными перевозками ("рейс", "терминал", "авиагавань", "авиаузел").

In [92]:
# Query the words the model ranks as most similar to 'аэропорт' ("airport").
word2vec.model.wv.most_similar('аэропорт')

[('внуково', 0.6216250061988831),
 ('рейс', 0.6200482249259949),
 ('шереметьево', 0.6151083707809448),
 ('домодедово', 0.6090074181556702),
 ('авиагавань', 0.6038758754730225),
 ('авиаузел', 0.6003633737564087),
 ('звартноц', 0.5918766260147095),
 ('кольцово', 0.5758230090141296),
 ('емельяново', 0.5725732445716858),
 ('терминал', 0.5708906054496765)]

Модель правильно определила, что слово "газета" является лишним в этом списке. 

In [95]:
# Ask the model which word of the list does not fit with the others.
print("Лишнее слово:", word2vec.model.wv.doesnt_match(["внуково", "терминал", "авиаузел", "газета"]))

Лишнее слово: газета


In [8]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


class TestVectorizer:
    """Score a vectorizer by training logistic regression on its document vectors.

    On construction the frame is split with ``split_data``; each part's
    ``processed_full_text`` column is passed through ``vectorizer.transform``
    and its ``topic`` column supplies the labels. Calling the instance fits
    the classifier on the train part and prints accuracy for the test part
    and then for the validation part.
    """

    def __init__(self, vectorizer: VectorizerModel, df: pd.DataFrame):
        self.classifier = LogisticRegression(max_iter=10000, n_jobs=-1)
        self.vectorizer = vectorizer
        # Declared up front; set_data() fills them in immediately below.
        self.X_train_vectors = self.X_val_vectors = self.X_test_vectors = None
        self.y_train = self.y_val = self.y_test = None
        self.set_data(df)

    def set_data(self, df):
        """Split *df* and cache the transformed texts and label arrays per split."""
        train_part, val_part, test_part = split_data(df)
        parts = {"train": train_part, "val": val_part, "test": test_part}

        # Features first (train, val, test), then labels in the same order.
        for split_name, part in parts.items():
            features = self.vectorizer.transform(part['processed_full_text'])
            setattr(self, f"X_{split_name}_vectors", features)
        for split_name, part in parts.items():
            setattr(self, f"y_{split_name}", np.array(part["topic"].tolist()))

    def __call__(self, *args, **kwargs):
        """Fit on the train split and print test and validation accuracy.

        Positional and keyword arguments are accepted but not used.
        """
        model = self.classifier
        model.fit(self.X_train_vectors, self.y_train)

        # The test split is reported first, the validation split second.
        acc_test = accuracy_score(self.y_test, model.predict(self.X_test_vectors))
        print(f"Accuracy на тесте: {acc_test:.4f}")

        acc_val = accuracy_score(self.y_val, model.predict(self.X_val_vectors))
        print(f"Accuracy на валидации: {acc_val:.4f}")
        

In [98]:
# # 'min_count': 10,
# # 'window': 2,
# # 'vector_size': 300,
# # 'negative': 10,
# # 'alpha': 0.03,
# # 'min_alpha': 0.0007,
# # 'sample': 6e-5,
# # 'sg': 1,
# # 'workers': 6,
# # 'epochs': 20
# test = TestVectorizer(VectorizerModel.load(1), split_df)
# test()

INFO - 16:15:40: loading Word2Vec object from 1.wordvectors
INFO - 16:15:40: loading wv recursively from 1.wordvectors.wv.* with mmap=None
INFO - 16:15:40: loading vectors from 1.wordvectors.wv.vectors.npy with mmap=None
INFO - 16:15:40: loading syn1neg from 1.wordvectors.syn1neg.npy with mmap=None
INFO - 16:15:41: setting ignored attribute cum_table to None
INFO - 16:15:41: Word2Vec lifecycle event {'fname': '1.wordvectors', 'datetime': '2025-03-12T16:15:41.182087', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7885
Accuracy на валидации: 0.7931


In [97]:
# #     'min_count': 10,
# #     'window': 6,
# #     'vector_size': 300,
# #     'negative': 10,
# #     'alpha': 0.03,
# #     'min_alpha': 0.0007,
# #     'sample': 6e-5,
# #     'sg': 1,
# #     'workers': 6,
# #     'epochs': 20
# test = TestVectorizer(VectorizerModel.load(2), split_df)
# test()

INFO - 16:15:11: loading Word2Vec object from 2.wordvectors
INFO - 16:15:11: loading wv recursively from 2.wordvectors.wv.* with mmap=None
INFO - 16:15:11: loading vectors from 2.wordvectors.wv.vectors.npy with mmap=None
INFO - 16:15:11: loading syn1neg from 2.wordvectors.syn1neg.npy with mmap=None
INFO - 16:15:11: setting ignored attribute cum_table to None
INFO - 16:15:11: Word2Vec lifecycle event {'fname': '2.wordvectors', 'datetime': '2025-03-12T16:15:11.821474', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7963
Accuracy на валидации: 0.8010


In [100]:
# # 'min_count': 10,
# # 'window': 6,
# # 'vector_size': 300,
# # 'negative': 10,
# # 'alpha': 0.03,
# # 'min_alpha': 0.0007,
# # 'sample': 6e-5,
# # 'sg': 1,
# # 'workers': 6,
# # 'epochs': 10
# test = TestVectorizer(VectorizerModel.load("2_w_6_e_10"), split_df)
# test()

INFO - 16:21:05: loading Word2Vec object from 2_w_6_e_10.wordvectors
INFO - 16:21:05: loading wv recursively from 2_w_6_e_10.wordvectors.wv.* with mmap=None
INFO - 16:21:05: loading vectors from 2_w_6_e_10.wordvectors.wv.vectors.npy with mmap=None
INFO - 16:21:06: loading syn1neg from 2_w_6_e_10.wordvectors.syn1neg.npy with mmap=None
INFO - 16:21:06: setting ignored attribute cum_table to None
INFO - 16:21:06: Word2Vec lifecycle event {'fname': '2_w_6_e_10.wordvectors', 'datetime': '2025-03-12T16:21:06.281426', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7939
Accuracy на валидации: 0.7981


In [102]:
# # 'min_count': 10,          
# # 'window': 5,              
# # 'vector_size': 300,       
# # 'negative': 10,           
# # 'alpha': 0.025,           
# # 'min_alpha': 0.0001,      
# # 'sample': 1e-5,           
# # 'sg': 1,                  
# # 'workers': 6,             
# # 'epochs': 10  
# test = TestVectorizer(VectorizerModel.load("3"), split_df)
# test()

INFO - 16:24:52: loading Word2Vec object from 3.wordvectors
INFO - 16:24:52: loading wv recursively from 3.wordvectors.wv.* with mmap=None
INFO - 16:24:52: loading vectors from 3.wordvectors.wv.vectors.npy with mmap=None
INFO - 16:24:53: loading syn1neg from 3.wordvectors.syn1neg.npy with mmap=None
INFO - 16:24:53: setting ignored attribute cum_table to None
INFO - 16:24:53: Word2Vec lifecycle event {'fname': '3.wordvectors', 'datetime': '2025-03-12T16:24:53.304295', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7950
Accuracy на валидации: 0.7988


In [104]:
# # 'min_count': 10,          
# # 'window': 5,              
# # 'vector_size': 300,       
# # 'negative': 10,           
# # 'alpha': 0.025,           
# # 'min_alpha': 0.0001,      
# # 'sample': 6e-5,           
# # 'sg': 1,                  
# # 'workers': 6,             
# # 'epochs': 10   
# test = TestVectorizer(VectorizerModel.load("3_sample_6e-5"), split_df)
# test()

INFO - 16:30:56: loading Word2Vec object from 3_sample_6e-5.wordvectors
INFO - 16:30:56: loading wv recursively from 3_sample_6e-5.wordvectors.wv.* with mmap=None
INFO - 16:30:56: loading vectors from 3_sample_6e-5.wordvectors.wv.vectors.npy with mmap=None
INFO - 16:30:56: loading syn1neg from 3_sample_6e-5.wordvectors.syn1neg.npy with mmap=None
INFO - 16:30:56: setting ignored attribute cum_table to None
INFO - 16:30:56: Word2Vec lifecycle event {'fname': '3_sample_6e-5.wordvectors', 'datetime': '2025-03-12T16:30:56.918825', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7953
Accuracy на валидации: 0.7980


In [108]:
# # 'min_count': 10,
# # 'window': 10,
# # 'vector_size': 300,
# # 'negative': 10,
# # 'alpha': 0.03,
# # 'min_alpha': 0.0007,
# # 'sample': 6e-5,
# # 'sg': 1,
# # 'workers': 6,
# # 'epochs': 20
# test = TestVectorizer(VectorizerModel.load("2_w_10_e_20"), split_df)
# test()

INFO - 17:02:02: loading Word2Vec object from 2_w_10_e_20.wordvectors
INFO - 17:02:02: loading wv recursively from 2_w_10_e_20.wordvectors.wv.* with mmap=None
INFO - 17:02:02: loading vectors from 2_w_10_e_20.wordvectors.wv.vectors.npy with mmap=None
INFO - 17:02:02: loading syn1neg from 2_w_10_e_20.wordvectors.syn1neg.npy with mmap=None
INFO - 17:02:02: setting ignored attribute cum_table to None
INFO - 17:02:02: Word2Vec lifecycle event {'fname': '2_w_10_e_20.wordvectors', 'datetime': '2025-03-12T17:02:02.931329', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7945
Accuracy на валидации: 0.8005


In [109]:
# # 'min_count': 10,
# # 'window': 10,
# # 'vector_size': 300,
# # 'negative': 10,
# # 'alpha': 0.03,
# # 'min_alpha': 0.0007,
# # 'sample': 6e-5,
# # 'sg': 1,
# # 'workers': 6,
# # 'epochs': 10
# test = TestVectorizer(VectorizerModel.load("2_w_10_e_10"), split_df)
# test()

INFO - 17:02:32: loading Word2Vec object from 2_w_10_e_10.wordvectors
INFO - 17:02:32: loading wv recursively from 2_w_10_e_10.wordvectors.wv.* with mmap=None
INFO - 17:02:32: loading vectors from 2_w_10_e_10.wordvectors.wv.vectors.npy with mmap=None
INFO - 17:02:32: loading syn1neg from 2_w_10_e_10.wordvectors.syn1neg.npy with mmap=None
INFO - 17:02:33: setting ignored attribute cum_table to None
INFO - 17:02:33: Word2Vec lifecycle event {'fname': '2_w_10_e_10.wordvectors', 'datetime': '2025-03-12T17:02:33.146691', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7959
Accuracy на валидации: 0.8020


Сравним модели при разных параметрах

In [111]:
# # 'min_count': 10,          
# # 'window': 5,              
# # 'vector_size': 300,       
# # 'negative': 10,           
# # 'alpha': 0.025,           
# # 'min_alpha': 0.0001,      
# # 'sample': 1e-3,           
# # 'sg': 1,                  
# # 'workers': 6,             
# # 'epochs': 10  
# test = TestVectorizer(VectorizerModel.load("3_s_1e-3_e_10"), split_df)
# test()

INFO - 17:07:34: loading Word2Vec object from 3_s_1e-3_e_10.wordvectors
INFO - 17:07:34: loading wv recursively from 3_s_1e-3_e_10.wordvectors.wv.* with mmap=None
INFO - 17:07:34: loading vectors from 3_s_1e-3_e_10.wordvectors.wv.vectors.npy with mmap=None
INFO - 17:07:34: loading syn1neg from 3_s_1e-3_e_10.wordvectors.syn1neg.npy with mmap=None
INFO - 17:07:34: setting ignored attribute cum_table to None
INFO - 17:07:34: Word2Vec lifecycle event {'fname': '3_s_1e-3_e_10.wordvectors', 'datetime': '2025-03-12T17:07:34.552385', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7884
Accuracy на валидации: 0.7945


Влияние параметра `window`
* При window=2 Accuracy на тесте: 0.7885, на валидации: 0.7931.
* При window=5 Accuracy на тесте: 0.7950, на валидации: 0.7988.
* При window=6 Accuracy на тесте: 0.7963, на валидации: 0.8010.
* При window=10 Accuracy на тесте: 0.7959, на валидации: 0.8020.
* При window=10 качество на валидации незначительно улучшается (Accuracy на валидации: 0.8020), но время обучения увеличивается.

Влияние параметра epochs
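* Увеличение количества эпох с `10` до `20` при window=6 немного улучшает Accuracy (на тесте 0.7939 → 0.7963), но при window=10 улучшения нет (на тесте 0.7959 → 0.7945), то есть эффект незначителен.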

Влияние параметра sample
* Разница между sample=6e-5 и sample=1e-5 минимальна. Оба значения работают хорошо.
* При sample=1e-3 качество ухудшается (Accuracy на тесте: 0.7884 против 0.7950 при sample=1e-5): чем больше порог `sample`, тем слабее прореживаются частотные слова, и они сильнее доминируют при обучении.

Влияние параметра alpha
* Разница между alpha=0.03 и alpha=0.025 незначительна. Оба значения работают хорошо.

In [8]:
# Hyperparameters for the "fast" Word2Vec configuration.
# NOTE(review): presumably forwarded to gensim's Word2Vec by
# VectorizerModel.create — confirm against that method.
fast_params = {
    'min_count': 10,          # ignore rare words
    'window': 5,              # context window, chosen as a quality/speed trade-off
    'vector_size': 300,       # embedding dimensionality
    'negative': 10,           # number of negative samples
    'alpha': 0.025,           # initial learning rate
    'min_alpha': 0.0001,      # minimum (final) learning rate
    'sample': 1e-5,           # downsampling threshold for frequent words
                              # NOTE(review): the original comment called this the
                              # "standard" value; gensim's default is 1e-3 — confirm
    'sg': 1,                  # use Skip-Gram
    'workers': 6,             # number of worker threads
    'epochs': 10              # fewer epochs to speed up training
}

# Train a model on split_df under the name "fast_params"; later cells read it
# back with VectorizerModel.load("fast_params").
VectorizerModel.create("fast_params", split_df, fast_params)

INFO - 18:41:14: collecting all words and their counts
INFO - 18:41:14: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:41:14: PROGRESS: at sentence #10000, processed 1240224 words, keeping 56916 word types
INFO - 18:41:15: PROGRESS: at sentence #20000, processed 2480403 words, keeping 80187 word types
INFO - 18:41:15: PROGRESS: at sentence #30000, processed 3724149 words, keeping 97936 word types
INFO - 18:41:15: PROGRESS: at sentence #40000, processed 4965020 words, keeping 112758 word types
INFO - 18:41:15: PROGRESS: at sentence #50000, processed 6209957 words, keeping 126140 word types
INFO - 18:41:15: PROGRESS: at sentence #60000, processed 7453163 words, keeping 137971 word types
INFO - 18:41:15: PROGRESS: at sentence #70000, processed 8689990 words, keeping 148948 word types
INFO - 18:41:16: PROGRESS: at sentence #80000, processed 9930414 words, keeping 159272 word types
INFO - 18:41:16: PROGRESS: at sentence #90000, processed 11173588 words, keeping 

<__main__.VectorizerModel at 0x7fc6febd6410>

In [10]:
VectorizerModel.create("fast_params_balanced", balanced_df, fast_params)

INFO - 18:44:06: collecting all words and their counts
INFO - 18:44:06: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:44:06: PROGRESS: at sentence #10000, processed 1244344 words, keeping 57351 word types
INFO - 18:44:06: PROGRESS: at sentence #20000, processed 2483676 words, keeping 80510 word types
INFO - 18:44:06: PROGRESS: at sentence #30000, processed 3720386 words, keeping 97675 word types
INFO - 18:44:06: PROGRESS: at sentence #40000, processed 4956014 words, keeping 112169 word types
INFO - 18:44:07: PROGRESS: at sentence #50000, processed 6203671 words, keeping 125198 word types
INFO - 18:44:07: PROGRESS: at sentence #60000, processed 7445392 words, keeping 136681 word types
INFO - 18:44:07: PROGRESS: at sentence #70000, processed 8698515 words, keeping 147294 word types
INFO - 18:44:07: PROGRESS: at sentence #80000, processed 9946063 words, keeping 157102 word types
INFO - 18:44:07: PROGRESS: at sentence #90000, processed 11180570 words, keeping 

<__main__.VectorizerModel at 0x7fc702fadc30>

In [7]:
# heavy_params = {
#     'min_count': 10,          # Игнорировать редкие слова
#     'window': 6,              # Широкий контекст для лучшего улавливания семантики
#     'vector_size': 300,       # Стандартная размерность
#     'negative': 10,           # Оптимальное количество негативных примеров
#     'alpha': 0.03,            # Немного выше скорость обучения
#     'min_alpha': 0.0007,      # Минимальная скорость обучения
#     'sample': 1e-5,           # Стандартный порог для downsampling
#     'sg': 1,                  # Использовать Skip-Gram для лучшего качества
#     'workers': 6,             # Максимальное количество потоков
#     'epochs': 20              # Больше эпох для лучшего обучения
# }
# VectorizerModel.create("heavy_params", split_df, heavy_params)

INFO - 17:50:49: collecting all words and their counts
INFO - 17:50:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:50:49: PROGRESS: at sentence #10000, processed 1240447 words, keeping 57035 word types
INFO - 17:50:49: PROGRESS: at sentence #20000, processed 2478015 words, keeping 80109 word types
INFO - 17:50:50: PROGRESS: at sentence #30000, processed 3720717 words, keeping 97910 word types
INFO - 17:50:50: PROGRESS: at sentence #40000, processed 4962698 words, keeping 112942 word types
INFO - 17:50:50: PROGRESS: at sentence #50000, processed 6204420 words, keeping 126284 word types
INFO - 17:50:50: PROGRESS: at sentence #60000, processed 7449986 words, keeping 138155 word types
INFO - 17:50:50: PROGRESS: at sentence #70000, processed 8687242 words, keeping 149407 word types
INFO - 17:50:50: PROGRESS: at sentence #80000, processed 9941927 words, keeping 159689 word types
INFO - 17:50:51: PROGRESS: at sentence #90000, processed 11188326 words, keeping 

<__main__.VectorizerModel at 0x7f7b59411c30>

При векторизации через w2v разница между обычным df и равномерно распределённым менее заметна.

In [12]:
test = TestVectorizer(VectorizerModel.load("fast_params"), split_df)
test()
test = TestVectorizer(VectorizerModel.load("fast_params_balanced"), split_df)
test()
test = None

INFO - 18:46:36: loading Word2Vec object from fast_params.wordvectors
INFO - 18:46:36: loading wv recursively from fast_params.wordvectors.wv.* with mmap=None
INFO - 18:46:36: loading vectors from fast_params.wordvectors.wv.vectors.npy with mmap=None
INFO - 18:46:36: loading syn1neg from fast_params.wordvectors.syn1neg.npy with mmap=None
INFO - 18:46:36: setting ignored attribute cum_table to None
INFO - 18:46:36: Word2Vec lifecycle event {'fname': 'fast_params.wordvectors', 'datetime': '2025-03-13T18:46:36.689500', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}
INFO - 18:47:04: loading Word2Vec object from fast_params_balanced.wordvectors
INFO - 18:47:04: loading wv recursively from fast_params_balanced.wordvectors.wv.* with mmap=None
INFO - 18:47:04: loading vectors from fast_params_balanced.wordvectors.wv.vectors.npy with mmap=None


Accuracy на тесте: 0.7977
Accuracy на валидации: 0.7985


INFO - 18:47:04: loading syn1neg from fast_params_balanced.wordvectors.syn1neg.npy with mmap=None
INFO - 18:47:04: setting ignored attribute cum_table to None
INFO - 18:47:05: Word2Vec lifecycle event {'fname': 'fast_params_balanced.wordvectors', 'datetime': '2025-03-13T18:47:05.051629', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7997
Accuracy на валидации: 0.7991


In [8]:
# test = TestVectorizer(VectorizerModel.load("fast_params"), split_df)
# test()
# test = TestVectorizer(VectorizerModel.load("heavy_params"), split_df)
# test()

INFO - 17:55:12: loading Word2Vec object from fast_params.wordvectors
INFO - 17:55:12: loading wv recursively from fast_params.wordvectors.wv.* with mmap=None
INFO - 17:55:12: loading vectors from fast_params.wordvectors.wv.vectors.npy with mmap=None
INFO - 17:55:12: loading syn1neg from fast_params.wordvectors.syn1neg.npy with mmap=None
INFO - 17:55:12: setting ignored attribute cum_table to None
INFO - 17:55:12: Word2Vec lifecycle event {'fname': 'fast_params.wordvectors', 'datetime': '2025-03-12T17:55:12.563339', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}
INFO - 17:55:40: loading Word2Vec object from heavy_params.wordvectors
INFO - 17:55:40: loading wv recursively from heavy_params.wordvectors.wv.* with mmap=None
INFO - 17:55:40: loading vectors from heavy_params.wordvectors.wv.vectors.npy with mmap=None


Accuracy на тесте: 0.7938
Accuracy на валидации: 0.7980


INFO - 17:55:41: loading syn1neg from heavy_params.wordvectors.syn1neg.npy with mmap=None
INFO - 17:55:41: setting ignored attribute cum_table to None
INFO - 17:55:41: Word2Vec lifecycle event {'fname': 'heavy_params.wordvectors', 'datetime': '2025-03-12T17:55:41.416509', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Accuracy на тесте: 0.7943
Accuracy на валидации: 0.7992


In [14]:
# info = api.info()
# for model_name, model_data in info['models'].items():
#     print(f"Название: {model_name}")
#     print(f"Описание: {model_data['description']}")
#     print(f"Размер: {model_data['file_size'] / (1024 * 1024):.2f} MB")
#     print("-" * 50)

Название: fasttext-wiki-news-subwords-300
Описание: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
Размер: 958.45 MB
--------------------------------------------------
Название: conceptnet-numberbatch-17-06-300
Описание: ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known as word embeddings) that can be used directly as a representation of word meanings or as a starting point for further machine learning. ConceptNet Numberbatch is part of the ConceptNet open data project. ConceptNet provides lots of ways to compute with word meanings, one of which is word embeddings. ConceptNet Numberbatch is a snapshot of just the word embeddings. It is built using an ensemble that combines data from ConceptNet, word2vec, GloVe, and OpenSubtitles 2016, using a variation on retrofitting.
Размер: 1168.73 MB
--------------------------------------------------
Название: word2vec-ruscorpora-300
Описание: Word2vec C

KeyError: 'file_size'

# navec

In [24]:
!pip install navec

Defaulting to user installation because normal site-packages is not writeable


In [30]:
%%capture
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar

In [13]:
from navec import Navec

path = 'navec_news_v1_1B_250K_300d_100q.tar'
navec = Navec.load(path)

navec_w2v = VectorizerModel.load_navec(navec)

INFO - 18:48:01: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2025-03-13T18:48:01.152884', 'gensim': '4.3.3', 'python': '3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'created'}


In [68]:
print(list(navec_w2v.model.wv.key_to_index.keys())[:100])

['a', 'a-gps', 'a-ha', 'a-one', 'aa', 'aaa', 'aac', 'aacute', 'aad', 'aap', 'aar', 'aaron', 'aaronson', 'aavst', 'ab', 'aba', 'abar', 'abb', 'abba', 'abbas', 'abbey', 'abbott', 'abbvie', 'abbyy', 'abc', 'abcnews', 'abd', 'abdallah', 'abdel', 'abdul', 'abdullah', 'abe', 'abell', 'abendblatt', 'aber', 'abercrombie', 'aberdeen', 'abh', 'abi', 'able', 'abn', 'about', 'above', 'abraham', 'abrams', 'abs', 'absolut', 'absolute', 'absolutely', 'abt', 'abu', 'abudhabigp', 'abuse', 'abw', 'ac', 'academi', 'academia', 'academic', 'academy', 'acb', 'acc', 'accent', 'accenture', 'accept', 'access', 'accessories', 'accident', 'accor', 'accord', 'according', 'account', 'accountability', 'accred', 'accredit', 'accreditation', 'accuweather', 'ace', 'acea', 'acer', 'achievement', 'aci', 'aclu', 'acm', 'acmg', 'acorn', 'acoustic', 'acqua', 'acquisition', 'acrobat', 'acronis', 'across', 'acs', 'act', 'acta', 'actavis', 'action', 'actions', 'active', 'activision', 'activity']


In [48]:
navec_w2v.model.wv["аэропорт"]

array([-0.38157547,  0.51878476, -0.34328753, -0.10828243,  0.6183287 ,
        0.16474381,  0.2245954 , -0.45450312, -0.19402957, -0.46336433,
        0.22525619,  0.9330694 ,  0.4592613 , -0.13126236, -0.12389991,
       -0.13915254, -0.8444514 , -0.16262615, -0.2841552 , -0.10427912,
        0.10903299, -0.56793094, -0.29993477,  0.05472692,  0.07605194,
        0.02982216,  0.07088428,  0.11505795, -0.22585814,  0.6455059 ,
       -0.56587124, -0.5936461 , -0.28247967,  0.13288173, -0.11127474,
       -0.07478868, -0.06949382,  0.1146058 , -0.64657676,  0.31568807,
       -0.72183466, -0.01417023,  0.06757524,  0.23341271,  0.60565054,
        0.19690683, -0.41737285, -0.3120327 , -0.85508144,  0.3571387 ,
       -0.29469323, -0.15486374, -0.20224832,  0.7035065 , -0.10825375,
       -0.09528677, -0.76112145, -0.2956669 , -0.01828136,  0.05525805,
       -0.13418178, -0.11771767,  0.27413142, -0.02672392, -0.06228135,
        0.26352513, -0.37912372, -0.01786848, -0.4218956 ,  0.34

In [50]:
navec_w2v.model.wv.most_similar('аэропорт')

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


[('аэропорта', 0.7244567275047302),
 ('аэропорту', 0.6481307148933411),
 ('шереметьево', 0.612114429473877),
 ('пулково', 0.5924555659294128),
 ('домодедово', 0.5859100818634033),
 ('внуково', 0.5831590890884399),
 ('борисполь', 0.5510309934616089),
 ('терминал', 0.5296798944473267),
 ('донецкий', 0.5085664391517639),
 ('толмачево', 0.4983980059623718)]

In [56]:
# Collect the vocabulary entries whose embedding row consists only of zeros.
zero_vector_words = [
    word
    for word, vector in zip(navec_w2v.model.wv.index_to_key, navec_w2v.model.wv.vectors)
    if not vector.any()
]
print(f"Слова с нулевыми векторами: {zero_vector_words}")

Слова с нулевыми векторами: ['<pad>']


In [52]:
print("Лишнее слово:", navec_w2v.model.wv.doesnt_match(["внуково", "терминал", "авиаузел", "газета"]))

Лишнее слово: газета


In [58]:
test = TestVectorizer(navec_w2v, split_df)
test()

Accuracy на тесте: 0.7790
Accuracy на валидации: 0.7821


# RusVectōrēs

In [60]:
!wget https://vectors.nlpl.eu/repository/20/184.zip

9712.44s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


--2025-03-12 20:31:47--  https://vectors.nlpl.eu/repository/20/184.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.200, 2001:700:112::200
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.200|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 640196018 (611M) [application/zip]
Saving to: '184.zip.1'

184.zip.1             4%[                    ]  26.93M  3.64MB/s    eta 2m 56s ^C


9726.00s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


mkdir: cannot create directory 'rusvectores': File exists


9731.14s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


9736.31s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


/bin/bash: line 1: unzip: command not found


Распаковываем архив в отдельную папку

In [62]:
import zipfile

# Unpack the RusVectōrēs archive with the standard library; the shell `unzip`
# attempt recorded in the output above failed ("unzip: command not found").
# NOTE(review): the interrupted wget above was writing to '184.zip.1', so
# '184.zip' is presumably a complete earlier download — confirm.
zip_path = '184.zip'
extract_dir = 'rusvectores'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Extract every archive member into extract_dir.
    zip_ref.extractall(extract_dir)

print(f"Архив распакован в папку: {extract_dir}")

Архив распакован в папку: rusvectores


In [9]:
model_path = 'rusvectores/model.txt'
rusvectores_w2v = VectorizerModel.load_rusvectores(model_path, False)

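Слова в модели `rusvectores` выглядят как `'xxxxxx_NUM'`, `'год_NOUN'`, `'россия_PROPN'`, потому что модель обучена на лемматизированных текстах с морфологической разметкой: каждый ключ словаря — это лемма, к которой через `_` приписан тег части речи (Part-of-Speech, POS). Поэтому перед векторизацией входные тексты нужно привести к тому же формату.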

In [69]:
print(list(rusvectores_w2v.model.wv.key_to_index.keys())[:100])

['xxxxxx_NUM', 'год_NOUN', 'россия_PROPN', 'сообщать_VERB', 'xxxxxxxx_NUM', 'xxxxxxx_NUM', 'российский_ADJ', 'мочь_VERB', 'человек_NOUN', 'заявлять_VERB', 'дело_NOUN', 'новый_ADJ', 'сша_PROPN', 'страна_NOUN', 'компания_NOUN', 'рубль_NOUN', 'также_ADV', 'один_NUM', 'время_NOUN', 'президент_NOUN', 'становиться_VERB', 'слово_NOUN', 'москва_PROPN', 'два_NUM', 'отмечать_VERB', 'первый_ADJ', 'суд_NOUN', 'глава_NOUN', 'получать_VERB', 'господин_NOUN', 'миллион_NOUN', 'область_NOUN', 'принимать_VERB', 'тысяча_NOUN', 'данные_NOUN', 'представитель_NOUN', 'решение_NOUN', 'проходить_VERB', 'ранее_ADV', 'украина_PROPN', 'проводить_VERB', 'более_ADV', 'уже_ADV', 'город_NOUN', 'говорить_VERB', 'еще_ADV', 'работа_NOUN', 'быть_VERB', 'миллиард_NOUN', 'результат_NOUN', 'приводить_VERB', 'вопрос_NOUN', 'отношение_NOUN', 'день_NOUN', 'должен_ADJ', 'место_NOUN', 'сирия_PROPN', 'погибать_VERB', 'напоминать_VERB', 'сегодня_ADV', 'другой_ADJ', 'материал_NOUN', 'сказать_VERB', 'происходить_VERB', 'новость_PROP

Работает :>

In [70]:
rusvectores_w2v.model.wv.most_similar('аэропорт_NOUN')

[('внуково_PROPN', 0.6604336500167847),
 ('домодедово_PROPN', 0.6543232202529907),
 ('шереметьево_PROPN', 0.5886697769165039),
 ('внуково_ADV', 0.586722731590271),
 ('внуково::светлана_PROPN', 0.5557174682617188),
 ('эль-найроб_PROPN', 0.5439022183418274),
 ('пулково_ADJ', 0.5373826622962952),
 ('брюсселязавершать_VERB', 0.5371927618980408),
 ('руасси_PROPN', 0.5350612998008728),
 ('аэропорт_PROPN', 0.5350168943405151)]

In [72]:
print("Лишнее слово:", rusvectores_w2v.model.wv.doesnt_match(["внуково_PROPN", "домодедово_PROPN", "пулково_ADJ", "бывший_ADJ"]))

Лишнее слово: бывший_ADJ


In [None]:
!python3 -m spacy download ru_core_news_lg

In [12]:
# logging.getLogger().handlers.clear()
# Build a copy of split_df whose text column is re-processed for the
# RusVectōrēs model and persist it to CSV.
# NOTE(review): TextProcessingRusvect is defined elsewhere; presumably it
# emits tokens in the model's `lemma_POS` key format — confirm.
processor = TextProcessingRusvect(batch_size=200, n_threads=4)
# Work on a copy so split_df itself keeps its original columns.
rusvectores_df = split_df.copy()
# The already processed text column is the input and is overwritten in place.
rusvectores_df["processed_full_text"] = processor.process_texts(rusvectores_df["processed_full_text"])
# Drop the raw and intermediate text columns before saving.
rusvectores_df.drop(columns=["title", "text", "processed_text", "processed_title"], inplace=True)
# Saved so that the next cell can reload the result with pd.read_csv instead
# of re-running this step (the recorded run below ends in KeyboardInterrupt).
rusvectores_df.to_csv("rusvectores_df.csv", index=False, encoding='utf-8')


✅ Используется GPU


Processing batches:   0%|          | 0/500 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


Processing batches:   0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
rusvectores_df = pd.read_csv("rusvectores_df.csv", encoding='utf-8')

In [12]:
test = TestVectorizer(rusvectores_w2v, rusvectores_df)
test()

Accuracy на тесте: 0.7693
Accuracy на валидации: 0.7746


# Взвешивание через tf-idf

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the processed texts; the fitted object `tfidf` supplies the
# per-term IDF weights (tfidf.idf_) used by text_to_weighted_vector below.
# NOTE(review): the fit uses all of split_df, i.e. it happens before
# split_data() is applied, so validation/test documents also contribute to
# the IDF statistics — confirm this is acceptable.
# NOTE(review): text_to_weighted_vector looks up single whitespace-separated
# tokens only, so the 2- and 3-gram entries produced by ngram_range=(1, 3)
# are never queried there.
tfidf = TfidfVectorizer( 
        ngram_range=(1, 3),
        min_df=5,
        max_df=0.9,
        sublinear_tf=True)
# Despite its name, tfidf_vectorizer holds the result of fit_transform
# (the document-term matrix), not the vectorizer itself.
tfidf_vectorizer = tfidf.fit_transform(split_df["processed_full_text"])
# Maps each term to its column index; used to index tfidf.idf_.
tfidf_vocab = tfidf.vocabulary_

In [33]:
w2c_model = VectorizerModel.load("fast_params")

In [34]:
def text_to_weighted_vector(text):
    """Return the IDF-weighted average of the word vectors found in *text*.

    The text is split on whitespace. A token contributes only if it is known
    both to the embedding model (``w2c_model.model.wv``) and to the fitted
    TF-IDF vocabulary (``tfidf_vocab``). Its weight is the term's IDF value
    from ``tfidf.idf_``; a token that occurs several times is counted once per
    occurrence.

    Args:
        text: Whitespace-separated string of already processed tokens.

    Returns:
        The weighted mean of the token vectors along axis 0, or a zero vector
        of length ``w2c_model.model.vector_size`` when no token qualifies.
    """
    # Collect (vector, idf weight) pairs for every usable token occurrence.
    scored = [
        (w2c_model.vectorize(token), tfidf.idf_[tfidf_vocab[token]])
        for token in text.split()
        if token in w2c_model.model.wv and token in tfidf_vocab
    ]

    # Nothing to average: fall back to an all-zero vector.
    if not scored:
        return np.zeros(w2c_model.model.vector_size)

    embeddings, idf_weights = zip(*scored)
    return np.average(embeddings, axis=0, weights=idf_weights)

# text_vectors = [text_to_weighted_vector(text) for text in split_df["processed_full_text"]]

# Work on a copy so that adding the column does not modify split_df.
split_df_norm = split_df.copy()
# Compute one weighted vector per row by applying text_to_weighted_vector to
# each processed text.
split_df_norm["text_vectors"] = split_df_norm["processed_full_text"].apply(text_to_weighted_vector)

In [35]:
# Split the vectorized frame (split_data is defined elsewhere in the notebook).
train_df, val_df, test_df = split_data(split_df_norm)

# Stack the per-row vectors of each part into a 2-D feature matrix.
X_train_vectors, X_val_vectors, X_test_vectors = (
    np.vstack(part["text_vectors"].tolist())
    for part in (train_df, val_df, test_df)
)

# Matching label arrays taken from the "topic" column.
y_train, y_val, y_test = (
    np.array(part["topic"].tolist())
    for part in (train_df, val_df, test_df)
)

In [32]:
# Quick visual check of the stacked training feature matrix.
print(X_train_vectors)

[[-0.16761111  0.04369018  0.10045893 ... -0.19648277  0.04295006
  -0.03886882]
 [-0.0601474   0.06594129  0.23904165 ... -0.17005441  0.06593797
  -0.04588061]
 [-0.00517459  0.01103739  0.05744386 ... -0.19425408  0.04188257
   0.10642248]
 ...
 [-0.06068933  0.12420527  0.27093979 ... -0.22692545  0.15803782
   0.01502091]
 [ 0.009172    0.08879686  0.17886732 ... -0.24578626 -0.0281915
   0.08612367]
 [ 0.1036958   0.13253347  0.13338703 ... -0.15356072  0.03953877
   0.01137709]]


In [30]:
# Display the (rows, columns) shape of the training feature matrix.
X_train_vectors.shape

(60000, 300)

In [36]:
# Train a logistic-regression classifier on the weighted document vectors.
classifier = LogisticRegression(max_iter=10000, n_jobs=-1)
classifier.fit(X_train_vectors, y_train)

# Predict the test and validation splits.
y_pred_test = classifier.predict(X_test_vectors)
y_pred_val = classifier.predict(X_val_vectors)

# Score both splits.
acc_test = accuracy_score(y_test, y_pred_test)
acc_val = accuracy_score(y_val, y_pred_val)

# Report test accuracy first, then validation accuracy.
print(f"Accuracy на тесте: {acc_test:.4f}")
print(f"Accuracy на валидации: {acc_val:.4f}")

Accuracy на тесте: 0.7965
Accuracy на валидации: 0.7970


### Результаты

1. **Модель `fast_params` (без взвешивания):**
   - Accuracy на тесте: **0.7965**
   - Accuracy на валидации: **0.7933**

2. **Модель `fast_params` (со взвешиванием TF-IDF):**
   - Accuracy на тесте: **0.7965**
   - Accuracy на валидации: **0.7970**

3. **Модель `navec`:**
   - Accuracy на тесте: **0.7790**
   - Accuracy на валидации: **0.7821**

4. **Модель `rusvectores`:**
   - Accuracy на тесте: **0.7693**
   - Accuracy на валидации: **0.7746**

---
Взвешивание TF-IDF практически не повлияло на качество модели. Это может означать, что:
* Веса TF-IDF не добавляют значимой информации.
* Модель `fast_params` уже хорошо справляется с задачей, и дополнительное взвешивание не улучшает результаты.

Модель `fast_params` показала наилучшие результаты. Это связано с тем, что:
* Параметры `fast_params` подобраны для датасета.

### Итоговые выводы

1. **`fast_params` — лучшая модель**, показала наивысшее качество.
2. **Взвешивание TF-IDF** не дало значимого улучшения.
3. **`navec` и `rusvectores`** показали более низкое качество в текущей задаче.
4. **Дальнейшие улучшения** могут быть достигнуты за счет экспериментов с другими моделями, методами взвешивания и способами обработки данных.