In [None]:
import os
import sys
import time
import pickle

In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv


import torch
import torch.nn as nn

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer

from cuml.preprocessing import MinMaxScaler as CumlMinMaxScaler
from cuml.feature_extraction.text import TfidfVectorizer as CumlTfidfVectorizer
import cupy as cp
import cudf
from scipy.sparse import csr_matrix
from scipy.stats import entropy

from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Load environment variables from the .env file
load_dotenv()

In [41]:
# Current working directory
current_dir = os.getcwd()
# Project root directory (two levels above the current working directory)
project_root = os.path.dirname(os.path.dirname(current_dir))

In [42]:
# Add the project root to sys.path so that project modules can be imported
if project_root not in sys.path:
    sys.path.append(project_root)

In [43]:
# Paths to the raw data files
df_raw_json_path = os.path.join(project_root, 'data', 'raw', 'steam_games_data.json')
df_raw_csv_path = os.path.join(project_root, 'data', 'raw', 'steam_games_data.csv')

In [44]:
# Paths to the processed data files
df_processed_json_path = os.path.join(project_root, 'data', 'processed', 'steam_games_data.json')
df_processed_csv_path = os.path.join(project_root, 'data', 'processed', 'steam_games_data.csv')

In [45]:
# Paths to the parameter-grid search results
file_lda = "manual_coherence_results_lda.csv"
file_nmf = "manual_coherence_results_nmf.csv"

In [46]:
# Path to the final (best) model
model_path = "best_model_manual_coherence.pkl"

In [None]:
# Select the Torch device (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚öôÔ∏è –ò—Å–ø–æ–ª—å–∑—É–µ–º–æ–µ —É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: {device}")

In [None]:
# Load the processed data
print("üîÑ –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")
df = pd.read_json(df_processed_json_path)
print("‚úÖ –î–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã.")

In [49]:
def reduce_dataset(df, percentage=0.1):
    """Keep only the top share of rows ranked by ``estimated_owners``.

    The frame is sorted by the ``estimated_owners`` column in descending
    order and the first ``int(len(df) * percentage)`` rows are returned.

    Args:
        df (pd.DataFrame): Source dataset; must contain an
            ``estimated_owners`` column.
        percentage (float): Fraction of rows to keep, from 0 to 1 inclusive.

    Returns:
        pd.DataFrame: The rows with the largest ``estimated_owners`` values.

    Raises:
        ValueError: If ``percentage`` is not within ``[0, 1]``.
    """
    # Chained comparison is kept on purpose: NaN fails it and therefore raises.
    fraction_is_valid = 0 <= percentage <= 1
    if not fraction_is_valid:
        raise ValueError("‚ùå –ü—Ä–æ—Ü–µ–Ω—Ç –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ –æ—Ç 0 –¥–æ 1")

    print(f"üìâ –£–º–µ–Ω—å—à–µ–Ω–∏–µ –¥–∞—Ç–∞—Å–µ—Ç–∞ –¥–æ {percentage * 100}%...")
    ranked = df.sort_values(by='estimated_owners', ascending=False)
    rows_to_keep = int(len(ranked) * percentage)  # truncates towards zero
    top_rows = ranked.head(rows_to_keep)
    print(f"‚úÖ –î–∞—Ç–∞—Å–µ—Ç —É–º–µ–Ω—å—à–µ–Ω –¥–æ {len(top_rows)} —Å—Ç—Ä–æ–∫.")
    return top_rows

In [50]:
#df = reduce_dataset(df, percentage=0.5)

In [None]:
df.shape

In [None]:
# Split the data into training and test sets
print("‚ûó –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –Ω–∞ –æ–±—É—á–∞—é—â—É—é –∏ —Ç–µ—Å—Ç–æ–≤—É—é –≤—ã–±–æ—Ä–∫–∏...")
train_df, test_df = train_test_split(df, test_size=0.001, random_state=42)
print("‚úÖ –î–∞–Ω–Ω—ã–µ —Ä–∞–∑–¥–µ–ª–µ–Ω—ã.")

In [None]:
test_df.shape

In [54]:
class TorchLDA(nn.Module):
    """LDA-style topic model implemented with PyTorch tensors.

    The model alternates a simplified E-step (document-topic weights from
    ``docs @ topic_term.T + alpha``) and M-step (topic-term weights from
    ``doc_topic.T @ docs + beta``) until the log-likelihood change drops below
    ``tolerance`` or ``max_iterations`` is reached. No gradients are used.

    Args:
        n_topics (int): Number of topics.
        n_vocab (int): Vocabulary size (number of columns of the document matrix).
        device (torch.device): Device on which tensors are created and computed.
        alpha (float): Smoothing constant added to document-topic weights.
        beta (float): Smoothing constant added to topic-term weights.
        max_iterations (int): Maximum number of EM iterations.
        tolerance (float): Convergence threshold on the absolute change of
            the log-likelihood between two iterations.

    Attributes:
        topic_term_matrix (nn.Parameter): Unnormalised topic-term weights,
            shape ``(n_topics, n_vocab)``.
        doc_topic_matrix (torch.Tensor | None): Document-topic distribution of
            the training documents from the last E-step (random before that).
        norm_topic_term_matrix (torch.Tensor | None): Row-normalised
            topic-term matrix, available after ``fit``.
    """
    def __init__(self, n_topics, n_vocab, device, alpha=0.1, beta=0.01, max_iterations=100, tolerance=1e-4):
        super().__init__()
        self.n_topics = n_topics
        self.n_vocab = n_vocab
        self.device = device
        self.alpha = alpha
        self.beta = beta
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.topic_term_matrix = nn.Parameter(torch.randn(n_topics, n_vocab, device=device).abs())
        self.doc_topic_matrix = None
        self.norm_topic_term_matrix = None

    def initialize_parameters(self, docs):
        """Randomly (re)initialise the document-topic and topic-term matrices.

        Args:
            docs (torch.Tensor): Document matrix of shape
                ``(n_documents, n_vocab)``; only its first dimension is used.
        """
        self.doc_topic_matrix = torch.rand(docs.shape[0], self.n_topics, device=self.device)
        # Write through ``.data`` so the attribute stays a registered nn.Parameter.
        self.topic_term_matrix.data = torch.randn(self.n_topics, self.n_vocab, device=self.device).abs()

    def fit(self, docs, log=False):
        """Fit the model on ``docs`` with the EM-style iteration.

        Args:
            docs (torch.Tensor): Document matrix of shape
                ``(n_documents, n_vocab)``.
            log (bool, optional): Print progress messages. Defaults to False.

        Returns:
            TorchLDA: The fitted model (``self``).
        """
        if log: print("LDA Fit started")
        self.initialize_parameters(docs)
        docs = docs.to(self.device)
        prev_likelihood = float('-inf')
        # EM updates are closed-form, so autograd bookkeeping is not needed.
        with torch.no_grad():
            for iteration in range(self.max_iterations):
                doc_topic_distribution = self.expect(docs)
                # Assigning a plain tensor to a registered nn.Parameter attribute
                # raises TypeError in nn.Module.__setattr__; update the parameter's
                # storage instead.
                self.topic_term_matrix.data = self.maximize(docs, doc_topic_distribution)
                # Keep the fitted document-topic distribution instead of the
                # random initial values.
                self.doc_topic_matrix = doc_topic_distribution
                current_likelihood = self.likelihood(docs, doc_topic_distribution)
                if log: print(f"Iteration {iteration+1}, Likelihood {current_likelihood:.2f}")
                if abs(current_likelihood - prev_likelihood) < self.tolerance:
                    if log: print("LDA Converged")
                    break
                prev_likelihood = current_likelihood
            self.norm_topic_term_matrix = self.normalize(self.topic_term_matrix.detach())
        if log: print("LDA Fit ended")
        return self

    def expect(self, docs):
        """E-step: estimate the document-topic distribution.

        Args:
            docs (torch.Tensor): Document matrix.

        Returns:
            torch.Tensor: Row-normalised document-topic matrix of shape
            ``(n_documents, n_topics)``.
        """
        doc_topic_distribution = torch.matmul(docs, self.topic_term_matrix.T) + self.alpha
        doc_topic_distribution = self.normalize(doc_topic_distribution)
        return doc_topic_distribution

    def maximize(self, docs, doc_topic_distribution):
        """M-step: recompute the (unnormalised) topic-term matrix.

        Args:
            docs (torch.Tensor): Document matrix.
            doc_topic_distribution (torch.Tensor): Document-topic matrix
                produced by :meth:`expect`.

        Returns:
            torch.Tensor: Updated topic-term matrix of shape
            ``(n_topics, n_vocab)``.
        """
        topic_term_matrix = torch.matmul(doc_topic_distribution.T, docs) + self.beta
        return topic_term_matrix

    def likelihood(self, docs, doc_topic_distribution):
        """Compute the log-likelihood used for the convergence check.

        Args:
            docs (torch.Tensor): Document matrix.
            doc_topic_distribution (torch.Tensor): Document-topic matrix.

        Returns:
            float: ``sum(docs * log(doc_topic @ normalised_topic_term))``.
        """
        word_probabilities = torch.matmul(doc_topic_distribution, self.normalize(self.topic_term_matrix))
        # xlogy returns 0 where docs == 0, so a zero-probability word that never
        # occurs contributes 0 instead of 0 * log(0) = NaN.
        log_likelihood = torch.sum(torch.xlogy(docs, word_probabilities))
        return log_likelihood.item()

    def normalize(self, matrix):
        """Scale every row of ``matrix`` so that it sums to one.

        Rows whose sum is exactly zero are returned unchanged (all zeros)
        rather than being turned into NaN by a division by zero.

        Args:
            matrix (torch.Tensor): 2-D matrix to normalise.

        Returns:
            torch.Tensor: Row-normalised matrix.
        """
        row_sums = matrix.sum(dim=1, keepdim=True)
        safe_row_sums = torch.where(row_sums == 0, torch.ones_like(row_sums), row_sums)
        return matrix / safe_row_sums

    def transform(self, docs):
        """Project documents into the topic space of the fitted model.

        Args:
            docs (torch.Tensor): Document matrix of new documents.

        Returns:
            torch.Tensor: Row-normalised document-topic matrix for ``docs``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.norm_topic_term_matrix is None:
            raise ValueError("‚ùå –ú–æ–¥–µ–ª—å LDA –µ—â–µ –Ω–µ –æ–±—É—á–µ–Ω–∞.")
        docs = docs.to(self.device)
        with torch.no_grad():
            doc_topic_distribution = torch.matmul(docs, self.norm_topic_term_matrix.T) + self.alpha
            return self.normalize(doc_topic_distribution)

In [55]:
def vectorize_owners(df, method='log_scale', scaler=None):
    """Turn the ``estimated_owners`` column into a numeric feature column.

    Missing values are replaced by 0. With ``method='log_scale'`` the values
    are passed through ``log1p``, optionally scaled, and then weighted as
    ``v * (1 + 2 * v)``. With ``method='standard'`` the values are only
    optionally scaled.

    Args:
        df (pd.DataFrame): Frame containing an ``estimated_owners`` column.
        method (str, optional): ``'log_scale'`` or ``'standard'``.
            Defaults to ``'log_scale'``.
        scaler (optional): Fitted scaler exposing ``transform`` (for example a
            ``CumlMinMaxScaler``). When None no scaling is applied.

    Returns:
        np.ndarray: Column of vectorised owner values, one row per record.

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    column = df['estimated_owners'].values.reshape(-1, 1)
    values = np.nan_to_num(np.array(column, dtype=float), nan=0)

    if method not in ('log_scale', 'standard'):
        raise ValueError("‚ùå –ù–µ–¥–æ–ø—É—Å—Ç–∏–º—ã–π –º–µ—Ç–æ–¥ –≤–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏–∏ –≤–ª–∞–¥–µ–ª—å—Ü–µ–≤.")

    use_log = method == 'log_scale'
    if use_log:
        values = np.log1p(values)
    if scaler is not None:
        values = scaler.transform(values)
    if not use_log:
        return values
    # Quadratic weighting is applied after scaling.
    return values * (1 + (values * 2))

In [56]:
def vectorize_tags(df, multilabel_params=None):
    """Binarise the per-game tag lists with a ``MultiLabelBinarizer``.

    Args:
        df (pd.DataFrame): Frame containing an ``all_tags`` column of tag lists.
        multilabel_params (dict, optional): Keyword arguments for
            ``MultiLabelBinarizer``. When empty or None,
            ``{'sparse_output': False}`` is used.

    Returns:
        tuple: ``(tags_vectorized, mlb)`` where ``tags_vectorized`` is the
        binary indicator matrix and ``mlb`` is the fitted binarizer.
    """
    binarizer_kwargs = multilabel_params or {'sparse_output': False}
    mlb = MultiLabelBinarizer(**binarizer_kwargs)
    tag_lists = df['all_tags']
    # fit() returns the binarizer itself, so the calls can be chained.
    tags_vectorized = mlb.fit(tag_lists).transform(tag_lists)
    return tags_vectorized, mlb

In [57]:
def vectorize_descriptions(df, nmf_params=None, lda_params=None, vectorizer_cuml=None):
    """Vectorise game descriptions with TF-IDF followed by NMF or LDA.

    The fitted ``vectorizer_cuml`` transforms ``df['short_description_clean']``;
    the result is copied into a SciPy CSR matrix and a new topic model is
    fitted on it. NMF takes precedence when both parameter dicts are given.

    Args:
        df (pd.DataFrame): Frame containing a ``short_description_clean`` column.
        nmf_params (dict, optional): Keyword arguments for ``NMF``.
        lda_params (dict, optional): Keyword arguments for
            ``LatentDirichletAllocation``; used only when ``nmf_params`` is empty.
        vectorizer_cuml (CumlTfidfVectorizer, optional): Fitted TF-IDF vectorizer.

    Returns:
        tuple: ``(topic_vectors, model)`` - the document-topic matrix returned by
        ``fit_transform`` and the fitted NMF or LDA model.

    Raises:
        ValueError: If ``vectorizer_cuml`` is None, or if neither ``nmf_params``
            nor ``lda_params`` is provided.
    """
    if vectorizer_cuml is None:
        raise ValueError("‚ùå –ù–µ–æ–±—Ö–æ–¥–∏–º–æ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–∏—Ç—å –æ–±—É—á–µ–Ω–Ω—ã–π CumlTfidfVectorizer.")

    tfidf_device = vectorizer_cuml.transform(df['short_description_clean'])
    tfidf_host = tfidf_device.get()
    # Rebuild the matrix from its CSR components as a SciPy csr_matrix.
    csr_parts = (
        cp.asnumpy(tfidf_host.data),
        cp.asnumpy(tfidf_host.indices),
        cp.asnumpy(tfidf_host.indptr),
    )
    desc_vectorized_cpu = csr_matrix(csr_parts, shape=tfidf_host.shape)

    if not nmf_params and not lda_params:
        raise ValueError("‚ùå –ù–µ–æ–±—Ö–æ–¥–∏–º–æ —É–∫–∞–∑–∞—Ç—å nmf_params –∏–ª–∏ lda_params")

    if nmf_params:
        topic_model = NMF(**nmf_params)
    else:
        topic_model = LatentDirichletAllocation(**lda_params)
    topic_vectors = topic_model.fit_transform(desc_vectorized_cpu)
    return topic_vectors, topic_model

In [58]:
def calculate_topic_coherence(model, vectorizer, texts):
    """Compute the UMass topic coherence of a fitted topic model.

    The ten highest-weighted words of every topic are taken from
    ``model.components_`` and scored with gensim's ``CoherenceModel`` against
    the whitespace-tokenised ``texts``.

    Args:
        model: Fitted topic model (NMF or LDA) exposing ``components_``.
        vectorizer: Fitted TF-IDF vectorizer exposing ``get_feature_names``,
            which must return either a ``cudf`` Series or a list.
        texts (list): Texts that were used to train the model.

    Returns:
        float: The coherence score, or the sentinel ``-999`` when the feature
        names have an unsupported type, the model has no ``components_``, or
        any error occurs (the error is printed before returning).
    """
    try:
        feature_names_cuml = vectorizer.get_feature_names()

        if isinstance(feature_names_cuml, cudf.core.series.Series):
            feature_names = feature_names_cuml.to_pandas().tolist()
        elif isinstance(feature_names_cuml, list):
            # Previously this case left ``feature_names`` unbound, which raised
            # a NameError that was silently turned into -999.
            feature_names = feature_names_cuml
        else:
            return -999

        if not hasattr(model, 'components_'):
            return -999

        topic_vectors = model.components_
        if isinstance(topic_vectors, cp.ndarray):
            topic_vectors_np = topic_vectors.get()
        else:
            topic_vectors_np = topic_vectors

        # Column indices of every topic, ordered from highest to lowest weight.
        top_words_idx = topic_vectors_np.argsort()[:, ::-1]
        top_words = [[feature_names[i] for i in topic_word_idx[:10]] for topic_word_idx in top_words_idx]

        # Tokenise once and reuse for both the dictionary and the coherence model.
        tokenized_texts = [text.split() for text in texts]
        dictionary = Dictionary(tokenized_texts)

        cm = CoherenceModel(topics=top_words, texts=tokenized_texts, dictionary=dictionary, coherence='u_mass')
        return cm.get_coherence()
    except Exception as exc:
        # Best-effort metric: keep the sentinel return value, but surface the
        # reason instead of hiding it.
        print(f"calculate_topic_coherence failed: {exc!r}")
        return -999

In [59]:
def calculate_topic_diversity(model):
    """Measure topic diversity as the mean pairwise cosine distance.

    Cosine similarity is computed for every unordered pair of topic vectors in
    ``model.components_``; the result is ``1 - mean(similarity)``.

    Args:
        model: Fitted topic model (NMF or LDA) exposing ``components_``.

    Returns:
        float: Mean cosine distance between topics, or ``-1`` when the model
        has no ``components_`` attribute or fewer than two topics.
    """
    if not hasattr(model, 'components_'):
        print("‚ö†Ô∏è –ú–æ–¥–µ–ª—å –Ω–µ –∏–º–µ–µ—Ç –∞—Ç—Ä–∏–±—É—Ç–∞ components_.")
        return -1

    components = model.components_
    # CuPy arrays are copied to host memory; anything else is used as is.
    topic_vectors_np = components.get() if isinstance(components, cp.ndarray) else components

    num_topics = topic_vectors_np.shape[0]
    if num_topics < 2:
        print("‚ö†Ô∏è –ú–µ–Ω–µ–µ –¥–≤—É—Ö —Ç–µ–º. –ù–µ–≤–æ–∑–º–æ–∂–Ω–æ –≤—ã—á–∏—Å–ª–∏—Ç—å —Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏–µ.")
        return -1

    pair_similarities = []
    for first in range(num_topics):
        first_row = topic_vectors_np[first].reshape(1, -1)
        for second in range(first + 1, num_topics):
            second_row = topic_vectors_np[second].reshape(1, -1)
            pair_similarities.append(cosine_similarity(first_row, second_row)[0][0])

    if not pair_similarities:
        return -1

    mean_similarity = sum(pair_similarities) / len(pair_similarities)
    return 1 - mean_similarity

In [60]:
def calculate_intra_topic_diversity(model, feature_names, num_top_words=10):
    """Measure word diversity inside each topic via Shannon entropy.

    For every topic the ``num_top_words`` largest weights are normalised to
    sum to one and their base-2 entropy is computed; the mean over all topics
    is returned.

    Args:
        model: Fitted topic model (NMF or LDA) exposing ``components_``.
        feature_names (list): Feature (word) names of the TF-IDF vectorizer.
            Not used by the computation.
        num_top_words (int, optional): Number of top words per topic that
            enter the entropy. Defaults to 10.

    Returns:
        float: Mean entropy over all topics, or ``-1`` when the model has no
        ``components_`` attribute or contains no topics.
    """
    if not hasattr(model, 'components_'):
        print("‚ö†Ô∏è –ú–æ–¥–µ–ª—å –Ω–µ –∏–º–µ–µ—Ç –∞—Ç—Ä–∏–±—É—Ç–∞ components_.")
        return -1

    components = model.components_
    topic_vectors_np = components.get() if isinstance(components, cp.ndarray) else components

    if topic_vectors_np.shape[0] == 0:
        print("‚ö†Ô∏è –ù–µ—Ç —Ç–µ–º –¥–ª—è —Ä–∞—Å—á–µ—Ç–∞ —Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏—è.")
        return -1

    def _top_word_entropy(topic):
        # Entropy (in bits) of the normalised weights of the strongest words.
        strongest = np.argsort(topic)[::-1][:num_top_words]
        weights = topic[strongest]
        return entropy(weights / np.sum(weights), base=2)

    topic_entropies = [_top_word_entropy(topic) for topic in topic_vectors_np]
    return np.mean(topic_entropies) if topic_entropies else -1

In [61]:
def display_topics(model, feature_names, num_top_words=10):
    """Print the highest-weighted words of every topic.

    Args:
        model: Fitted topic model (NMF or LDA) exposing ``components_``.
        feature_names (list): Feature (word) names of the TF-IDF vectorizer,
            indexed by column position.
        num_top_words (int, optional): Number of words printed per topic.
            Defaults to 10.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"   –¢–µ–º–∞ #{topic_idx}:", end=' ')
        # Column indices ordered from the largest to the smallest weight.
        ranked_columns = weights.argsort()[::-1]
        words = (feature_names[col] for col in ranked_columns[:num_top_words])
        print(" ".join(words))
    print()

In [62]:
def display_topics_with_diversity(model, feature_names, num_top_words=25, num_display_words=10):
    """Print the top words of every topic together with their entropy.

    Args:
        model: Fitted topic model (NMF or LDA) exposing ``components_``.
        feature_names (list): Feature (word) names of the TF-IDF vectorizer,
            indexed by column position.
        num_top_words (int, optional): Number of top words that enter the
            entropy computation. Defaults to 25.
        num_display_words (int, optional): Number of those words that are
            printed for each topic. Defaults to 10.
    """
    if not hasattr(model, 'components_'):
        print("‚ö†Ô∏è –ú–æ–¥–µ–ª—å –Ω–µ –∏–º–µ–µ—Ç –∞—Ç—Ä–∏–±—É—Ç–∞ components_.")
        return

    components = model.components_
    topic_vectors_np = components.get() if isinstance(components, cp.ndarray) else components

    for topic_idx, topic in enumerate(topic_vectors_np):
        print(f"   –¢–µ–º–∞ #{topic_idx}. ", end=' ')
        strongest = np.argsort(topic)[::-1][:num_top_words]
        top_words = [feature_names[col] for col in strongest]
        shown_words = ' '.join(top_words[:num_display_words])
        print(f"   –¢–æ–ø-{num_display_words} —Å–ª–æ–≤: {shown_words}")
        # Base-2 entropy of the normalised weights of the strongest words.
        strongest_weights = topic[strongest]
        topic_entropy = entropy(strongest_weights / np.sum(strongest_weights), base=2)
        print(f"   –≠–Ω—Ç—Ä–æ–ø–∏—è —Ç–µ–º—ã: {topic_entropy:.4f}")
    print()

In [63]:
class CombinedVectorizer(BaseEstimator, TransformerMixin):
    """Combine owner counts, tags and description topics into one feature matrix.

    ``transform`` returns ``[owners | tags | topic vectors]`` stacked
    column-wise, where the topic vectors come from an NMF or LDA model fitted
    on TF-IDF features of the game descriptions.

    Args:
        owners_method (str, optional): Owners vectorisation method
            (``'log_scale'`` or ``'standard'``). Defaults to ``'log_scale'``.
        multilabel_params (dict, optional): Keyword arguments for
            ``MultiLabelBinarizer``. Defaults to None.
        nmf_params (dict, optional): Keyword arguments for NMF. When given
            (and ``lda_params`` is None) NMF vectorises the descriptions.
        lda_params (dict, optional): Keyword arguments for LDA. When given
            (and ``nmf_params`` is None) LDA vectorises the descriptions.
        tag_weight (float, optional): Multiplier applied to the binarised tags.
            Defaults to 1.0.
        tfidf_cuml_params (dict, optional): Keyword arguments for
            ``CumlTfidfVectorizer``. Defaults to None (no arguments).
    """
    def __init__(self, owners_method='log_scale', multilabel_params=None, nmf_params=None, lda_params=None, tag_weight=1.0, tfidf_cuml_params=None):
        self.owners_method = owners_method
        self.multilabel_params = multilabel_params
        self.nmf_params = nmf_params
        self.lda_params = lda_params
        self.tag_weight = tag_weight
        self.tfidf_cuml_params = tfidf_cuml_params if tfidf_cuml_params else {}
        self.tfidf_cuml = CumlTfidfVectorizer(**self.tfidf_cuml_params)
        self.mlb = None
        self.nmf = None
        self.lda = None
        self.tfidf_feature_names_out_ = None
        self.scaler = CumlMinMaxScaler()
        # Results of the most recent transform() call, kept for inspection.
        self.transformed_owners_vectors = None
        self.transformed_tags_vectors = None
        self.transformed_desc_vectors = None
        self.transformed_combined_vectors = None

    def _tfidf_feature_names(self):
        """Return the TF-IDF feature names ordered by column index."""
        vocabulary = self.tfidf_cuml.vocabulary_.to_pandas()
        if pd.api.types.is_numeric_dtype(vocabulary.dtype):
            # word -> column index mapping: order the words by their column index.
            return list(vocabulary.sort_values().index)
        # Otherwise the values themselves are the words, as assumed by
        # calculate_topic_coherence for get_feature_names().
        # NOTE(review): assumes the Series is stored in column order - confirm.
        return vocabulary.tolist()

    def _owners_before_scaling(self, X):
        """Return the owners values exactly as ``transform`` feeds them to the scaler."""
        owners = vectorize_owners(X, method='standard')
        if self.owners_method == 'log_scale':
            owners = np.log1p(owners)
        return owners

    def fit(self, X, y=None):
        """Fit the tag binarizer, TF-IDF vectorizer, topic model and owners scaler.

        Args:
            X (pd.DataFrame): Frame with the columns ``estimated_owners``,
                ``all_tags`` and ``short_description_clean``.
            y (None): Ignored; present for scikit-learn API compatibility.

        Returns:
            CombinedVectorizer: The fitted vectorizer (``self``).

        Raises:
            ValueError: Unless exactly one of ``nmf_params`` / ``lda_params``
                is provided, or if ``owners_method`` is not supported.
        """
        self.owners_vectors = vectorize_owners(X, method=self.owners_method)
        # vectorize_tags uses scikit-learn's MultiLabelBinarizer, which returns
        # NumPy or SciPy data, so no CuPy conversion is needed here.
        self.tags_vectors, self.mlb = vectorize_tags(X, multilabel_params=self.multilabel_params)
        if self.tags_vectors.ndim == 1:
            self.tags_vectors = self.tags_vectors.reshape(-1, 1)

        cleaned_descriptions = X['short_description_clean'].str.lower()
        self.tfidf_cuml.fit(cleaned_descriptions)
        self.tfidf_feature_names_out_ = self._tfidf_feature_names()

        if self.nmf_params and self.lda_params is None:
            self.desc_vectors, self.nmf = vectorize_descriptions(X, nmf_params=self.nmf_params, vectorizer_cuml=self.tfidf_cuml)
            self.lda = None
        elif self.lda_params and self.nmf_params is None:
            self.desc_vectors, self.lda = vectorize_descriptions(X, lda_params=self.lda_params, vectorizer_cuml=self.tfidf_cuml)
            self.nmf = None
        else:
            raise ValueError("‚ùå –ù–µ–æ–±—Ö–æ–¥–∏–º–æ —É–∫–∞–∑–∞—Ç—å nmf_params –∏–ª–∏ lda_params")
        print("‚úÖ –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –æ–ø–∏—Å–∞–Ω–∏–π –∑–∞–≤–µ—Ä—à–µ–Ω–∞")

        if self.nmf and hasattr(self.nmf, 'components_'):
            if np.isnan(self.nmf.components_).any():
                print("‚ö†Ô∏è –û–±–Ω–∞—Ä—É–∂–µ–Ω—ã NaN –∑–Ω–∞—á–µ–Ω–∏—è –≤ self.nmf.components_ –≤ fit()!")
        if self.lda and hasattr(self.lda, 'components_'):
            if np.isnan(self.lda.components_).any():
                print("‚ö†Ô∏è –û–±–Ω–∞—Ä—É–∂–µ–Ω—ã NaN –∑–Ω–∞—á–µ–Ω–∏—è –≤ self.lda.components_ –≤ fit()!")

        # Fit the scaler on the values it will see in transform(), i.e. before
        # the log_scale weighting. Fitting on the weighted values mis-scales
        # the owners feature.
        self.scaler.fit(self._owners_before_scaling(X))
        return self

    def transform(self, X, y=None):
        """Transform ``X`` into the combined feature matrix.

        The intermediate owners, tags and description vectors as well as the
        combined matrix are also stored on the instance
        (``transformed_*`` attributes).

        Args:
            X (pd.DataFrame): Frame with the same columns as used in ``fit``.
            y (None): Ignored; present for scikit-learn API compatibility.

        Returns:
            np.ndarray: Combined feature matrix ``[owners | tags | topics]``.

        Raises:
            ValueError: If the vectorizer has no fitted topic model, or if the
                number of rows of the owners and description vectors differ.
        """
        owners_vectors = vectorize_owners(X, method=self.owners_method, scaler=self.scaler)
        owners_vectors = owners_vectors.reshape(owners_vectors.shape[0], -1)
        tags_vectors = self.mlb.transform(X['all_tags']) * self.tag_weight

        # Rebuild the TF-IDF output from its CSR components as a SciPy matrix.
        tfidf_transformed_cuml = self.tfidf_cuml.transform(X['short_description_clean'])
        tfidf_transformed_cpu = tfidf_transformed_cuml.get()
        data = cp.asnumpy(tfidf_transformed_cpu.data)
        indices = cp.asnumpy(tfidf_transformed_cpu.indices)
        indptr = cp.asnumpy(tfidf_transformed_cpu.indptr)
        shape = tfidf_transformed_cpu.shape
        tfidf_transformed = csr_matrix((data, indices, indptr), shape=shape)

        # Dispatch on the model that was actually fitted rather than on the
        # constructor parameters, which may have changed since fit().
        if self.nmf is not None:
            desc_vectors = self.nmf.transform(tfidf_transformed)
        elif self.lda is not None:
            desc_vectors = self.lda.transform(tfidf_transformed)
        else:
            raise ValueError("CombinedVectorizer has no fitted topic model; call fit() first.")

        if desc_vectors.shape[0] != owners_vectors.shape[0]:
            raise ValueError(f"‚ùå –ù–µ—Å–æ–≤–ø–∞–¥–µ–Ω–∏–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –æ–±—Ä–∞–∑—Ü–æ–≤ –º–µ–∂–¥—É –≤–µ–∫—Ç–æ—Ä–∞–º–∏ –≤–ª–∞–¥–µ–ª—å—Ü–µ–≤ –∏ –æ–ø–∏—Å–∞–Ω–∏–π: {owners_vectors.shape[0]} vs {desc_vectors.shape[0]}")

        self.transformed_owners_vectors = owners_vectors
        self.transformed_tags_vectors = tags_vectors
        self.transformed_desc_vectors = desc_vectors

        combined_vectors = np.hstack([owners_vectors, tags_vectors.toarray() if hasattr(tags_vectors, 'toarray') else tags_vectors, desc_vectors])
        self.transformed_combined_vectors = combined_vectors

        return combined_vectors

    def get_params(self, deep=True):
        """Return the constructor parameters of the vectorizer.

        Args:
            deep (bool, optional): Accepted for scikit-learn API compatibility;
                not used. Defaults to True.

        Returns:
            dict: Mapping of parameter names to their current values.
        """
        return {
            'owners_method': self.owners_method,
            'multilabel_params': self.multilabel_params,
            'nmf_params': self.nmf_params,
            'lda_params': self.lda_params,
            'tag_weight': self.tag_weight,
            'tfidf_cuml_params': self.tfidf_cuml_params
        }

    def set_params(self, **params):
        """Set constructor parameters after initialisation.

        Unknown keys are ignored. Setting ``tfidf_cuml_params`` replaces the
        TF-IDF vectorizer with a new, unfitted instance.

        Args:
            **params: Parameter names and their new values.

        Returns:
            CombinedVectorizer: The vectorizer itself (``self``).
        """
        if 'owners_method' in params:
            self.owners_method = params['owners_method']
        if 'multilabel_params' in params:
            self.multilabel_params = params['multilabel_params']
        if 'nmf_params' in params:
            self.nmf_params = params['nmf_params']
        if 'lda_params' in params:
            self.lda_params = params['lda_params']
        if 'tag_weight' in params:
            self.tag_weight = params['tag_weight']
        if 'tfidf_cuml_params' in params:
            # Treat None like __init__ does, so ``**`` unpacking cannot fail.
            self.tfidf_cuml_params = params['tfidf_cuml_params'] or {}
            self.tfidf_cuml = CumlTfidfVectorizer(**self.tfidf_cuml_params)
        return self

In [64]:
def debug_vector_dimensions(model, train_df):
    """Print diagnostic information about the vector dimensions produced by a model.

    Helps to understand how the combined output of the pipeline is laid out:
    one owners column, then the tag columns, then the topic columns.

    Args:
        model: Fitted pipeline exposing ``named_steps`` with a ``'vectorizer'`` step
            and a ``transform`` method.
        train_df (pd.DataFrame): Training DataFrame passed to ``model.transform``.

    Returns:
        None. All information is written to stdout.
    """
    steps = model.named_steps
    if 'vectorizer' not in steps:
        print("‚ùå –û—à–∏–±–∫–∞: –í –º–æ–¥–µ–ª–∏ –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç —ç—Ç–∞–ø 'vectorizer'.")
        return

    vectorizer = steps['vectorizer']

    train_vectors = model.transform(train_df)
    print(f"üìê –§–æ—Ä–º–∞ –æ–±—â–∏—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤ –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: {train_vectors.shape}")

    # The owners feature is treated as a single column.
    owners_vector_size = 1
    mlb = getattr(vectorizer, 'mlb', None)
    if hasattr(mlb, 'classes_'):
        tags_vector_size = len(mlb.classes_)
    else:
        tags_vector_size = 0
        print("‚ö†Ô∏è –ü—Ä–µ–¥—É–ø—Ä–µ–∂–¥–µ–Ω–∏–µ: –ù–µ —É–¥–∞–ª–æ—Å—å –æ–ø—Ä–µ–¥–µ–ª–∏—Ç—å —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –≤–µ–∫—Ç–æ—Ä–æ–≤ —Ç–µ–≥–æ–≤.")

    print(f"üìè –†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –≤–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏–∏ –≤–ª–∞–¥–µ–ª—å—Ü–µ–≤: {owners_vector_size}")
    print(f"üìè –†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –≤–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏–∏ —Ç–µ–≥–æ–≤: {tags_vector_size}")

    # Topic columns are whatever follows the owners and tag columns.
    start_index_topics = owners_vector_size + tags_vector_size
    end_index_topics = train_vectors.shape[1]
    print(f"üìç –ù–∞—á–∞–ª—å–Ω—ã–π –∏–Ω–¥–µ–∫—Å —Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤: {start_index_topics}")
    print(f"üèÅ –ö–æ–Ω–µ—á–Ω—ã–π –∏–Ω–¥–µ–∫—Å —Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤: {end_index_topics}")

    train_topic_vectors = train_vectors[:, start_index_topics:end_index_topics]
    print(f"üìê –§–æ—Ä–º–∞ —Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤ –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–∏: {train_topic_vectors.shape}")

    # Optional intermediate results cached on the vectorizer; each is reported only if present.
    internal_reports = (
        ('transformed_owners_vectors', "üìè –†–∞–∑–º–µ—Ä –≤–µ–∫—Ç–æ—Ä–æ–≤ –≤–ª–∞–¥–µ–ª—å—Ü–µ–≤ (–≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π):"),
        ('transformed_tags_vectors', "üìè –†–∞–∑–º–µ—Ä –≤–µ–∫—Ç–æ—Ä–æ–≤ —Ç–µ–≥–æ–≤ (–≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π):"),
        ('transformed_desc_vectors', "üìè –†–∞–∑–º–µ—Ä —Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–∏—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤ (–≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π):"),
        ('transformed_combined_vectors', "üìè –†–∞–∑–º–µ—Ä –æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã—Ö –≤–µ–∫—Ç–æ—Ä–æ–≤ (–≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π):"),
    )
    for attribute_name, label in internal_reports:
        if hasattr(vectorizer, attribute_name):
            print(label, getattr(vectorizer, attribute_name).shape)

In [65]:
def get_recommendations_for_game(model, train_df, game_name, top_n=5):
    """Recommend games similar to ``game_name`` using cosine similarity.

    Both the requested game and the whole training set are vectorised with
    ``model.transform``; training rows are then ranked by cosine similarity to
    the first row matching ``game_name``.

    Args:
        model: Fitted pipeline exposing ``transform``.
        train_df (pd.DataFrame): Training DataFrame; must contain a ``'name'`` column.
        game_name (str): Name of the game to find recommendations for.
        top_n (int, optional): Number of games to return. Defaults to 5.

    Returns:
        list: Names of the recommended games, most similar first. Empty when
        ``train_df`` has no ``'name'`` column or the game is not present.
    """
    if 'name' not in train_df.columns:
        print("‚ùå –û—à–∏–±–∫–∞: DataFrame train_df –Ω–µ —Å–æ–¥–µ—Ä–∂–∏—Ç –∫–æ–ª–æ–Ω–∫—É 'name'.")
        sys.stdout.flush()
        return []

    matching_rows = train_df[train_df['name'] == game_name]
    if matching_rows.empty:
        print(f"‚ö†Ô∏è –ò–≥—Ä–∞ '{game_name}' –Ω–µ –Ω–∞–π–¥–µ–Ω–∞ –≤ train_df.")
        sys.stdout.flush()
        return []

    query_vectors = model.transform(matching_rows)
    candidate_vectors = model.transform(train_df)

    # Only the first matching row is used as the query.
    scores = cosine_similarity(query_vectors, candidate_vectors)[0]

    ranked = pd.DataFrame(
        {'name': train_df['name'], 'similarity': scores}
    ).sort_values(by='similarity', ascending=False)

    # Drop the query game itself before taking the top entries.
    others = ranked[ranked['name'] != game_name]
    return others.head(top_n)['name'].tolist()

In [66]:
def _feature_names_as_list(tfidf_model):
    """Return the vocabulary of ``tfidf_model``, converted to a Python list if it is a cuDF Series."""
    feature_names = tfidf_model.get_feature_names()
    if isinstance(feature_names, cudf.core.series.Series):
        feature_names = feature_names.to_pandas().tolist()
    return feature_names


def manual_hyperparameter_search(train_df, param_grid, results_file="manual_coherence_results.csv"):
    """Run a manual, coherence-based hyperparameter search.

    For every combination in ``param_grid`` a ``CombinedVectorizer`` pipeline is
    fitted on ``train_df``, the topic model (NMF or LDA) is scored for coherence,
    topic diversity and intra-topic diversity, and recommendations for
    "Stellaris" are generated as a qualitative check. The accumulated results
    are rewritten to ``results_file`` after each combination, so partial results
    survive an interrupted run.

    A combination that raises is not fatal: it is recorded with ``-1`` for all
    metrics and an empty recommendation list. Exactly one row is recorded per
    combination.

    Args:
        train_df (pd.DataFrame): Training DataFrame. The column
            ``'short_description_clean'`` is read here.
        param_grid (dict): Parameter grid; only keys prefixed with
            ``'vectorizer__'`` are forwarded to ``CombinedVectorizer``.
        results_file (str, optional): CSV path for the results.
            Defaults to "manual_coherence_results.csv".

    Returns:
        pd.DataFrame: One row per evaluated combination with the columns
        ``params``, ``coherence``, ``topic_diversity``, ``intra_topic_diversity``,
        ``time`` and ``recommendations_stellaris``. Empty if the grid is empty.
    """
    BLUE = '\033[94m'
    RESET = '\033[0m'
    vectorizer_prefix = 'vectorizer__'

    results = []
    # Bound up front so an empty grid returns an empty frame instead of raising UnboundLocalError.
    results_df = pd.DataFrame(results)
    grid = ParameterGrid(param_grid)
    total_iterations = len(grid)
    for i, params in enumerate(grid):
        print('------------------------------------------------------------------------------------------')

        start_time = time.time()
        print(f"üß™ –û—Ü–µ–Ω–∫–∞ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤: {params} \nüîÑ –ò—Ç–µ—Ä–∞—Ü–∏—è: ({i + 1}/{total_iterations})")

        # Filled in once the metrics are available; lets the except branch tell
        # "evaluation failed" apart from "reporting failed after a good evaluation".
        record = None
        try:
            # Strip only the leading step prefix so the rest of the key is kept intact.
            vectorizer_params = {
                key[len(vectorizer_prefix):]: value
                for key, value in params.items()
                if key.startswith(vectorizer_prefix)
            }
            vectorizer = CombinedVectorizer(**vectorizer_params)

            pipeline = Pipeline([('vectorizer', vectorizer)])
            pipeline.fit(train_df)

            tfidf_model = vectorizer.tfidf_cuml

            # Report descriptions whose TF-IDF row is all zeros.
            tfidf_matrix_cuml = tfidf_model.transform(train_df['short_description_clean'])
            row_sums = tfidf_matrix_cuml.sum(axis=1).get()  # .get() copies the device array to host
            zero_vector_indices = np.where(row_sums == 0)[0]

            if len(zero_vector_indices) > 0:
                print(f"‚ö†Ô∏è –û–±–Ω–∞—Ä—É–∂–µ–Ω—ã –Ω—É–ª–µ–≤—ã–µ –≤–µ–∫—Ç–æ—Ä—ã TF-IDF –¥–ª—è {len(zero_vector_indices)} –∏–≥—Ä:")
                zero_vector_game_ids = train_df.iloc[zero_vector_indices].index.tolist()
                print(f"üÜî ID –∏–≥—Ä —Å –Ω—É–ª–µ–≤—ã–º–∏ –≤–µ–∫—Ç–æ—Ä–∞–º–∏: {zero_vector_game_ids}")

            # Pick the configured topic model; NMF takes precedence over LDA.
            if vectorizer.nmf_params:
                model = vectorizer.nmf
                print("üé≠ –¢–µ–º—ã NMF:")
                missing_components_message = "‚ö†Ô∏è –ú–æ–¥–µ–ª—å NMF –Ω–µ –∏–º–µ–µ—Ç –∞—Ç—Ä–∏–±—É—Ç–∞ components_"
            elif vectorizer.lda_params:
                model = vectorizer.lda
                print("üé≠ –¢–µ–º—ã LDA:")
                missing_components_message = "‚ö†Ô∏è –ú–æ–¥–µ–ª—å LDA –Ω–µ –∏–º–µ–µ—Ç –∞—Ç—Ä–∏–±—É—Ç–∞ components_"
            else:
                raise ValueError("‚ùå NMF –∏–ª–∏ LDA –Ω–µ –Ω–∞—Å—Ç—Ä–æ–µ–Ω—ã –≤ CombinedVectorizer.")

            has_components = hasattr(model, 'components_')
            feature_names = _feature_names_as_list(tfidf_model) if has_components else None
            if has_components:
                display_topics_with_diversity(model, feature_names)
            else:
                print(missing_components_message)

            texts_for_coherence = train_df['short_description_clean'].tolist()
            coherence = calculate_topic_coherence(model, tfidf_model, texts_for_coherence)

            if has_components and np.isnan(model.components_).any():
                print("‚ö†Ô∏è –û–±–Ω–∞—Ä—É–∂–µ–Ω—ã NaN –∑–Ω–∞—á–µ–Ω–∏—è –≤ model.components_ –ø–µ—Ä–µ–¥ –≤—ã—á–∏—Å–ª–µ–Ω–∏–µ–º diversity!")

            diversity = calculate_topic_diversity(model)

            intra_topic_diversity = -1
            if has_components:
                intra_topic_diversity = calculate_intra_topic_diversity(model, feature_names)

            # Timing stops before the recommendation check, which is not part of the measured cost.
            end_time = time.time()
            recommendations_stellaris = get_recommendations_for_game(pipeline, train_df, "Stellaris", top_n=5)

            record = {
                'params': params,
                'coherence': coherence,
                'topic_diversity': diversity,
                'intra_topic_diversity': intra_topic_diversity,
                'time': end_time - start_time,
                'recommendations_stellaris': recommendations_stellaris
            }

            output_string = \
            f"""
            {BLUE}–ö–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å:{RESET}{BLUE}{coherence:>20.4f}{RESET}
            {BLUE}–†–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏–µ —Ç–µ–º:{RESET}{BLUE}{diversity:>17.4f}{RESET}
            {BLUE}–í–Ω—É—Ç—Ä–∏—Ç–æ–ø–∏–∫–æ–≤–æ–µ —Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏–µ:{RESET}{BLUE}{intra_topic_diversity:>4.4f}{RESET}
            {BLUE}–í—Ä–µ–º—è:{RESET}{BLUE}{end_time - start_time:>24.2f} —Å–µ–∫—É–Ω–¥{RESET}
            {BLUE}–†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –¥–ª—è Stellaris:{RESET}{BLUE}{', '.join(recommendations_stellaris) if recommendations_stellaris else '–ù–µ—Ç —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π'}{RESET}
            """
            print(output_string)

        except Exception as e:
            # Deliberate best-effort: one failing combination must not abort the whole search.
            end_time = time.time()
            print(f"‚ùå –û—à–∏–±–∫–∞ —Å –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏ {params}: {e}, –≤—Ä–µ–º—è: {end_time - start_time:.2f} —Å–µ–∫—É–Ω–¥")
            # Keep the real metrics if only the summary print failed; otherwise record a failure row.
            if record is None:
                record = {
                    'params': params,
                    'coherence': -1,
                    'topic_diversity': -1,
                    'intra_topic_diversity': -1,
                    'time': end_time - start_time,
                    'recommendations_stellaris': []
                }

        results.append(record)

        # Checkpoint after every combination.
        results_df = pd.DataFrame(results)
        results_df.to_csv(results_file, index=False)
        print(f"üíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {results_file}")

    return results_df

In [67]:
def train_best_model(train_df, best_params, model_path="best_model.pkl"):
    """Fit the final pipeline with the selected hyperparameters and pickle it.

    Args:
        train_df (pd.DataFrame): Full training data set.
        best_params (dict): Parameters chosen by the hyperparameter search, in
            the ``'vectorizer__<name>'`` form accepted by ``Pipeline.set_params``.
        model_path (str, optional): Destination of the pickled model.
            Defaults to "best_model.pkl".

    Returns:
        Pipeline: The fitted pipeline.
    """
    print("üöÄ –ù–∞—á–∞–ª–æ –æ–±—É—á–µ–Ω–∏—è –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏...")
    started_at = time.time()

    best_model = Pipeline([('vectorizer', CombinedVectorizer())]).set_params(**best_params)
    best_model.fit(train_df)

    with open(model_path, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print(f"üíæ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –ø–æ –ø—É—Ç–∏: {model_path}")

    # The reported duration covers both fitting and saving.
    elapsed_seconds = time.time() - started_at
    print(f"‚úÖ –û–±—É—á–µ–Ω–∏–µ –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏ –∑–∞–≤–µ—Ä—à–µ–Ω–æ –∑–∞ {elapsed_seconds:.2f} —Å–µ–∫—É–Ω–¥")
    return best_model

In [68]:
# NMF search space: only the number of topics varies; every other setting is fixed.
param_grid_nmf = {
    'vectorizer__owners_method': ['log_scale'],
    'vectorizer__multilabel_params': [{'sparse_output': True}],
    'vectorizer__nmf_params': [
        {'n_components': n_topics, 'init': 'nndsvda', 'solver': 'mu', 'beta_loss': 'frobenius'}
        for n_topics in (5, 25, 50, 100, 200)
    ],
    'vectorizer__lda_params': [None],
    'vectorizer__tfidf_cuml_params': [{'max_features': 10000}],
    'vectorizer__tag_weight': [1.0],
}

In [32]:
# LDA search space: each topic count is tried with both the batch and the online learning method.
param_grid_lda = {
    'vectorizer__owners_method': ['log_scale'],
    'vectorizer__multilabel_params': [{'sparse_output': True}],
    'vectorizer__nmf_params': [None],
    'vectorizer__lda_params': [
        lda_config
        for n_topics in (10, 50, 150)
        for lda_config in (
            {'n_components': n_topics, 'learning_method': 'batch', 'random_state': 42},
            {'n_components': n_topics, 'learning_method': 'online', 'learning_offset': 10., 'random_state': 42},
        )
    ],
    'vectorizer__tfidf_cuml_params': [{'max_features': 10000}],
    'vectorizer__tag_weight': [1.0],
}

In [33]:
# TF-IDF vocabulary-size search space: NMF is fixed at 50 topics, only max_features varies.
param_grid_tfidf_max_features = {
    'vectorizer__owners_method': ['log_scale'],
    'vectorizer__multilabel_params': [{'sparse_output': True}],
    'vectorizer__nmf_params': [
        {'n_components': 50, 'init': 'nndsvda', 'solver': 'mu', 'beta_loss': 'frobenius'},
    ],
    'vectorizer__lda_params': [None],
    'vectorizer__tfidf_cuml_params': [
        {'max_features': vocabulary_limit}
        for vocabulary_limit in (None, 2000, 5000, 10000, 20000)
    ],
    'vectorizer__tag_weight': [1.0],
}

In [34]:
# Tag-weight search space: NMF and TF-IDF settings are fixed, only tag_weight varies.
param_grid_tag_weight = {
    'vectorizer__owners_method': ['log_scale'],
    'vectorizer__multilabel_params': [{'sparse_output': True}],
    'vectorizer__nmf_params': [
        {'n_components': 50, 'init': 'nndsvda', 'solver': 'mu', 'beta_loss': 'frobenius'},
    ],
    'vectorizer__lda_params': [None],
    'vectorizer__tfidf_cuml_params': [{'max_features': 10000}],
    'vectorizer__tag_weight': [0.5, 1.0, 2.0, 3.0, 5.0],
}

In [None]:
# Run the coherence-based search over the NMF grid; results are also written to the CSV named below.
manual_results_nmf = manual_hyperparameter_search(
    train_df, param_grid_nmf, results_file="manual_coherence_results_nmf.csv"
)

In [None]:
# Show the full results table of the NMF search.
print("\nüìä –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –ø–æ–∏—Å–∫–∞ NMF:")
print(manual_results_nmf)

In [37]:
# Select the NMF run with the highest coherence and keep its parameter dict.
# Failed runs are recorded by manual_hyperparameter_search with coherence -1.
best_manual_result_nmf = manual_results_nmf.sort_values(by='coherence', ascending=False).iloc[0]
best_manual_params_nmf = best_manual_result_nmf['params']

In [None]:
# Report the best NMF parameters found by the coherence-based search.
print(f"\nüèÜ –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã NMF, –Ω–∞–π–¥–µ–Ω–Ω—ã–µ –ø–æ–∏—Å–∫–æ–º –Ω–∞ –æ—Å–Ω–æ–≤–µ –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏: {best_manual_params_nmf}")

In [None]:
# Run the coherence-based search over the LDA grid; results are also written to the CSV named below.
manual_results_lda = manual_hyperparameter_search(
    train_df, param_grid_lda, results_file="manual_coherence_results_lda.csv"
)

In [None]:
# Show the full results table of the LDA search.
print("\nüìä –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –ø–æ–∏—Å–∫–∞ LDA:")
print(manual_results_lda)

In [41]:
# Select the LDA run with the highest coherence and keep its parameter dict.
# Failed runs are recorded by manual_hyperparameter_search with coherence -1.
best_manual_result_lda = manual_results_lda.sort_values(by='coherence', ascending=False).iloc[0]
best_manual_params_lda = best_manual_result_lda['params']

In [None]:
# Report the best LDA parameters found by the coherence-based search.
print(f"\nüèÜ –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã LDA, –Ω–∞–π–¥–µ–Ω–Ω—ã–µ –ø–æ–∏—Å–∫–æ–º –Ω–∞ –æ—Å–Ω–æ–≤–µ –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏: {best_manual_params_lda}")

In [None]:
# Run the coherence-based search over the TF-IDF max_features grid.
# This must use param_grid_tfidf_max_features: passing param_grid_nmf here would
# simply repeat the NMF topic-count search under a different file name.
manual_results_max_features = manual_hyperparameter_search(
    train_df, param_grid_tfidf_max_features, results_file="manual_coherence_results_max_features.csv"
)

In [None]:
# Show the full results table of the max_features search.
print("\nüìä –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –ø–æ–∏—Å–∫–∞ Max Features:")
print(manual_results_max_features)

In [45]:
# Select the max_features run with the highest coherence and keep its parameter dict.
# Failed runs are recorded by manual_hyperparameter_search with coherence -1.
best_manual_result_max_features = manual_results_max_features.sort_values(by='coherence', ascending=False).iloc[0]
best_manual_params_max_features = best_manual_result_max_features['params']

In [None]:
# Report the best max_features parameters found by the coherence-based search.
print(f"\nüèÜ –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã Max Features, –Ω–∞–π–¥–µ–Ω–Ω—ã–µ –ø–æ–∏—Å–∫–æ–º –Ω–∞ –æ—Å–Ω–æ–≤–µ –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏: {best_manual_params_max_features}")

In [None]:
# Run the coherence-based search over the tag_weight grid; results are also written to the CSV named below.
manual_results_tag_weight = manual_hyperparameter_search(
    train_df, param_grid_tag_weight, results_file="manual_coherence_results_tag_weight.csv"
)

In [None]:
# Show the full results table of the tag_weight search
# (the tag-weight frame, not the max_features one).
print("\nüìä –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –ø–æ–∏—Å–∫–∞ Tag Weight:")
print(manual_results_tag_weight)

In [49]:
# Select the tag_weight run with the highest coherence and keep its parameter dict.
# Failed runs are recorded by manual_hyperparameter_search with coherence -1.
best_manual_result_tag_weight = manual_results_tag_weight.sort_values(by='coherence', ascending=False).iloc[0]
best_manual_params_tag_weight = best_manual_result_tag_weight['params']

In [None]:
# Report the best tag_weight parameters found by the coherence-based search.
print(f"\nüèÜ –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã Tag Weight, –Ω–∞–π–¥–µ–Ω–Ω—ã–µ –ø–æ–∏—Å–∫–æ–º –Ω–∞ –æ—Å–Ω–æ–≤–µ –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏: {best_manual_params_tag_weight}")

---

In [34]:
# Hand-picked final configuration, written as a one-point grid (every value is a
# single-element list) because it is fed to manual_hyperparameter_search below.
best_manual_params = {
    'vectorizer__owners_method': ['log_scale'],
    'vectorizer__multilabel_params': [{'sparse_output': True}],
    'vectorizer__nmf_params': [
        {'n_components': 50, 'init': 'nndsvda', 'solver': 'mu', 'beta_loss': 'frobenius'},
    ],
    'vectorizer__lda_params': [None],
    'vectorizer__tfidf_cuml_params': [{'max_features': 10000}],
    'vectorizer__tag_weight': [2.5],
}

In [None]:
# Evaluate the hand-picked configuration with the same search routine (a one-point grid).
manual_result = manual_hyperparameter_search(
    train_df, best_manual_params, results_file="manual_coherence_result.csv"
)

In [36]:
# Take the top-coherence row of the final run and keep its parameter dict.
# NOTE(review): this rebinds manual_result from the search DataFrame to a single row,
# so re-running this cell without re-running the search cell is expected to fail
# (Series.sort_values takes no `by` argument) — consider a distinct name.
manual_result = manual_result.sort_values(by='coherence', ascending=False).iloc[0]
manual_params = manual_result['params']

In [None]:
# Fit and pickle the final pipeline with the selected parameters.
# model_path is not defined in this cell — presumably set in an earlier configuration cell; verify.
manual_model = train_best_model(train_df, manual_params, model_path=model_path)