# Data and imports


In [18]:
import polars as pl
import unicodedata
from pathlib import Path
import html
import re
from tqdm.auto import tqdm  # Added tqdm import
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
import nltk
from nltk.corpus import stopwords
import torch, platform
import random
import numpy as np
from keybert import KeyBERT
import pymorphy3
import spacy

# Path
PATH_TO_ITEMS = Path().cwd().parent / "data" / "modified_data" / "items.parquet"
PATH_TO_NEW_ITEMS = Path().cwd().parent / "data" / "modified_data" / "new_items.parquet"

In [2]:
# For MacOs to check if MPS is available

print("macOS:", platform.mac_ver()[0], "  PyTorch:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())
print("MPS built‑in:", torch.backends.mps.is_built())

spacy.prefer_gpu()
print(f"spaCy GPU usage: {'Enabled' if spacy.require_gpu() else 'Disabled'}")

macOS:    PyTorch: 2.7.0+cu126
MPS available: False
MPS built‑in: False
spaCy GPU usage: Enabled


In [3]:
# Specifing seed value for reproducibility

SEED = 42

random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
pl.set_random_seed(SEED)


In [4]:
items_df = pl.read_parquet(PATH_TO_ITEMS)

# Checking if there is any null value in the DataFrame
null_sum = items_df.null_count().sum_horizontal()[0]

assert null_sum == 0, f"There are {null_sum} null values in the DataFrame"


# Clustering


## Keywords problem


- Проверим, насколько наши текущие ключевые слова подходят для кластеризации


In [9]:
# Print samples of dataset's keywords
sample_dataset = (
    items_df.sample(2)
    .select("keywords", "countries", "title", "description")
    .rows(named=True)
)

for item in sample_dataset:
    print(f"Title: {item['title']}")
    print(f"Countries: {item['countries']}")
    print(f"Keywords: {item['keywords'][:150]}...")
    print("-" * 80)

Title: с прицепом
Countries: сша
Keywords: прицепом, 2017, США...
--------------------------------------------------------------------------------
Title: виктория
Countries: великобритания
Keywords: Виктория, 2016, Великобритания, брак, короли, королевы, коррупция, отцы, дети, политика, свадьбы, семейные, проблемы, семья, отношения, отношения, муж...
--------------------------------------------------------------------------------


- Во-первых, идет дубляция страны, что не хорошо. Зачем нам заниматься увеличением токенов, которые у нас повторяются в графе Стран
- Видно, что предложения по типу "борьба за выживание" разделены запятой, а не точкой с запятой

- Необходимо придумать более осмысленные ключевые слова!


### Bert for keywords


In [5]:
# Model

# If mac, i am using Mac for this example
# device = "mps" if torch.backends.mps.is_available() else "cpu"

# If you have gpu uncomment the line below
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer("cointegrated/LaBSE-en-ru", device=device)

kw_model = KeyBERT(model)  # type: ignore

In [6]:
import nltk

nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
print(f"Loaded {len(russian_stopwords)} Russian stopwords")

Loaded 151 Russian stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def preclean(text: str) -> str:
    text = html.unescape(text)  # &amp; → &
    text = unicodedata.normalize("NFKC", text)  # длинные тире → обычные
    text = re.sub(r"<[^>]+>", " ", text)  # убираем HTML
    text = re.sub(r"\d{4}", " ", text)  # опц.: убираем года
    text = re.sub(r"[^\S\n]+", " ", text)  # множественные пробелы
    return text.strip()


def extract_cleaned_kw(text: str) -> str:
    clean_text = preclean(text)

    extracted_keywords_with_scores = kw_model.extract_keywords(
        clean_text,  # Corrected from 'text' to 'clean_text'
        use_mmr=True,
        use_maxsum=True,
        top_n=8,
        threshold=0.35,
        stop_words=russian_stopwords,
        keyphrase_ngram_range=(1, 2),
    )

    keyword_phrases = [phrase for phrase, score in extracted_keywords_with_scores]

    return ", ".join(keyword_phrases)

In [8]:
for item in sample_dataset:
    print(f"current keywords: {item['keywords']}")
    print(f"New keywords: {extract_cleaned_kw(item['description'])}")
    print("-" * 80)

NameError: name 'sample_dataset' is not defined

- Перепробовал кучу способов, ничего лучше не смог найти.


In [10]:
from tqdm import tqdm

# Calculate the number of items that will be processed by extract_cleaned_kw
num_items_to_process = items_df.filter(pl.col("description") != "-").height

# Initialize tqdm progress bar
pbar = tqdm(total=num_items_to_process, desc="Extracting Keywords")


# Define a wrapper function that calls the original extract_cleaned_kw and updates the progress bar
def extract_cleaned_kw_with_pbar_update(text: str) -> str:
    result = extract_cleaned_kw(text)  # Call the original function
    pbar.update(1)
    return result


# Apply the transformation
items_df = items_df.with_columns(
    pl.when(pl.col("description") != "-")
    .then(
        pl.col("description").map_elements(
            extract_cleaned_kw_with_pbar_update, return_dtype=pl.Utf8
        )
    )
    .otherwise(pl.col("keywords"))
    .alias("new_keywords")
)

# Close the progress bar
pbar.close()

Extracting Keywords: 15963it [28:29,  9.34it/s]                           
Extracting Keywords: 15963it [28:29,  9.34it/s]


In [None]:
items_df.to_parquet(
    PATH_TO_NEW_ITEMS,
)

AttributeError: 'PosixPath' object has no attribute 'child'