### **1. Предобработка текста**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pre-joined review dataset from CSV.
df = pd.read_csv('joined_with_onehot_to_use_final.csv')
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Cast these three columns to integer dtype, one column at a time.
# NOTE(review): astype(int) presumably fails if any of them holds missing values — confirm the CSV has none.
int_cols = ['year_of_foundation', 'cnt_houses_built', 'years_on_market']
for col in int_cols:
    df[col] = df[col].astype(int)

In [None]:
# Keep only rows whose sentiment_label is not 1, then renumber the index from 0.
# NOTE(review): a later cell assumes 0 = negative and 2 = positive, which would make 1 the neutral class — confirm.
df = df[df['sentiment_label'] != 1].reset_index(drop=True)

In [None]:
!pip install Natasha

In [None]:
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, Doc

# Shared Natasha components, built once and reused by lemmatize_text for every review.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
# The morphological tagger is constructed on top of the news embedding.
morph_tagger = NewsMorphTagger(emb)

def lemmatize_text(text: str) -> str:
    """Lemmatize ``text`` with Natasha and return its alphabetic lemmas joined by spaces.

    Args:
        text: Raw text to process.

    Returns:
        Space-separated lemmas. Tokens that have no lemma, or whose lemma is not
        purely alphabetic (punctuation, numbers, mixed tokens), are dropped.
        Blank input yields an empty string.
    """
    # Nothing to lemmatize: skip the segmenter and the neural tagger for blank input.
    if not text or not text.strip():
        return ''

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    lemmas = []
    for token in doc.tokens:
        # lemmatize() fills token.lemma using the morphological tags assigned above.
        token.lemmatize(morph_vocab)
        lemma = token.lemma
        if lemma is not None and lemma.isalpha():
            lemmas.append(lemma)
    return ' '.join(lemmas)

In [None]:
# Lemmatize every review; the column is cast to str first so lemmatize_text always receives a string.
# NOTE(review): astype(str) presumably turns missing reviews into the literal text 'nan', which would survive as a lemma — confirm there are no NaNs.
df['lemmatized_text'] = df['full_review_text'].astype(str).apply(lemmatize_text)

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every lemmatized review with a pretrained multilingual sentence-embedding model.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
text_embeddings = model.encode(df['lemmatized_text'].tolist(), show_progress_bar=True)
# Make sure downstream code receives a NumPy array.
text_embeddings = np.array(text_embeddings)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): numeric_features is not referenced again in this cell — possibly a leftover.
numeric_features = ['rating', 'years_on_market', 'avg_sqm_price', 'cnt_houses_built']

# log1p-transform price, house count and years on market.
df['avg_sqm_price_log'] = np.log1p(df['avg_sqm_price'])
df['cnt_houses_built_log'] = np.log1p(df['cnt_houses_built'])
# Despite the "_log" suffix the rating is copied unchanged.
df['rating_log'] = df['rating']
df['years_on_market_log'] = np.log1p(df['years_on_market'])

# Fit one scaler on the four transformed columns and store the result in the *_norm columns.
scaler = MinMaxScaler()
df[['rating_norm', 'years_on_market_norm', 'avg_sqm_price_norm', 'cnt_houses_built_norm']] = scaler.fit_transform(
    df[['rating_log', 'years_on_market_log', 'avg_sqm_price_log', 'cnt_houses_built_log']])

In [None]:
# Normalised numeric features, down-weighted by a factor of 0.3
numeric_matrix = df[['rating_norm', 'years_on_market_norm', 'avg_sqm_price_norm', 'cnt_houses_built_norm']].to_numpy()
numeric_matrix_weighted = 0.3 * numeric_matrix

# One-hot category features, down-weighted by a factor of 0.1.
# They are used only when all four columns exist; otherwise a zero-width
# matrix keeps the later hstack valid.
category_columns = ["Эконом", "Комфорт", "Бизнес", "Премиум"]
category_matrix = (
    df[category_columns].to_numpy().astype(float)
    if set(category_columns).issubset(df.columns)
    else np.empty((len(df), 0))
)

category_matrix_weighted = 0.1 * category_matrix

In [None]:
# Concatenate the text embeddings and all weighted features column-wise into one feature matrix X_all
X_all = np.hstack([text_embeddings, numeric_matrix_weighted, category_matrix_weighted])
print("Размер матрицы признаков X_all:", X_all.shape)

### **Кластеризация отзывов K-Means/HDBSCAN**

In [None]:
import umap.umap_ as umap

# Reduce X_all to 100 components with UMAP (cosine metric, fixed random_state).
reducer_100 = umap.UMAP(n_components=100, metric='cosine', random_state=42)
X_reduced_100 = reducer_100.fit_transform(X_all)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Choose the number of clusters (silhouette and elbow methods)
possible_k = list(range(2, 16))
silhouette_scores = []
inertias = []
X_for_clustering = X_reduced_100.copy()

from sklearn.preprocessing import normalize
# Rescale the rows before clustering (normalize() is called with its default settings).
X_for_clustering = normalize(X_for_clustering)

# For every candidate K fit KMeans once and record inertia and the mean silhouette.
for k in possible_k:
    kmeans_temp = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init='auto')
    labels_temp = kmeans_temp.fit_predict(X_for_clustering)
    inertias.append(kmeans_temp.inertia_)

    sil_score = silhouette_score(X_for_clustering, labels_temp, metric='euclidean')
    silhouette_scores.append(sil_score)
    print(f"K={k}: силуэт={sil_score:.3f}, инерция={kmeans_temp.inertia_:.0f}")

In [None]:
import matplotlib.pyplot as plt

# Figure 1: mean silhouette coefficient for every candidate K.
plt.figure(figsize=(6,4))
plt.plot(possible_k, silhouette_scores, marker='o')
plt.xlabel('Число кластеров K')
plt.ylabel('Средний коэффициент силуэта')
plt.title('Метод силуэта для выбора K')
plt.grid(True)
plt.show()

# Figure 2: KMeans inertia for every candidate K (elbow plot).
plt.figure(figsize=(6,4))
plt.plot(possible_k, inertias, marker='o', color='orange')
plt.xlabel('Число кластеров K')
plt.ylabel('Inertia (сумма квадратов расстояний)')
plt.title('Метод локтя для выбора K')
plt.grid(True)
plt.show()

In [None]:
# Final KMeans run with the chosen K on the normalised matrix X_for_clustering.
# The original note calls this "cosine"; the estimator itself is plain KMeans applied to the rescaled rows.
n_clusters = 6

kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto')
labels_km = kmeans.fit_predict(X_for_clustering)

In [None]:
# Separate 2-component UMAP projection of X_all, used for plotting only
# (clustering was done on the 100-component reduction).
reducer_2d = umap.UMAP(n_components=2, metric='cosine', random_state=42)
X_reduced_2d = reducer_2d.fit_transform(X_all)

# Scatter of the 2D projection coloured by KMeans label.
plt.figure(figsize=(8,6))
scatter = plt.scatter(X_reduced_2d[:,0], X_reduced_2d[:,1], c=labels_km, cmap='tab20', s=5, alpha=0.6)
plt.colorbar(label='KMeans cluster')
plt.title(f"KMeans (cosine) with {n_clusters} clusters")
plt.show()

In [None]:
# Store the KMeans label of every review next to the review itself.
df['cluster_km'] = labels_km

In [None]:
# Display the frame, now including the cluster_km column.
df

### **Тематическое моделирование: c-TF-IDF для ключевых слов кластеров**

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# NLTK's Russian stop-word list; read as a global by c_tf_idf.
russian_stopwords = stopwords.words("russian")

# One row per review: lemmatized text, its KMeans cluster ("Topic") and a running id
docs_df_kmeans = pd.DataFrame({
    'Doc': df['lemmatized_text'].astype(str),
    'Topic': labels_km,
    'Doc_ID': range(len(df))
})

# Join all texts of a topic into one long document per topic
docs_per_topic_kmeans = docs_df_kmeans.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for per-topic concatenated documents.

    Args:
        documents: Sequence of strings, one per topic (all texts of that topic joined).
        m: Number used as the numerator of the IDF term; the caller passes the
            total number of individual reviews.
        ngram_range: n-gram range forwarded to CountVectorizer.

    Returns:
        Tuple ``(tf_idf, count)``: ``tf_idf`` has shape (n_terms, n_topics) and
        ``count`` is the fitted CountVectorizer. Stop words come from the
        module-level ``russian_stopwords``.
    """
    count = CountVectorizer(ngram_range=ngram_range,
                            stop_words=russian_stopwords).fit(documents)

    t = count.transform(documents).toarray()
    w = t.sum(axis=1)  # number of counted tokens per topic

    # Term frequency inside each topic. A topic with no counted tokens has
    # w == 0; give it zeros instead of letting 0/0 turn its scores into NaN.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)  # total count of every term over all topics
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)

    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

tf_idf, count = c_tf_idf(docs_per_topic_kmeans.Doc.values, m=len(docs_df_kmeans))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring terms of every topic.

    Args:
        tf_idf: Score array of shape (n_terms, n_topics).
        count: Fitted vectorizer providing ``get_feature_names_out()``.
        docs_per_topic: Object whose ``Topic`` attribute lists the topic labels
            in the same order as the columns of ``tf_idf``.
        n: Number of terms to keep per topic.

    Returns:
        Dict mapping topic label to a list of ``(term, score)`` pairs, best first.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending: keep the last n columns and flip them so the best term leads.
    ranked = scores_by_topic.argsort()[:, -n:][:, ::-1]

    top_n_words = {}
    for row, label in enumerate(docs_per_topic.Topic):
        top_n_words[label] = [(vocabulary[col], scores_by_topic[row][col]) for col in ranked[row]]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic.

    Args:
        df: Frame with at least the columns ``Topic`` and ``Doc``.

    Returns:
        Frame with columns ``Topic`` and ``Size``, largest topic first.
    """
    doc_counts = df.groupby(['Topic'])['Doc'].count()
    size_table = doc_counts.reset_index().rename(columns={"Doc": "Size"})
    return size_table.sort_values("Size", ascending=False)

# Top-20 terms per KMeans topic, and the topic sizes (the ten largest are shown as cell output).
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic_kmeans, n=20)
topic_sizes = extract_topic_sizes(docs_df_kmeans); topic_sizes.head(10)

In [None]:
# Print the top-ranked terms for each of the ten largest KMeans topics.
largest_topics = topic_sizes.head(10)
for topic_id, size in zip(largest_topics['Topic'], largest_topics['Size']):
    print(f"\nТема {topic_id} (размер = {size})")
    for word, score in top_n_words[topic_id]:
        print(f"{word:<20} {score}")

### Linear Regression S-score

In [None]:
from sklearn.linear_model import LinearRegression

# Build the regression feature matrix: one-hot encoding of the KMeans clusters
cluster_labels = df['cluster_km'].values.reshape(-1, 1)

# Convert to one-hot (every cluster becomes its own indicator column)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_clusters_ohe = enc.fit_transform(cluster_labels)
print("Размер матрицы one-hot кластеров:", X_clusters_ohe.shape)

In [None]:
# 3.2. Linear regression: rating ~ cluster dummies (no intercept)
y = df['rating'].values
lr = LinearRegression(fit_intercept=False)
lr.fit(X_clusters_ohe, y)
# One coefficient per one-hot column, i.e. per cluster (in ascending order of cluster number)
coeffs = lr.coef_
# Sanity check: number of coefficients vs. number of one-hot columns
print(f"Число коэффициентов регрессии: {len(coeffs)}, число кластеров: {X_clusters_ohe.shape[1]}")

In [None]:
# 3.3. Min-max scale the topic weights (regression coefficients) into [0, 1]
coef_min, coef_max = coeffs.min(), coeffs.max()
coef_range = coef_max - coef_min
# If every coefficient is identical the range is 0; use all-zero weights
# instead of dividing by zero.
normalized_weights = (
    np.zeros_like(coeffs)
    if coef_range == 0
    else (coeffs - coef_min) / coef_range
)

In [None]:
# 3.4. Compute the S-score for every developer
# Dictionary of topic (cluster) weights, keyed by position in normalized_weights
cluster_weights = {cluster: w for cluster, w in enumerate(normalized_weights)}

# Labels of positive and negative reviews
# Presumably: sentiment_label = 2 for positive, = 0 for negative (after filtering)
pos_label = 2
neg_label = 0

# Total number of reviews per developer (denominator of the shares)
total_reviews_by_dev = df.groupby('developer').size().to_dict()

s_scores_km = {}  # developer -> S-score
# Review counts keyed by (developer, cluster), separately for positive and negative reviews
pos_counts = df[df['sentiment_label'] == pos_label].groupby(['developer', 'cluster_km']).size().to_dict()
neg_counts = df[df['sentiment_label'] == neg_label].groupby(['developer', 'cluster_km']).size().to_dict()

developers = df['developer'].unique()
for dev in developers:
    total = total_reviews_by_dev.get(dev, 0)
    if total == 0:
        continue
    s_value = 0.0
    # Sum the weighted difference of shares over all clusters
    for cluster, w in cluster_weights.items():
        # Positive and negative review counts of this developer in this cluster
        pos_count = pos_counts.get((dev, cluster), 0)
        neg_count = neg_counts.get((dev, cluster), 0)
        # Shares (relative to the developer's total number of reviews)
        p_share = pos_count / total
        n_share = neg_count / total
        # Contribution to the S-score
        s_value += w * (p_share - n_share)
    s_scores_km[dev] = s_value

In [None]:
# Turn the developer -> S-score dictionary into a two-column frame and print its first rows.
s_score_km_df = pd.DataFrame({
    'developer': list(s_scores_km.keys()),
    's_score_kmeans': list(s_scores_km.values())
})
print("Пример расчета S-score:")
print(s_score_km_df.head())

In [None]:
# 3.6. Quality of the regression model
from sklearn.metrics import mean_absolute_error, mean_squared_error

# In-sample metrics: predictions are made on the same rows the model was fitted on.
y_pred = lr.predict(X_clusters_ohe)
r2 = lr.score(X_clusters_ohe, y)
mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Качество модели: R^2 = {r2:.3f}, MAE = {mae:.3f}, RMSE = {rmse:.3f}")

In [None]:
# Rank developers by KMeans-based S-score, best first, and display the table.
s_score_km_df = s_score_km_df.sort_values(by='s_score_kmeans', ascending=False).reset_index(drop=True)
s_score_km_df

#### S-score with logistic regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from scipy.special import logit

In [None]:
# Features and target: KMeans cluster -> sentiment label.
# Note that y is rebound here; earlier cells used y for the rating.
X = df[['cluster_km']]
y = df['sentiment_label']

# One-hot encode the clusters (no category is dropped)
encoder = OneHotEncoder(sparse_output=False, drop=None)
X_encoded = encoder.fit_transform(X)

In [None]:
# Fit a logistic regression without intercept on the cluster indicators
logreg = LogisticRegression(fit_intercept=False, solver='liblinear')
logreg.fit(X_encoded, y)

# Take the first row of coefficients and key each value by its column position
cluster_coeffs = logreg.coef_[0]
cluster_weights_logreg = {i: coeff for i, coeff in enumerate(cluster_coeffs)}

In [None]:
# Review counts per developer and per (developer, cluster)
pos_label = 2
neg_label = 0

total_reviews_by_dev = df.groupby('developer').size().to_dict()
pos_counts = df[df['sentiment_label'] == pos_label].groupby(['developer', 'cluster_km']).size().to_dict()
neg_counts = df[df['sentiment_label'] == neg_label].groupby(['developer', 'cluster_km']).size().to_dict()

In [None]:
# S-score with logistic-regression weights: for every developer, sum over
# clusters of weight * (positive share - negative share).
s_scores_logreg = {}
developers = df['developer'].unique()
for dev in developers:
    total = total_reviews_by_dev.get(dev, 0)
    if total == 0:
        continue
    s_value = 0.0
    for cluster, w in cluster_weights_logreg.items():
        pos_count = pos_counts.get((dev, cluster), 0)
        neg_count = neg_counts.get((dev, cluster), 0)
        # Shares are relative to the developer's total number of reviews
        p_share = pos_count / total
        n_share = neg_count / total
        s_value += w * (p_share - n_share)
    s_scores_logreg[dev] = s_value

# Min-max normalisation to [0, 1]
s_values = np.array(list(s_scores_logreg.values()))
s_span = s_values.max() - s_values.min()
if s_span == 0:
    # All developers share the same score (e.g. a single developer): avoid 0/0,
    # mirroring the guard used for the regression weights.
    s_values_norm = np.zeros_like(s_values)
else:
    s_values_norm = (s_values - s_values.min()) / s_span

In [None]:
# Frame of developers with their normalised logistic-regression S-score.
s_score_df_logreg = pd.DataFrame({
    'developer': list(s_scores_logreg.keys()),
    's_score_logreg': s_values_norm
})

# Displayed sorted, best first; the sorted copy is not stored.
s_score_df_logreg.sort_values(by='s_score_logreg', ascending=False).reset_index(drop=True)

In [None]:
# Mean rating per developer
mean_ratings = df.groupby('developer')['rating'].mean().reset_index()
mean_ratings.columns = ['developer', 'avg_rating']

# Rescale the S-score to the 0-5 range so it is comparable with the rating.
# The column must be named 's_score_logreg_scaled': the validation and
# plotting cells read it under exactly that name.
s_min = s_score_df_logreg['s_score_logreg'].min()
s_max = s_score_df_logreg['s_score_logreg'].max()
s_range = s_max - s_min
if s_range == 0:
    # Degenerate case (all scores equal): avoid dividing by zero
    s_score_df_logreg['s_score_logreg_scaled'] = 0.0
else:
    s_score_df_logreg['s_score_logreg_scaled'] = 5 * (s_score_df_logreg['s_score_logreg'] - s_min) / s_range

# Join with the mean rating and display, highest-rated developer first
validation_df = s_score_df_logreg.merge(mean_ratings, on='developer')
validation_df.sort_values(by='avg_rating', ascending=False).reset_index(drop=True)

In [None]:
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr

# Predicted probability of the second class in logreg.classes_
y_pred = logreg.predict_proba(X_encoded)[:, 1]

# 0/1 target aligned with that probability column. The raw sentiment labels
# are not 0/1 (the notebook treats 0 as negative and 2 as positive), so using
# them directly in the Bernoulli log-likelihood below would be wrong.
y_bin = (np.asarray(y) == logreg.classes_[1]).astype(int)

# McFadden's Pseudo R²
epsilon = 1e-15
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

# Null model: constant probability equal to the share of the positive class
p_null = np.mean(y_bin)
p_null = np.clip(p_null, epsilon, 1 - epsilon)  # guard against log(0)

log_likelihood_full = np.sum(y_bin * np.log(y_pred) + (1 - y_bin) * np.log(1 - y_pred))
log_likelihood_null = np.sum(y_bin * np.log(p_null) + (1 - y_bin) * np.log(1 - p_null))
pseudo_r2 = 1 - (log_likelihood_full / log_likelihood_null)
print(f"McFadden's Pseudo R²: {pseudo_r2:.4f}")

# ROC - AUC
auc = roc_auc_score(y_bin, y_pred)
print(f"ROC-AUC: {auc:.4f}")

# Gini
gini = 2 * auc - 1
print(f"Gini: {gini:.4f}")

# Spearman rank correlation between the scaled S-score and the mean rating
y_pred_val = validation_df['s_score_logreg_scaled']
y_true_val = validation_df['avg_rating']
r_spear, _ = spearmanr(y_true_val, y_pred_val)
print(f"Spearman rank correlation between normalized s-score and average rating: {r_spear:.3f}")

In [None]:
import seaborn as sns

# Scatter plot of scaled S-score vs. mean rating with a fitted regression line
plt.figure(figsize=(10, 6))
sns.regplot(
    data=validation_df,
    x='s_score_logreg_scaled',
    y='avg_rating',
    ci=None,
    scatter_kws={'s': 50, 'alpha': 0.8},
    line_kws={'color': 'red'}
)
plt.title("Correlation between S-score and average rating of developer (Spearman)", fontsize=14)
# Axis label spelling fixed ("Normilized" -> "Normalized")
plt.xlabel("Normalized S-score", fontsize=12)
plt.ylabel("Average rating (1–5)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

### HDBSCAN clustering

In [None]:
!pip install optuna

In [None]:
import hdbscan
import optuna
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize

# HDBSCAN input: the 100-component UMAP reduction passed through normalize()
X_for_hdbscan = normalize(X_reduced_100)

# Option 1: hyper-parameter search with Optuna (maximise silhouette_score)
def objective(trial):
    """Optuna objective: silhouette score of an HDBSCAN clustering.

    Args:
        trial: Optuna trial used to sample ``min_cluster_size`` (10-50) and
            ``min_samples`` (1-20).

    Returns:
        Silhouette score computed on non-noise points only, or -1.0 when
        fewer than two clusters remain after dropping noise.
    """
    min_cluster_size = trial.suggest_int("min_cluster_size", 10, 50)
    min_samples = trial.suggest_int("min_samples", 1, 20)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric="euclidean"
    )
    labels = clusterer.fit_predict(X_for_hdbscan)

    # Label -1 marks noise; it is excluded before scoring
    mask = labels != -1
    if len(set(labels[mask])) < 2:
        return -1.0

    score = silhouette_score(X_for_hdbscan[mask], labels[mask])
    return score

# Seed the sampler so that re-running the notebook explores the same trials and
# reports the same best parameters (the rest of the notebook fixes its seeds at 42 as well).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

best_params = study.best_params

In [None]:
# Report the parameter set with the highest silhouette score found by the study.
print("Best hyperparameters for silhouette:", study.best_params)

In [None]:
def optimize_min_noise():
    """Grid-search HDBSCAN settings for the lowest share of noise points.

    Tries ``min_cluster_size`` in 10, 15, ..., 50 and ``min_samples`` in
    {1, 3, 5, 10, 15, 20} on the module-level ``X_for_hdbscan``.

    Returns:
        Dict with ``min_cluster_size``, ``min_samples`` and ``noise_ratio`` of
        the best configuration that produced at least two clusters, or
        ``None`` if no configuration qualified.
    """
    best_ratio = 1.0
    best_config = None

    for min_cluster_size in range(10, 51, 5):
        for min_samples in [1, 3, 5, 10, 15, 20]:
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                metric='euclidean'
            )
            labels = clusterer.fit_predict(X_for_hdbscan)
            noise_ratio = np.mean(labels == -1)

            # Count real clusters only: -1 is the noise label, not a cluster.
            # Counting it would wrongly reject a noise-free two-cluster result.
            n_found = len(set(labels) - {-1})

            if noise_ratio < best_ratio and n_found >= 2:
                best_ratio = noise_ratio
                best_config = {
                    "min_cluster_size": min_cluster_size,
                    "min_samples": min_samples,
                    "noise_ratio": noise_ratio
                }

    return best_config

best_noise_config = optimize_min_noise()

In [None]:
# Report the grid-search result (None if no configuration qualified).
print("Best hyperparameters for noise minimization:", best_noise_config)

In [None]:
# Rebuild the HDBSCAN input from the 100-component reduction.
X_for_hdbscan = X_reduced_100

X_for_hdbscan = normalize(X_for_hdbscan)

# Final clustering with hard-coded parameters.
# NOTE(review): these values are not read from best_params / best_noise_config — confirm they are the intended choice.
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=9, metric='euclidean')
labels_hdb = clusterer.fit_predict(X_for_hdbscan)
# Number of clusters, not counting the noise label -1
n_clusters_hdb = len(set(labels_hdb) - {-1})
print(f"HDBSCAN нашел кластеров: {n_clusters_hdb}, выбросов: {(labels_hdb == -1).sum()}")

In [None]:
# 4.3. Plot the HDBSCAN clusters on the 2D projection
plt.figure(figsize=(8,6))
outlier_mask = (labels_hdb == -1)
# Outlier (noise) points in light gray
plt.scatter(X_reduced_2d[outlier_mask,0], X_reduced_2d[outlier_mask,1], c='lightgray', s=5, alpha=0.5, label='outliers')
# Clustered points, coloured by label
plt.scatter(X_reduced_2d[~outlier_mask,0], X_reduced_2d[~outlier_mask,1], c=labels_hdb[~outlier_mask], cmap='tab20', s=5, alpha=0.6)
plt.colorbar()
plt.title('Кластеры HDBSCAN (UMAP 2D проекция)')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.legend(loc='upper right')
# The figure is written to the working directory before being shown.
plt.savefig('hdbscan_clusters_2d.png')
plt.show()

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# NLTK's Russian stop-word list; read as a global by c_tf_idf.
russian_stopwords = stopwords.words("russian")

# One row per review: lemmatized text, its HDBSCAN cluster ("Topic", -1 = noise) and a running id
docs_df_hdbscan = pd.DataFrame({
    'Doc': df['lemmatized_text'].astype(str),
    'Topic': labels_hdb,
    'Doc_ID': range(len(df))
})

# Join all texts of a topic into one long document per topic
docs_per_topic_hdbscan = docs_df_hdbscan.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for per-topic concatenated documents.

    This redefines the identically named helper from the KMeans section.

    Args:
        documents: Sequence of strings, one per topic (all texts of that topic joined).
        m: Number used as the numerator of the IDF term; the caller passes the
            total number of individual reviews.
        ngram_range: n-gram range forwarded to CountVectorizer.

    Returns:
        Tuple ``(tf_idf, count)``: ``tf_idf`` has shape (n_terms, n_topics) and
        ``count`` is the fitted CountVectorizer. Stop words come from the
        module-level ``russian_stopwords``.
    """
    count = CountVectorizer(ngram_range=ngram_range,
                            stop_words=russian_stopwords).fit(documents)

    t = count.transform(documents).toarray()
    w = t.sum(axis=1)  # number of counted tokens per topic

    # Term frequency inside each topic. A topic with no counted tokens has
    # w == 0; give it zeros instead of letting 0/0 turn its scores into NaN.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)  # total count of every term over all topics
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)

    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

tf_idf_hdbscan, count_hdbscan = c_tf_idf(docs_per_topic_hdbscan.Doc.values, m=len(docs_df_hdbscan))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring terms of every topic.

    This redefines the identically named helper from the KMeans section.

    Args:
        tf_idf: Score array of shape (n_terms, n_topics).
        count: Fitted vectorizer providing ``get_feature_names_out()``.
        docs_per_topic: Object whose ``Topic`` attribute lists the topic labels
            in the same order as the columns of ``tf_idf``.
        n: Number of terms to keep per topic.

    Returns:
        Dict mapping topic label to a list of ``(term, score)`` pairs, best first.
    """
    terms = count.get_feature_names_out()
    per_topic = tf_idf.T
    # Ascending argsort -> keep the last n positions, then reverse to get best-first order.
    best_first = per_topic.argsort()[:, -n:][:, ::-1]

    top_n_words = {}
    for topic_row, topic_label in enumerate(docs_per_topic.Topic):
        top_n_words[topic_label] = [
            (terms[term_idx], per_topic[topic_row][term_idx])
            for term_idx in best_first[topic_row]
        ]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents of every topic.

    This redefines the identically named helper from the KMeans section.

    Args:
        df: Frame with at least the columns ``Topic`` and ``Doc``.

    Returns:
        Frame with columns ``Topic`` and ``Size``, largest topic first.
    """
    per_topic_counts = df.groupby(['Topic'])['Doc'].count()
    sizes = per_topic_counts.reset_index().rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# Top-20 terms per HDBSCAN topic, and the topic sizes (the ten largest are shown as cell output).
top_n_words_hdbscan = extract_top_n_words_per_topic(tf_idf_hdbscan, count_hdbscan, docs_per_topic_hdbscan, n=20)
topic_sizes_hdbscan = extract_topic_sizes(docs_df_hdbscan); topic_sizes_hdbscan.head(10)

In [None]:
# Print the top-ranked terms for each of the ten largest HDBSCAN topics.
largest_topics_hdbscan = topic_sizes_hdbscan.head(10)
for topic_id, size in zip(largest_topics_hdbscan['Topic'], largest_topics_hdbscan['Size']):
    print(f"\nТема {topic_id} (размер = {size})")
    for word, score in top_n_words_hdbscan[topic_id]:
        print(f"{word:<20} {score}")

In [None]:
# Store the HDBSCAN label of every review (-1 marks noise).
df["cluster_hdb"] = labels_hdb

In [None]:
# One-hot representation of the HDBSCAN clusters for the regression
cluster_labels_hdb = df['cluster_hdb'].values.reshape(-1, 1)
# Re-fits the encoder object created for the KMeans labels; after this call it describes the HDBSCAN labels.
X_clusters_hdb_ohe = enc.fit_transform(cluster_labels_hdb)
print("One-hot форма кластеров HDBSCAN:", X_clusters_hdb_ohe.shape)

In [None]:
# Linear regression: rating ~ cluster_hdb (one-hot), no intercept
y_hdb = df['rating'].values
lr_hdb = LinearRegression(fit_intercept=False)
lr_hdb.fit(X_clusters_hdb_ohe, y_hdb)
# One coefficient per one-hot column
coeffs_hdb = lr_hdb.coef_
print(f"Количество кластеров HDBSCAN: {X_clusters_hdb_ohe.shape[1]}, коэффициентов: {len(coeffs_hdb)}")

In [None]:
# Min-max normalise the HDBSCAN topic weights (regression coefficients) to [0, 1]
coef_min_hdb = coeffs_hdb.min()
coef_max_hdb = coeffs_hdb.max()
if coef_max_hdb - coef_min_hdb == 0:
    # All coefficients equal: use zero weights instead of dividing by zero
    normalized_weights_hdb = np.zeros_like(coeffs_hdb)
else:
    normalized_weights_hdb = (coeffs_hdb - coef_min_hdb) / (coef_max_hdb - coef_min_hdb)

# Key every weight by the real cluster label of its one-hot column.
# OneHotEncoder orders columns by sorted unique label, and HDBSCAN labels start
# at -1 (noise), so enumerate() (0, 1, 2, ...) would shift every weight onto
# the wrong cluster and leave the noise label without a weight.
hdb_cluster_ids = np.unique(df['cluster_hdb'].values).tolist()
cluster_weights_hdb = dict(zip(hdb_cluster_ids, normalized_weights_hdb))

In [None]:
# Compute the S-score for every developer (HDBSCAN clusters).
# pos_label and neg_label are reused from the earlier S-score cells.
total_reviews_by_dev_hdb = df.groupby('developer').size().to_dict()
pos_counts_hdb = df[df['sentiment_label'] == pos_label].groupby(['developer', 'cluster_hdb']).size().to_dict()
neg_counts_hdb = df[df['sentiment_label'] == neg_label].groupby(['developer', 'cluster_hdb']).size().to_dict()

s_scores_hdb = {}
for dev in df['developer'].unique():
    total = total_reviews_by_dev_hdb.get(dev, 0)
    if total == 0:
        continue
    s_value = 0.0
    # Sum of weight * (positive share - negative share) over the keys of cluster_weights_hdb
    for cluster, w in cluster_weights_hdb.items():
        pos_count = pos_counts_hdb.get((dev, cluster), 0)
        neg_count = neg_counts_hdb.get((dev, cluster), 0)
        p_share = pos_count / total
        n_share = neg_count / total
        s_value += w * (p_share - n_share)
    s_scores_hdb[dev] = s_value

# Tabulate the scores and print the first rows
s_score_hdb_df = pd.DataFrame({
    'developer': list(s_scores_hdb.keys()),
    's_score_hdbscan': list(s_scores_hdb.values())
})
print("Пример S-score (HDBSCAN) - топ 5 записей:")
print(s_score_hdb_df.head())

In [None]:
# Display developers ranked by HDBSCAN-based S-score, best first (the sorted copy is not stored).
s_score_hdb_df.sort_values(by='s_score_hdbscan', ascending=False).reset_index(drop=True)

In [None]:
# In-sample quality of the HDBSCAN-cluster regression (same rows used for fitting and scoring).
y_pred_hdb = lr_hdb.predict(X_clusters_hdb_ohe)
r2_hdb = lr_hdb.score(X_clusters_hdb_ohe, y_hdb)
mae_hdb = mean_absolute_error(y_hdb, y_pred_hdb)
rmse_hdb = np.sqrt(mean_squared_error(y_hdb, y_pred_hdb))
print(f"Качество модели (HDBSCAN кластеры): R^2 = {r2_hdb:.3f}, MAE = {mae_hdb:.3f}, RMSE = {rmse_hdb:.3f}")