In [None]:
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [None]:
# Load the exported chat history with explicit dtypes for the id and text columns.
chat_df = pd.read_csv(
    'chat_data/chat_history 2024-11-08 v3.csv',
    dtype={'id': 'int32', 'from_id': 'str', 'text': 'str', 'reply_to_id': 'int32'},
)
# Parse both timestamp columns into datetimes.
for timestamp_col in ('date', 'edited'):
    chat_df[timestamp_col] = pd.to_datetime(chat_df[timestamp_col])
# Keep only the rows whose text does not start with the 'File: <' prefix.
chat_df = chat_df.loc[~chat_df['text'].str.startswith('File: <')]


def load_embeddings(folder, m=0, n=-1):
    """Load the tensors stored in ``folder`` with ``torch.load``.

    Args:
        folder: Directory whose entries are all passed to ``torch.load``.
        m: Index of the first file to load. Invalid values (non-int,
            negative, or not smaller than the effective ``n``) fall back to 0.
        n: Index one past the last file to load. Invalid values (non-int,
            smaller than 1, or larger than the number of files) fall back
            to "all files"; the default ``-1`` therefore means "to the end".

    Returns:
        A list with one loaded object per file (empty if there are no files).

    NOTE(review): files are taken in ``os.listdir`` order, which is not
    guaranteed to be sorted — confirm that downstream row alignment does
    not depend on a particular order.
    """
    files = os.listdir(folder)

    # Normalise the upper bound first: validating ``m`` against the raw
    # default ``n=-1`` would reset every positive ``m`` to 0.
    if not isinstance(n, int) or n < 1 or n > len(files):
        n = len(files)

    if not isinstance(m, int) or m < 0 or m >= n:
        m = 0

    all_embeds = []
    for position, name in enumerate(files[m:n]):
        all_embeds.append(torch.load(os.path.join(folder, name)))
        # Progress output every 200 files; the factor 50 is presumably the
        # number of embeddings per file — TODO confirm.
        if position > 0 and position % 200 == 0:
            print(position * 50)

    return all_embeds


# Concatenate every tensor loaded from the 'embeds' folder into a single tensor
# and move it to the CPU.
embeds = torch.cat(load_embeddings('embeds')).cpu()
# Show the shape of the combined tensor as the cell output.
embeds.shape

In [None]:
# Attach the cluster labels produced by each clustering algorithm.
# NOTE(review): this re-reads the CSV without the dtype and datetime handling
# used in the loading cell above, replacing that `chat_df`.
chat_df = pd.read_csv('chat_data/chat_history 2024-11-08 v3.csv')
chat_df = chat_df[~chat_df.text.str.startswith('File: <')]
# One label table per algorithm; the first CSV column is used as the index.
labels = [pd.read_csv('clusters/kmeans_labels1.csv', index_col=0), 
          pd.read_csv('clusters/spkmeans_labels6.csv', index_col=0), 
          pd.read_csv('clusters/gmm_labels3.csv', index_col=0),
          pd.read_csv('clusters/dbscan_labels7.csv', index_col=0), 
          pd.read_csv('clusters/hdbscan_labels1.csv', index_col=0)]

# Table with the two-dimensional projection of every dataset element.
projections = pd.read_csv('clusters/proj5.csv', index_col=0)
# Column-wise concatenation: rows are aligned on the index.
chat_df = pd.concat([chat_df, *labels, projections], axis=1)
chat_df

## k-means

In [None]:
# Load the saved k-means labels and attach them to the chat frame.
kmeans_labels = pd.read_csv('clusters/kmeans_labels1.csv', index_col=0)
# The manual-merge cell further down rewrites this CSV with two columns
# ('kmeans', 'new_kmeans'); assigning the whole frame to a single column would
# then raise, so take the first (original k-means) column explicitly.
chat_df['kmeans'] = kmeans_labels.iloc[:, 0]

# Collect the messages of every cluster and order the clusters by size.
cluster_df = chat_df.groupby('kmeans', as_index=False).text.apply(list)
cluster_df['size'] = cluster_df.text.apply(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, line in cluster_df.iterrows():
    print(f'Кластер {line.kmeans} ({len(line.text)})\n')
    # Show at most 10 messages per cluster, each cut to 100 characters.
    for number, message in enumerate(line.text[:10], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')
    

In [None]:
# Вывод слов и их частот для каждого кластера для KMeans
from collections import Counter

import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


def to_words(messages):
    """Turn an iterable of message strings into one flat list of tokens.

    Every message is lower-cased and tokenised with NLTK's ``TweetTokenizer``;
    Russian stop words (NLTK corpus) and empty tokens are then removed.

    Args:
        messages: Iterable of strings.

    Returns:
        List of remaining tokens, in order of appearance.
    """
    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = [token
              for message in messages
              for token in tokenizer.tokenize(message.lower())]

    russian_stop_words = set(stopwords.words('russian'))
    return [token for token in tokens if token and token not in russian_stop_words]


def make_and_print_clusters(chat_df, labels_col, print_clusters=True):
    """Build a per-cluster table of lemmatised words and their frequencies.

    Args:
        chat_df: DataFrame with a ``text`` column and the ``labels_col`` column.
        labels_col: Name of the column holding the cluster label of each row.
        print_clusters: When true, print the 14 most frequent lemmas of
            every cluster.

    Returns:
        DataFrame sorted by cluster size (descending, index reset) with the
        columns ``labels_col``, ``text`` (list of lemmas), ``size`` (number of
        messages) and ``word_counts`` (``(lemma, count)`` pairs with
        count > 1, most frequent first).
    """
    cluster_df = chat_df.groupby(labels_col, as_index=False).text.apply(list)
    cluster_df['size'] = cluster_df.text.apply(len)
    cluster_df = cluster_df.sort_values('size', ascending=False).reset_index(drop=True)

    cluster_df.text = cluster_df.text.apply(to_words)

    # Lemmatisation. Tokens repeat heavily across messages, so each distinct
    # token is analysed only once and the result is reused.
    lemmatizer = pymorphy2.MorphAnalyzer()
    lemma_cache = {}

    def lemmatize(word):
        lemma = lemma_cache.get(word)
        if lemma is None:
            lemma = lemma_cache[word] = lemmatizer.parse(word)[0].normal_form
        return lemma

    cluster_df.text = cluster_df.text.apply(lambda words: [lemmatize(word) for word in words])

    # Count every lemma per cluster; lemmas that occur only once are dropped.
    # most_common() orders by count descending and keeps first-seen order on ties.
    cluster_df['word_counts'] = cluster_df.text.apply(
        lambda words: [(w, k) for w, k in Counter(words).most_common() if k > 1])

    if print_clusters:
        for _, line in cluster_df.iterrows():
            print(line[labels_col], f'кластер ({line["size"]})\n')

            for rank, (w, k) in enumerate(line.word_counts[:14], start=1):
                print(f'{rank}. {w[:100]} ({k})')

            print('\n')

    return cluster_df

In [None]:
# Build the per-cluster lemma frequency table for the k-means labels and print the top words.
cluster_df = make_and_print_clusters(chat_df, 'kmeans')

In [None]:
# Visualize word clouds for each cluster
from wordcloud import WordCloud


def clusters_wordclouds(cluster_df, labels_col, max_words=25, max_font_size=100):
    """Draw one word cloud per cluster on a grid that is four panels wide.

    Args:
        cluster_df: One row per cluster with a ``text`` column (list of
            words), a ``size`` column and the ``labels_col`` column.
        labels_col: Name of the column with the cluster label (used in titles).
        max_words: Maximum number of words per cloud.
        max_font_size: Maximum font size inside a cloud.
    """
    n_cols = 4
    n_clusters = cluster_df.shape[0]
    n_rows = max(1, (n_clusters + n_cols - 1) // n_cols)

    # squeeze=False keeps the axes array two-dimensional even for a single
    # row; otherwise four or fewer clusters cannot be indexed as a grid.
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.set_figwidth(n_cols * 6)
    fig.set_figheight(n_rows * 5)
    fig.subplots_adjust(0, 0.05, 1, 0.95)

    panels = axes.ravel()
    # Panels are filled in row order of cluster_df, independent of its index labels.
    for panel, (_, line) in zip(panels, cluster_df.iterrows()):
        text_wordcloud = ' '.join(line.text)
        try:
            wordcloud = WordCloud(width=600, height=450, max_font_size=max_font_size, max_words=max_words,
                                  colormap='Dark2_r', background_color="white").generate(text_wordcloud)
        except ValueError:
            # Generation failed for this cluster (e.g. no usable words): leave the panel blank.
            panel.axis("off")
            continue

        panel.imshow(wordcloud)
        panel.set_title(f'Кластер {line[labels_col]} ({line["size"]})', fontsize=24)
        panel.axis("off")

    # Hide the unused panels of the last row.
    for panel in panels[n_clusters:]:
        panel.axis("off")

    plt.show()

In [None]:
# Draw the word clouds for the k-means clusters.
clusters_wordclouds(cluster_df, 'kmeans')

#### Ручное объединение кластеров

In [None]:
def group_clusters(df, groups, label_col='kmeans'):
    """Map cluster labels to the index of the group that contains them.

    Args:
        df: DataFrame with the ``label_col`` column.
        groups: Sequence of collections of cluster labels; every collection
            becomes one merged cluster numbered by its position in ``groups``.
        label_col: Name of the column with the original cluster labels.

    Returns:
        Integer Series named ``label_col``, sorted by index, holding the group
        number of every row whose label appears in some group. Rows whose
        label is in no group are absent from the result.
    """
    labels = df[label_col]
    batches = []
    for group_id, group in enumerate(groups):
        members = labels[labels.isin(group)]
        batches.append(pd.Series(group_id, index=members.index, name=label_col, dtype='int64'))

    # pd.concat raises on an empty list, so return an empty result explicitly.
    if not batches:
        return pd.Series(name=label_col, dtype='int64')

    return pd.concat(batches).sort_index()

# Merge the k-means clusters into the hand-picked groups listed below; each
# inner list becomes one merged cluster numbered by its position.
chat_df['new_kmeans'] = group_clusters(chat_df, [
        [41, 3, 6, 19, 24, 20, 30, 1], [17, 14, 13, 12, 11, 5], 
        [2, 35], [39, 25], [31, 27, 23, 29, 18, 37], [7, 9, 22, 26, 28], 
        [10, 15, 40], [36], [21], [4], [33, 8, 38], [0, 16, 32, 34]], 
    'kmeans')
# Cluster names keyed by their position in the list of merged clusters above.
# NOTE(review): `{...}` is a placeholder (a set holding Ellipsis); the real
# mapping has to be filled in before this cell is run.
cluster_names = {...}  # Названия кластеров по их номерам в списке объединённых кластеров выше
chat_df['new_kmeans'] = chat_df['new_kmeans'].replace(cluster_names)
# NOTE(review): this overwrites the same CSV that the k-means labels are read from.
chat_df[['kmeans', 'new_kmeans']].to_csv('clusters/kmeans_labels1.csv', index=True)
print(chat_df['new_kmeans'].nunique())

In [None]:
# Print sample messages of the merged k-means clusters, largest cluster first.
cluster_df = chat_df.groupby('new_kmeans', as_index=False)['text'].apply(list)
cluster_df['size'] = cluster_df['text'].map(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, row in cluster_df.iterrows():
    print(f'Кластер {row.new_kmeans} ({len(row.text)})\n')
    # Up to 20 messages per cluster, each truncated to 100 characters.
    for number, message in enumerate(row.text[:20], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')

In [None]:
# Print the most frequent words (tokens) of the merged clusters.
cluster_df = make_and_print_clusters(chat_df, 'new_kmeans')

In [None]:
# Visualize the top words of the clusters after merging.
clusters_wordclouds(cluster_df, 'new_kmeans')

In [None]:
from matplotlib import cm


def visualize_clusters(chat_df, labels_col, title, file: str | None = None):
    """Scatter-plot the clusters in four 2-D projections on a 2x2 grid.

    Args:
        chat_df: DataFrame with the ``labels_col`` column and the coordinate
            columns ``pcaX/pcaY``, ``tsneX/tsneY``, ``pca_tsneX/pca_tsneY``
            and ``umapX/umapY``.
        labels_col: Name of the column with the cluster label of each row.
        title: Figure title.
        file: Path to save the figure to; when ``None`` the figure is shown
            with ``plt.show()`` instead.
    """
    fig, ax = plt.subplots(2, 2)
    fig.set_figwidth(100)
    fig.set_figheight(100)
    fig.suptitle(title, fontsize=120)

    cluster_ids = chat_df[labels_col].unique()
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and
    # no longer available in recent matplotlib releases.
    colors = plt.get_cmap('hsv', len(cluster_ids))
    # Select the rows of every cluster once instead of once per projection.
    members = [chat_df[chat_df[labels_col] == c] for c in cluster_ids]

    for panel, proj in zip(ax.ravel(), ['pca', 'tsne', 'pca_tsne', 'umap']):
        for j, (c, rows) in enumerate(zip(cluster_ids, members)):
            panel.scatter(rows[proj + 'X'], rows[proj + 'Y'],
                          color=colors(j), label=f'{c}')

        panel.set_title(proj, fontsize=100)

    # Take the legend entries from the first subplot.
    handles, labels = ax[0, 0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper left', ncol=1, fontsize=75, markerscale=7)
    plt.tight_layout()

    if file is None:
        plt.show()
    else:
        fig.savefig(file)


In [None]:
# Plot the merged k-means clusters in all four projections and save the figure.
visualize_clusters(chat_df, 'new_kmeans', 'KMeans', 'clusters/kmeans_proj1.png')

## Сферический k-means

In [None]:
from scipy.sparse import csr_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from soyclustering import SphericalKMeans


# Configure spherical k-means (soyclustering) with 144 clusters and 10 iterations.
# NOTE(review): no random seed is set here, so repeated runs may differ — confirm.
spherical_kmeans = SphericalKMeans(
    n_clusters=144,
    max_iter=10,
    verbose=1,
    init='similar_cut',
    sparsity='minimum_df',
    minimum_df_factor=0.02
)

# The embeddings are converted to a SciPy CSR matrix before fitting.
# `spkmeans_labels` is not referenced again in the visible cells; later cells
# read `spherical_kmeans.labels_` instead.
spkmeans_labels = spherical_kmeans.fit_predict(csr_matrix(embeds.numpy()))
# Log of earlier runs. Columns:
# silhouette_score inertia / n_clusters max_iters
# 0.017295146 7943.604631717489 / 250 20
# 0.026064537 8099.005042571704 / 150 10 
# 0.0282372 7888.141710191523 / 138 10
# 0.02712679 7956.118294731778 / 125 10
# 0.023679893 8267.833774735958 / 100 10
# 0.01999612 8501.070381560368 / 50 10
# Print silhouette, Davies-Bouldin and Calinski-Harabasz scores plus the model inertia.
print(silhouette_score(embeds, spherical_kmeans.labels_), 
      davies_bouldin_score(embeds, spherical_kmeans.labels_), 
      calinski_harabasz_score(embeds, spherical_kmeans.labels_), 
      spherical_kmeans.inertia_)

In [None]:
from soyclustering import merge_close_clusters, visualize_pairwise_distance

def print_groups(centers, labels, max_dist):
    """Merge close clusters, print the groups and plot their pairwise distances.

    Args:
        centers: Cluster centres passed to ``merge_close_clusters``.
        labels: Cluster label of every sample.
        max_dist: Distance threshold forwarded to both soyclustering helpers.

    Returns:
        The ``(group_centers, groups)`` pair returned by ``merge_close_clusters``.
    """
    merged = merge_close_clusters(centers, labels, max_dist=max_dist)
    group_centers, groups = merged

    # One line per group: its member cluster ids in ascending order.
    for members in groups:
        print(sorted(members))

    visualize_pairwise_distance(group_centers, max_dist=max_dist, sort=True)
    return group_centers, groups

# First grouping pass over the spherical k-means clusters with max_dist=0.03.
gc1, gs1 = print_groups(spherical_kmeans.cluster_centers_, spherical_kmeans.labels_, .03)

In [None]:
# Store the spherical k-means labels in the frame and print sample messages per cluster.
chat_df['spkmeans'] = spherical_kmeans.labels_
cluster_df = chat_df.groupby('spkmeans', as_index=False)['text'].apply(list)
cluster_df['size'] = cluster_df['text'].map(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, row in cluster_df.iterrows():
    print(row.spkmeans, f'кластер ({len(row.text)})\n')
    # Up to 10 messages per cluster, each truncated to 100 characters.
    for number, message in enumerate(row.text[:10], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')

In [None]:
# Merge the spherical k-means clusters into the groups found by the first grouping pass.
chat_df['new_spkmeans'] = group_clusters(chat_df, gs1, 'spkmeans')
# Number of distinct merged clusters.
print(chat_df['new_spkmeans'].nunique())

In [None]:
# Print the clusters after merging.
cluster_df = make_and_print_clusters(chat_df, 'new_spkmeans')

In [None]:
# Visualize the top words of the clusters.
clusters_wordclouds(cluster_df, 'new_spkmeans')

In [None]:
# вывод кластеров после объединения (с выводом значимых слов через TF-IDF)
from collections import Counter

import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer


# Label column analysed by the TF-IDF cell below.
# NOTE(review): this is 'spkmeans' (labels before merging), although the cell
# header mentions merged clusters — confirm which column is intended.
labels_col = 'spkmeans'


def collect_lists(*args):
    """Flatten two levels of nesting across all positional arguments.

    Every argument is an iterable of iterables; the innermost elements are
    returned as one flat list, in argument order and then in iteration order.
    """
    return [element
            for outer in args
            for inner in outer
            for element in inner]


# Group the messages by cluster, largest cluster first. The index is reset so
# that row position i of cluster_df corresponds to row i of the TF-IDF matrix.
cluster_df = chat_df.groupby(labels_col, as_index=False).text.apply(list)
cluster_df['size'] = cluster_df.text.apply(len)
cluster_df = cluster_df.sort_values('size', ascending=False).reset_index(drop=True)

# Tokenise and lemmatise the messages of every cluster.
cluster_df.text = cluster_df.text.apply(to_words)
lemmatizer = pymorphy2.MorphAnalyzer()
cluster_df.text = cluster_df.text.apply(lambda x: [lemmatizer.parse(word)[0].normal_form for word in x])

# Every cluster is one pre-tokenised document, so the tokenizer and
# preprocessor are identity functions and the vocabulary is given explicitly.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, decode_error='ignore', 
                             vocabulary=set(collect_lists(cluster_df.text.to_list())))
tfidf_matrix = vectorizer.fit_transform(cluster_df.text)
fnames = vectorizer.get_feature_names_out()

# Replace the lemma lists with (lemma, count) pairs for lemmas seen more than
# once, most frequent first.
cluster_df.text = cluster_df.text.apply(lambda wl: sorted([(w, k) for w, k in Counter(wl).items() if k > 1], 
                                                          key=lambda x: x[1], reverse=True))

for row_pos, line in cluster_df.iterrows():
    print(line[labels_col], f'кластер ({line["size"]})\n')

    # The matrix rows follow the row order of cluster_df (sorted by size), so
    # index by row position; indexing by the cluster label would pick another
    # cluster's TF-IDF row.
    tfidf_vector = tfidf_matrix[row_pos].toarray()[0]
    word_tfidf = {fnames[i]: tfidf_vector[i] for i in range(len(tfidf_vector)) 
                  if tfidf_vector[i] > 0 and not fnames[i].isdigit()}
    # Ten lemmas with the highest TF-IDF weight in this cluster.
    word_tfidf = sorted(word_tfidf.items(), key=lambda x: x[1], reverse=True)[:10]
    # Attach the raw counts (lemmas counted only once are not in line.text and
    # are skipped), then order by count.
    counts = dict(line.text)
    word_tfidf = sorted([(w, counts[w]) for w, _ in word_tfidf if w in counts],
                        key=lambda x: x[1], reverse=True)

    for i, (w, k) in enumerate(word_tfidf):
        print(f'{i + 1}. {w[:100]} ({k})')

    print('\n')

In [None]:
# Second grouping pass: merge the already merged clusters with max_dist=0.09.
gc2, gs2 = print_groups(gc1, chat_df.new_spkmeans, 0.09)

In [18]:
# Save the original and merged spherical k-means labels.
chat_df[['spkmeans', 'new_spkmeans']].to_csv('clusters/spkmeans_labels6.csv', index=True)

## Визуализация

In [None]:
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Dimensionality reduction: build the reducers used for the 2-D projections.
# NOTE(review): none of the reducers gets a random seed, so t-SNE/UMAP results
# may differ between runs — confirm.
pca = PCA(n_components=2, whiten=False)
# 128-component PCA used as a pre-reduction step before t-SNE.
pca_alt = PCA(n_components=128, whiten=False)
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
# `max_iter` — check against the installed version.
tsne = TSNE(n_components=2, perplexity=50, early_exaggeration=24, metric='cosine', n_iter=2000, n_jobs=-1, verbose=True)
reducer = umap.UMAP(n_neighbors=8, n_components=2, metric='cosine', n_epochs=200, 
                    learning_rate=1., min_dist=0.01, spread=1, low_memory=False, n_jobs=-1, verbose=True)

# PCA straight to two dimensions.
pca_embeds = pca.fit_transform(embeds)
print('PCA embeddings', pca_embeds.shape)

# t-SNE on the full embeddings.
tsne_embeds = tsne.fit_transform(embeds)
print('t-SNE embeddings', tsne_embeds.shape)

# PCA to 128 dimensions followed by t-SNE (the same TSNE object is re-fitted).
pca_embeds2 = pca_alt.fit_transform(embeds)
pca_tsne_embeds = tsne.fit_transform(pca_embeds2)
print('PCA + t-SNE embeddings', pca_tsne_embeds.shape)

# UMAP on the full embeddings.
umap_embeds = reducer.fit_transform(embeds)

print('UMAP embeddings', umap_embeds.shape)

In [None]:
import umap
# from sklearn.decomposition import PCA


# pca = PCA(64, whiten=False)
# Re-run UMAP with PCA initialisation, more epochs and a larger spread; this
# replaces the `reducer` and `umap_embeds` computed in the previous cell.
reducer = umap.UMAP(n_neighbors=8, n_components=2, metric='cosine', n_epochs=250, init='pca',
                    learning_rate=1., min_dist=0.01, spread=12, low_memory=False, n_jobs=-1, verbose=True)
umap_embeds = reducer.fit_transform(embeds)
print('UMAP embeddings', umap_embeds.shape)

# Replace the UMAP columns of the saved projections and store them as proj5.csv.
# NOTE(review): 'clusters/proj4.csv' is written by a cell further down in this
# notebook, so on a fresh top-to-bottom run it must already exist on disk.
projs = pd.read_csv('clusters/proj4.csv', index_col=0)
projs['umapX'] = umap_embeds[:, 0]
projs['umapY'] = umap_embeds[:, 1]
projs.to_csv('clusters/proj5.csv', index=True)

In [None]:
# Copy the X/Y coordinates of every projection into the chat frame.
projection_arrays = {
    'pca': pca_embeds,
    'tsne': tsne_embeds,
    'pca_tsne': pca_tsne_embeds,
    'umap': umap_embeds,
}
for proj_name, coords in projection_arrays.items():
    chat_df[proj_name + 'X'] = coords[:, 0]
    chat_df[proj_name + 'Y'] = coords[:, 1]

# Persist the coordinate columns and display them as the cell output.
projection_cols = ['pcaX', 'pcaY', 'tsneX', 'tsneY', 'pca_tsneX', 'pca_tsneY', 'umapX', 'umapY']
chat_df[projection_cols].to_csv('clusters/proj4.csv', index=True)
chat_df[projection_cols]

In [None]:
# Plot the merged spherical k-means clusters in all four projections and save the figure.
visualize_clusters(chat_df, 'new_spkmeans', 'Spherical KMeans', 'clusters/spkmeans_proj4.png')

## Смесь гауссиан

In [None]:
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture


# Fit a 35-component Gaussian mixture with a tied covariance matrix.
# NOTE(review): no random seed is set, so the fit may differ between runs — confirm.
gaussmix = GaussianMixture(35, covariance_type='tied', n_init=1, verbose=True, verbose_interval=10)
gaussmix_labels = gaussmix.fit_predict(embeds)
# Log of earlier runs. Columns:
# silhouette davies_bouldin / n_components covariance_type
# 0.014587272 3.9356715807503546 / 5 tied
# -0.0074547827 3.7814189379906167 / 10 spherical
# -0.0031154878 3.9669980278232053 / 20 diag
# 0.017083913 3.893883616755334 / 50 full

# Print silhouette, Davies-Bouldin and Calinski-Harabasz scores for the fitted labels.
print(silhouette_score(embeds, gaussmix_labels), 
      davies_bouldin_score(embeds, gaussmix_labels), 
      calinski_harabasz_score(embeds, gaussmix_labels))

In [None]:
# Print sample messages for every GMM cluster, largest cluster first.
# The 'gmm' column already present in chat_df is used; to switch to freshly
# fitted labels, run first:  chat_df['gmm'] = gaussmix_labels
cluster_df = chat_df.groupby('gmm', as_index=False)['text'].apply(list)
cluster_df['size'] = cluster_df['text'].map(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, row in cluster_df.iterrows():
    print(row.gmm, f'кластер ({len(row.text)})\n')
    # Up to 10 messages per cluster, each truncated to 100 characters.
    for number, message in enumerate(row.text[:10], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')

# To persist the labels:  chat_df[['gmm']].to_csv('clusters/gmm_labels3.csv', index=True)

In [None]:
# Print the most frequent lemmas of every GMM cluster.
cluster_df = make_and_print_clusters(chat_df, 'gmm')

In [None]:
# Draw the word clouds for the GMM clusters.
clusters_wordclouds(cluster_df, 'gmm')

In [None]:
# Plot the GMM clusters in all four projections and save the figure.
visualize_clusters(chat_df, 'gmm', 'Gaussian Mixture Model', 'clusters/gmm_proj3.png')

## DBSCAN

In [None]:
from joblib import Parallel, delayed


def k_mean_dist(idx, points, k):
    """Mean Euclidean distance from ``points[idx]`` to its ``k`` nearest neighbours.

    Args:
        idx: Row index of the query point.
        points: 2-D array-like (NumPy array or CPU tensor), one point per row.
        k: Number of nearest neighbours to average over.

    Returns:
        Sum of the ``k`` smallest distances to other rows divided by ``k``
        (the smallest distance overall, the point to itself, is skipped).
    """
    pts = np.asarray(points)
    # One vectorised pass over all rows. The previous chunked loop visited the
    # last full chunk twice (duplicating its distances) and failed when there
    # were 1000 rows or fewer.
    distances = np.linalg.norm(pts - pts[idx], axis=1)
    nearest = np.sort(distances)[1:k + 1]
    return float(nearest.sum()) / k


# Mean distance to the 4 nearest neighbours for the points from index 10000 on.
# NOTE(review): the first 10000 points are skipped — confirm this is intended.
mean_distances = []
try:
    for i in range(10000, embeds.shape[0]):
        mean_distances.append(k_mean_dist(i, embeds, 4))
except KeyboardInterrupt:
    # Interrupting the loop by hand keeps the distances collected so far.
    pass

# Sorted k-distance curve, saved to disk.
mean_distances.sort()
plt.figure(figsize=(12, 6))
plt.plot(np.arange(len(mean_distances)), mean_distances)
plt.ylabel('Среднее расстояние до ближайших 4-х соседей')
plt.savefig('clusters/dbscan_mean_distances.png')

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


# DBSCAN with eps=0.4 and min_samples=4 on Euclidean distances.
dbscan = DBSCAN(0.4, min_samples=4, metric='euclidean', algorithm='ball_tree', n_jobs=-1)

dbscan_labels = dbscan.fit_predict(embeds)
print('algorithm has converged')
# Log of earlier runs. Columns:
# silhouette davies calinski / eps min_samples
# -0.08605397 1.0740822871715672 30.563701039150505 / 0.1 8
# -0.14635736 1.6975759440481935 38.94185852977294 / 0.3 8
# 0.029522894 1.9106617470272766 50.07569050431296 / 0.435 4
# -0.038184512 1.8158209459582089 42.25380106716254 / 0.4 4
# -0.07685446 1.844907477428522 36.94501188877512 / 0.38 4
# Print silhouette, Davies-Bouldin and Calinski-Harabasz scores for the fitted labels.
print(silhouette_score(embeds, dbscan_labels), 
      davies_bouldin_score(embeds, dbscan_labels), 
      calinski_harabasz_score(embeds, dbscan_labels))

In [None]:
# для уже полученных меток
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Read the stored labels as a 1-D Series (first data column): the metric
# functions expect a flat label vector rather than a one-column DataFrame.
dbscan_labels = pd.read_csv('clusters/dbscan_labels7.csv', index_col=0).iloc[:, 0]
# Silhouette, Davies-Bouldin and Calinski-Harabasz scores for the stored labels.
print(silhouette_score(embeds, dbscan_labels),
      davies_bouldin_score(embeds, dbscan_labels),
      calinski_harabasz_score(embeds, dbscan_labels))

In [None]:
# Attach stored DBSCAN labels to the frame and print sample messages per cluster.
# NOTE(review): this reads 'dbscan_labels2.csv', while the other cells of this
# notebook use 'dbscan_labels7.csv' — confirm which file is intended.
dbscan_labels = pd.read_csv('clusters/dbscan_labels2.csv', index_col=0)
chat_df['dbscan'] = dbscan_labels
cluster_df = chat_df.groupby('dbscan', as_index=False)['text'].apply(list)
cluster_df['size'] = cluster_df['text'].map(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, row in cluster_df.iterrows():
    print(f'Кластер {row.dbscan} ({len(row.text)})\n')
    # Up to 10 messages per cluster, each truncated to 100 characters.
    for number, message in enumerate(row.text[:10], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')

# To persist the labels:  chat_df[['dbscan']].to_csv('clusters/dbscan_labels8.csv', index=True)

In [None]:
# Count the distinct values in the dbscan label column.
chat_df.dbscan.nunique()

In [None]:
# Print the most frequent lemmas of every DBSCAN cluster; the returned table
# is also shown as the cell output because the call is the last expression.
make_and_print_clusters(chat_df, 'dbscan')

In [None]:
# Plot the DBSCAN clusters in all four projections and save the figure.
visualize_clusters(chat_df, 'dbscan', 'DBSCAN', 'clusters/dbscan_proj7.png')

## HDBSCAN

In [None]:
from sklearn.cluster import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


# scikit-learn's HDBSCAN has no `xi` argument (that belongs to OPTICS) and
# needs an integer `min_cluster_size`, so the 5 % fraction is converted to an
# absolute number of samples (at least 2).
hdbscan_min_cluster_size = max(2, int(0.05 * embeds.shape[0]))
hdbscan = HDBSCAN(min_samples=34, min_cluster_size=hdbscan_min_cluster_size,
                  algorithm='ball_tree', n_jobs=-1)
hdbscan_labels = hdbscan.fit_predict(embeds)
print('algorithm has converged')
# Silhouette, Davies-Bouldin and Calinski-Harabasz scores for the fitted labels.
print(silhouette_score(embeds, hdbscan_labels),
      davies_bouldin_score(embeds, hdbscan_labels),
      calinski_harabasz_score(embeds, hdbscan_labels))

In [None]:
# Attach the stored HDBSCAN labels to the frame and print sample messages per cluster.
hdbscan_labels = pd.read_csv('clusters/hdbscan_labels1.csv', index_col=0)
chat_df['hdbscan'] = hdbscan_labels
# To persist the labels:  chat_df[['hdbscan']].to_csv('clusters/hdbscan_labels1.csv', index=True)

cluster_df = chat_df.groupby('hdbscan', as_index=False)['text'].apply(list)
cluster_df['size'] = cluster_df['text'].map(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, row in cluster_df.iterrows():
    print(f'Кластер {row.hdbscan} ({len(row.text)})\n')
    # Up to 10 messages per cluster, each truncated to 100 characters.
    for number, message in enumerate(row.text[:10], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')


In [None]:
# Plot the HDBSCAN clusters in all four projections and save the figure.
visualize_clusters(chat_df, 'hdbscan', 'HDBSCAN', 'clusters/hdbscan_proj4.png')

## Агломеративная кластеризация

In [None]:
# Attach the stored agglomerative-clustering labels and print sample messages per cluster.
aggl_labels = pd.read_csv('clusters/aggl_labels1.csv', index_col=0)
chat_df['aggl'] = aggl_labels

cluster_df = chat_df.groupby('aggl', as_index=False)['text'].apply(list)
cluster_df['size'] = cluster_df['text'].map(len)
cluster_df = cluster_df.sort_values('size', ascending=False)
for _, row in cluster_df.iterrows():
    print(f'Кластер {row.aggl} ({len(row.text)})\n')
    # Up to 10 messages per cluster, each truncated to 100 characters.
    for number, message in enumerate(row.text[:10], start=1):
        print(f'{number}. {message[:100]}')

    print('\n')


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster


# Ward linkage on the first 10000 embeddings.
linked = linkage(embeds[:10000, :], method='ward')
# dendrogram() needs the linkage matrix itself; fcluster() returns flat cluster
# labels, which are not a valid input. The 100-cluster view is obtained by
# truncating the tree to its last 100 merged clusters.
dendrogram(linked, truncate_mode='lastp', p=100, orientation='top',
           distance_sort='descending', show_leaf_counts=True)
plt.show()