In [1]:
import numpy as np
import pandas as pd
import warnings

# Suppress all warnings for the rest of the session. The "ignore" filter has
# no category or module restriction, so warnings raised by later cells are
# hidden as well.
warnings.filterwarnings("ignore")

# Read the training table from the current working directory and display it.
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61873,61874,1,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,2,0,Class_9
61874,61875,4,0,0,0,0,0,0,0,0,...,0,2,0,0,2,0,0,1,0,Class_9
61875,61876,0,0,0,0,0,0,0,3,1,...,0,3,1,0,0,0,0,0,0,Class_9
61876,61877,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,3,10,0,Class_9


In [2]:
from sklearn.model_selection import train_test_split

# Use the stratified split only as a sampler: the 1% "test" side becomes the
# working sample (X, y) and the 99% "train" side is discarded. Stratifying on
# the target keeps class proportions, and random_state makes the draw repeatable.
_, X, _, y = train_test_split(
    df.drop(columns=['id', 'target']),
    df['target'],
    test_size=0.01,
    stratify=df['target'],
    random_state=0,
)

In [3]:
# Turn string labels such as 'Class_3' into zero-based integer codes (-> 2).
# Everything after the last underscore is parsed, so a multi-digit suffix
# ('Class_10' -> 9) is encoded correctly instead of using only the last character.
# NOTE: this cell rebinds `y`, so it can only be run once per kernel session.
y = y.reset_index(drop=True).str.rsplit('_', n=1).str[-1].astype('int64') - 1
y

0      3
1      5
2      7
3      2
4      1
      ..
614    8
615    7
616    1
617    4
618    7
Name: target, Length: 619, dtype: int64

In [4]:
def get_consensus_clustering(N, all_partitions, criterion='av'):
    """Combine an ensemble of partitions of N objects into one consensus partition.

    The procedure has three stages:

    1. Consensus matrix: entry (i, j) is the number of input partitions that
       put objects i and j into the same cluster (the diagonal therefore
       equals the number of partitions).
    2. Modularity shift: ``row_sum[i] * row_sum[j] / total_sum`` is subtracted
       from every entry.
    3. Greedy agglomeration starting from singletons: at each step the pair of
       clusters with the largest strictly positive score is merged. Ties are
       resolved in favour of the first pair in row-major order. Merging stops
       when no pair has a positive score or when only two clusters are left.

    Parameters
    ----------
    N : int
        Number of objects. Only the first N labels of each partition are read.
    all_partitions : sequence of array-like
        Cluster labels, one array-like per partition, each with at least N
        entries.
    criterion : str, default 'av'
        'av' scores a pair by the summed shifted consensus between the two
        clusters divided by their combined size; any other value (e.g. 'sum')
        uses the plain sum.

    Returns
    -------
    numpy.ndarray of int, shape (N,)
        Consensus cluster index of every object; indices follow the final
        order of the internal cluster list.

    Raises
    ------
    IndexError
        If a partition provides fewer than N labels.
    """
    # Stage 1: accumulate one boolean co-membership matrix per partition.
    consensus_matrix = np.zeros((N, N))
    for partition in all_partitions:
        labels = np.asarray(partition)[:N]
        if labels.shape[0] < N:
            raise IndexError("each partition must provide at least N labels")
        consensus_matrix += labels[:, None] == labels[None, :]

    # Stage 2: modularity shift. The matrix is symmetric, so column sums equal
    # row sums. An all-zero matrix (no partitions or N == 0) is left as is to
    # avoid a 0/0 division; nothing is merged in that case.
    row_sums = consensus_matrix.sum(0)
    overall_sum = consensus_matrix.sum()
    if overall_sum > 0:
        consensus_matrix -= np.outer(row_sums, row_sums) / overall_sum

    # Stage 3: agglomerative merging. Row/column k of consensus_matrix always
    # describes clusters[k].
    clusters = [[i] for i in range(N)]
    use_average = criterion == 'av'
    while len(clusters) > 2:
        scores = consensus_matrix
        if use_average:
            # NOTE(review): the original author's comment says the diagonal is
            # assumed to be zeroed, hence the division by the combined cluster
            # size; this function does not zero the diagonal itself.
            sizes = np.array([len(members) for members in clusters])
            scores = consensus_matrix / (sizes[:, None] + sizes[None, :])

        # Only pairs s < t are candidates. np.triu zeroes everything else, and
        # a zero can never win because a merge requires a strictly positive score.
        candidates = np.triu(scores, k=1)
        best_s, best_t = np.unravel_index(np.argmax(candidates), candidates.shape)
        best_s, best_t = int(best_s), int(best_t)
        if not candidates[best_s, best_t] > 0:
            break

        clusters[best_s] += clusters[best_t]
        del clusters[best_t]

        # Fold row/column best_t into best_s, then drop best_t. The row is
        # added first so that the new diagonal entry receives all four blocks.
        consensus_matrix[best_s, :] += consensus_matrix[best_t, :]
        consensus_matrix[:, best_s] += consensus_matrix[:, best_t]
        consensus_matrix = np.delete(consensus_matrix, best_t, axis=0)
        consensus_matrix = np.delete(consensus_matrix, best_t, axis=1)

    cluster_labels = np.empty(N, dtype=int)
    for cluster_idx, members in enumerate(clusters):
        cluster_labels[members] = cluster_idx

    return cluster_labels

In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import adjusted_rand_score

# Collect one result row per experimental configuration.
results = []

N = len(X)
for n_clusters in [4, 8]:
    for km in [KMeans(n_clusters=n_clusters, random_state=0),
                   MiniBatchKMeans(n_clusters=n_clusters, random_state=0)]:

        # Reference ("ground truth") partition: the estimator fitted on all
        # features. It depends only on the estimator, not on n_features or M,
        # so it is computed once per estimator instead of once per inner
        # configuration (assumes the fit is deterministic for the fixed
        # random_state).
        ground_truth_labels = km.fit_predict(X)

        for n_features in [60, 80]:
            for M in [10, 40]:

                # Reseed the global generator so the random feature subsets
                # of every configuration are reproducible.
                np.random.seed(0)
                all_partitions = []

                # Ensemble: M partitions, each fitted on a random feature subset.
                for i in range(M):
                    feature_indices = np.random.choice(X.shape[1], size=n_features, replace=False)
                    all_partitions.append(km.fit_predict(X.iloc[:, feature_indices]))

                # Average agreement of the ensemble members with the reference;
                # it does not depend on the merge criterion, so compute it once.
                mean_ari = np.mean([adjusted_rand_score(ground_truth_labels, partition)
                                    for partition in all_partitions])

                for criterion in ['sum', 'av']:
                    pred_labels = get_consensus_clustering(N, all_partitions, criterion)

                    cons_ari = adjusted_rand_score(ground_truth_labels, pred_labels)
                    delta = cons_ari - mean_ari

                    # Append the row for this configuration.
                    results.append([n_clusters, km, n_features, M, criterion, cons_ari, mean_ari, delta, len(np.unique(pred_labels))])

# Build the summary table from the collected rows.
columns = ['Число кластеров в истинном разбиении', 'Алгоритм', 'Число признаков', 'Число входных разбиений', 'Критерий', 'ARI консенсусного разбиения', 'Средний ARI ансамбля входных разбиений', 'Разница', 'Число кластеров в консенсусном разбиении']
result_df = pd.DataFrame(results, columns=columns)

# Display the table.
result_df

Unnamed: 0,Число кластеров в истинном разбиении,Алгоритм,Число признаков,Число входных разбиений,Критерий,ARI консенсусного разбиения,Средний ARI ансамбля входных разбиений,Разница,Число кластеров в консенсусном разбиении
0,4,"KMeans(n_clusters=4, random_state=0)",60,10,sum,0.77572,0.470719,0.305001,3
1,4,"KMeans(n_clusters=4, random_state=0)",60,10,av,0.77466,0.470719,0.303941,3
2,4,"KMeans(n_clusters=4, random_state=0)",60,40,sum,0.859954,0.521495,0.338459,3
3,4,"KMeans(n_clusters=4, random_state=0)",60,40,av,0.841988,0.521495,0.320492,3
4,4,"KMeans(n_clusters=4, random_state=0)",80,10,sum,0.915531,0.721345,0.194186,3
5,4,"KMeans(n_clusters=4, random_state=0)",80,10,av,0.933776,0.721345,0.212431,3
6,4,"KMeans(n_clusters=4, random_state=0)",80,40,sum,0.940582,0.748369,0.192213,3
7,4,"KMeans(n_clusters=4, random_state=0)",80,40,av,0.951459,0.748369,0.20309,3
8,4,"MiniBatchKMeans(n_clusters=4, random_state=0)",60,10,sum,0.326939,0.307712,0.019228,4
9,4,"MiniBatchKMeans(n_clusters=4, random_state=0)",60,10,av,0.330501,0.307712,0.022789,4
