In [73]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS, MeanShift ,SpectralClustering
from sklearn_extra.cluster import KMedoids
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import adjusted_rand_score, silhouette_score
from kmodes.kmodes import KModes
import hdbscan
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
# pip install kmodes
# pip install quickshift-tsne from quickshift import quickshift
# pip install hdbscan import hdbscan
# Load the synthetic data set (round 0) used for the clustering comparison.
file_path = r'E:/FedStream/data_set_syn/Synthetic0310/2Dim_9A1B/forCluster/round_0.csv'
data = pd.read_csv(file_path)

# Keep only the rows of class 1; the class label is read from the last column.
# (Compare against 0 instead of 1 to analyse the other class.)
class_1_data = data.loc[data.iloc[:, -1] == 1]

# Feature matrix: the first two columns of the selected rows.
X = class_1_data.iloc[:, 0:2].values

# Centers and radii of the reference circles drawn on every plot:
# one circle of radius 0.45 at the origin and eight circles of radius 0.23.
circle_centers = [
    (0, 0),
    (0.25, 0.75),
    (0.75, 0.25),
    (-0.25, 0.75),
    (-0.75, 0.25),
    (-0.25, -0.75),
    (-0.75, -0.25),
    (0.25, -0.75),
    (0.75, -0.25),
]
radii = [0.45] + [0.23] * 8

def plot_clusters_with_centroids_and_circles(X, labels, centroids, circle_centers, radii, title, save_path):
    """Plot the data, the cluster prototypes and the reference circles.

    Args:
        X: 2-D array of points; column 0 is drawn on the x axis and
            column 1 on the y axis.
        labels: one cluster label per row of X, used to colour the points.
        centroids: cluster prototypes with the same column layout as X.
            May be empty (for example when a density-based method labels
            every point as noise); in that case no prototype is drawn.
        circle_centers: iterable of (x, y) circle centers.
        radii: iterable of radii, paired element-wise with circle_centers.
        title: title of the axes.
        save_path: path of the image file to write (saved at 300 dpi).
            Missing parent folders are created.
    """
    import os

    fig, ax = plt.subplots()
    # fig.set_size_inches(3.35 / 2.54, 3.5 / 2.54)  # convert the size from cm to inches
    ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', marker='o', label='Data points')

    # An empty prototype array has no second axis to index, so skip it
    # instead of failing on centroids[:, 0].
    centroids = np.asarray(centroids)
    if centroids.size:
        ax.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x', s=50, label='Centroids')

    for center, radius in zip(circle_centers, radii):
        circle = Circle(center, radius, color='blue', fill=False, linewidth=2, linestyle='--')
        ax.add_patch(circle)

    ax.set_aspect('equal', 'box')  # same scale on both axes so the circles stay round
    ax.set_title(title)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend()
    fig.tight_layout()

    # savefig does not create folders; make sure the target folder exists.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_path, dpi=300)
    plt.show()
    # Release the figure so that parameter sweeps do not accumulate open figures.
    plt.close(fig)

def plotK_Means(cluters_nums):
    """Cluster the module-level X with K-means and save the annotated plot.

    Args:
        cluters_nums: number of clusters passed to KMeans as n_clusters.
    """
    model = KMeans(n_clusters=cluters_nums, random_state=0)
    model.fit(X)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/k_means/{cluters_nums}.png'
    title = f'K-means , cluster nums : {cluters_nums}'
    plot_clusters_with_centroids_and_circles(
        X,
        model.labels_,
        model.cluster_centers_,
        circle_centers,
        radii,
        title,
        save_path=save_path,
    )

def plotDBSCAN(eps, min_samples):
    """Cluster the module-level X with DBSCAN and save the annotated plot.

    The prototype of each cluster is the mean of its member points. Points
    labelled -1 (noise) do not contribute to any prototype.

    Args:
        eps: neighbourhood radius passed to DBSCAN.
        min_samples: minimum neighbourhood size passed to DBSCAN.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    labels_dbscan = dbscan.labels_

    # Real cluster ids, in ascending order, with the noise label removed.
    cluster_ids = [cid for cid in np.unique(labels_dbscan) if cid != -1]
    if cluster_ids:
        centroids_dbscan = np.array([X[labels_dbscan == cid].mean(axis=0) for cid in cluster_ids])
    else:
        # Every point is noise: keep a 2-D (0, n_features) array so that the
        # column indexing in the plotting helper still works.
        centroids_dbscan = np.empty((0, X.shape[1]))

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/DBSCAN/eps_{eps}_min_samples_{min_samples}.png'
    plot_clusters_with_centroids_and_circles(X, labels_dbscan, centroids_dbscan, circle_centers, radii, f'DBSCAN, eps: {eps}, min_samples: {min_samples}', save_path)
def plotHDBSCAN(min_cluster_size):
    """Cluster the module-level X with HDBSCAN and save the annotated plot.

    The prototype of each cluster is the mean of its member points. Points
    labelled -1 (noise) do not contribute to any prototype.

    Args:
        min_cluster_size: passed to hdbscan.HDBSCAN as min_cluster_size.
    """
    hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(X)
    labels_hdbscan = hdb.labels_

    # Real cluster ids, in ascending order, with the noise label removed.
    cluster_ids = [cid for cid in np.unique(labels_hdbscan) if cid != -1]
    if cluster_ids:
        centroids_hdbscan = np.array([X[labels_hdbscan == cid].mean(axis=0) for cid in cluster_ids])
    else:
        # Every point is noise: keep a 2-D (0, n_features) array so that the
        # column indexing in the plotting helper still works.
        centroids_hdbscan = np.empty((0, X.shape[1]))

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/HDBSCAN/min_cluster_size_{min_cluster_size}.png'
    plot_clusters_with_centroids_and_circles(X, labels_hdbscan, centroids_hdbscan, circle_centers, radii, f'HDBSCAN, min_cluster_size: {min_cluster_size}', save_path)

def plotAgglomerative(cluters_nums):
    """Cluster the module-level X agglomeratively and save the annotated plot.

    Args:
        cluters_nums: number of clusters passed to AgglomerativeClustering.
    """
    model = AgglomerativeClustering(n_clusters=cluters_nums)
    model.fit(X)
    labels_agg = model.labels_

    # The estimator exposes no cluster centers, so use the mean of each
    # cluster's members as its prototype.
    n_found = len(set(labels_agg))
    member_means = [X[labels_agg == cid].mean(axis=0) for cid in range(n_found)]
    centroids_agg = np.array(member_means)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/Agglomerative/{cluters_nums}.png'
    title = f'Agglomerative, cluster nums : {cluters_nums}'
    plot_clusters_with_centroids_and_circles(
        X, labels_agg, centroids_agg, circle_centers, radii, title, save_path=save_path
    )
def plotKModes(clusters_num):
    """Run K-Modes on the module-level X and print the result types.

    Plotting is currently disabled; only the types of the label array and
    of the centroid array are printed.

    NOTE(review): K-Modes compares values as categories. If the columns of
    X are continuous floats, almost every value is its own category, which
    would make the fit uninformative - confirm before enabling the plot.

    Args:
        clusters_num: number of clusters passed to KModes as n_clusters.
    """
    model = KModes(n_clusters=clusters_num, init='Huang', n_init=5, verbose=1)
    labels_kmodes = model.fit_predict(X)
    centroids_kmodes = model.cluster_centroids_
    for result in (labels_kmodes, centroids_kmodes):
        print(type(result))
    # save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/KModes/{clusters_num}.png'
    # plot_clusters_with_centroids_and_circles(X, labels_kmodes, centroids_kmodes, circle_centers, radii, f'K-Modes, cluster nums: {clusters_num}', save_path)
def quickshift_clustering(X, tau=0.1, k=10):
    """Label points by linking each one to its closest neighbour within tau.

    Points are visited in index order. Point i takes the label of its
    nearest neighbour (among its k nearest, excluding the entry at position
    0 of the neighbour list) when that neighbour is closer than tau and has
    already been labelled; otherwise point i starts a new cluster.

    NOTE(review): no density estimate is involved, so this is a simplified
    heuristic rather than the canonical Quick Shift algorithm.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        tau: distance threshold; only neighbours strictly closer than tau
            can pass on their label.
        k: number of neighbours inspected per point.

    Returns:
        Float array of length n_samples holding the cluster labels.
    """
    n_samples = X.shape[0]
    knn = NearestNeighbors(n_neighbors=k + 1).fit(X)
    distances, indices = knn.kneighbors(X)

    labels = np.full(n_samples, -1.0)
    labels[0] = 0  # the first point seeds the first cluster
    current_label = 0

    for i in range(1, n_samples):
        # Position 0 of the neighbour list is skipped (treated as the point itself).
        neighbour_dists = distances[i, 1:k + 1]
        within_tau = neighbour_dists < tau

        best_j = -1
        if within_tau.any():
            # First occurrence of the smallest distance among the neighbours below tau.
            pos = int(np.argmin(np.where(within_tau, neighbour_dists, np.inf)))
            best_j = indices[i, 1 + pos]

        # Inherit the neighbour's label only when it has already been labelled.
        if best_j == -1 or labels[best_j] == -1:
            current_label += 1
            labels[i] = current_label
        else:
            labels[i] = labels[best_j]

    return labels

def plotQuickShift(tau,cluster_nums):
    """Run quickshift_clustering on the module-level X and save the plot.

    Args:
        tau: distance threshold forwarded to quickshift_clustering.
        cluster_nums: forwarded to quickshift_clustering as k, i.e. the
            number of neighbours inspected per point.
    """
    labels_quickshift = quickshift_clustering(X, tau=tau, k=cluster_nums)

    # Number of clusters, not counting a possible -1 (unlabelled) entry.
    n_found = len(set(labels_quickshift)) - int(-1 in labels_quickshift)
    member_means = [X[labels_quickshift == cid].mean(axis=0) for cid in range(n_found)]
    centroids_quickshift = np.array(member_means)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/QuickShift/tau_{tau}_k_{cluster_nums}.png'
    title = f'QuickShift, tau: {tau},cluster:{cluster_nums}'
    plot_clusters_with_centroids_and_circles(
        X, labels_quickshift, centroids_quickshift, circle_centers, radii, title, save_path
    )

def plotOPTICS(min_samples):
    """Cluster the module-level X with OPTICS and save the annotated plot.

    The prototype of each cluster is the mean of its member points. Points
    labelled -1 (noise) do not contribute to any prototype.

    Args:
        min_samples: passed to OPTICS as min_samples.
    """
    optics = OPTICS(min_samples=min_samples).fit(X)
    labels_optics = optics.labels_

    # Real cluster ids, in ascending order, with the noise label removed.
    cluster_ids = [cid for cid in np.unique(labels_optics) if cid != -1]
    if cluster_ids:
        centroids_optics = np.array([X[labels_optics == cid].mean(axis=0) for cid in cluster_ids])
    else:
        # Every point is noise: keep a 2-D (0, n_features) array so that the
        # column indexing in the plotting helper still works.
        centroids_optics = np.empty((0, X.shape[1]))

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/OPTICS/min_samples_{min_samples}.png'
    plot_clusters_with_centroids_and_circles(X, labels_optics, centroids_optics, circle_centers, radii, f'OPTICS, min_samples: {min_samples}', save_path)

def plotMeanShift(bandwidth):
    """Cluster the module-level X with Mean Shift and save the annotated plot.

    Args:
        bandwidth: kernel bandwidth passed to MeanShift.
    """
    model = MeanShift(bandwidth=bandwidth)
    model.fit(X)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/MeanShift/meanshift_bandwidth_{bandwidth}.png'
    title = f'Mean Shift (Bandwidth={bandwidth})'
    plot_clusters_with_centroids_and_circles(
        X, model.labels_, model.cluster_centers_, circle_centers, radii, title, save_path
    )

def plotGaussianMixture(clusters_num):
    """Fit a Gaussian mixture to the module-level X and save the annotated plot.

    Args:
        clusters_num: number of mixture components.
    """
    model = GaussianMixture(n_components=clusters_num, random_state=0)
    model.fit(X)
    labels_gmm = model.predict(X)
    # The component means serve as the cluster prototypes.
    centroids_gmm = model.means_

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/GaussianMixture/{clusters_num}.png'
    title = f'Gaussian Mixture, cluster nums: {clusters_num}'
    plot_clusters_with_centroids_and_circles(
        X, labels_gmm, centroids_gmm, circle_centers, radii, title, save_path
    )

def plotSpectralClustering(clusters_num):
    """Cluster the module-level X spectrally and save the annotated plot.

    Args:
        clusters_num: number of clusters passed to SpectralClustering.
    """
    model = SpectralClustering(n_clusters=clusters_num, random_state=0, affinity='nearest_neighbors')
    model.fit(X)
    labels_spectral = model.labels_

    # The estimator exposes no cluster centers, so use the mean of each
    # cluster's members as its prototype.
    member_means = [X[labels_spectral == cid].mean(axis=0) for cid in range(clusters_num)]
    centroids_spectral = np.array(member_means)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/SpectralClustering/{clusters_num}.png'
    title = f'Spectral Clustering, cluster nums: {clusters_num}'
    plot_clusters_with_centroids_and_circles(
        X, labels_spectral, centroids_spectral, circle_centers, radii, title, save_path
    )


def plotKMedoids(clusters_num):
    """Cluster the module-level X with K-Medoids and save the annotated plot.

    Args:
        clusters_num: number of clusters passed to KMedoids as n_clusters.
    """
    model = KMedoids(n_clusters=clusters_num, random_state=0)
    model.fit(X)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/KMedoids/{clusters_num}.png'
    title = f'K-Medoids, cluster nums: {clusters_num}'
    plot_clusters_with_centroids_and_circles(
        X, model.labels_, model.cluster_centers_, circle_centers, radii, title, save_path
    )

def plotKMeansPlusPlus(clusters_num):
    """Cluster the module-level X with K-means ('k-means++' init) and save the plot.

    Args:
        clusters_num: number of clusters passed to KMeans as n_clusters.
    """
    model = KMeans(n_clusters=clusters_num, init='k-means++', random_state=0)
    model.fit(X)

    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/KMeansPlusPlus/{clusters_num}.png'
    title = f'K-Means++, cluster nums: {clusters_num}'
    plot_clusters_with_centroids_and_circles(
        X, model.labels_, model.cluster_centers_, circle_centers, radii, title, save_path
    )

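# Density Peaks Clustering (DPC) example.
# Note: scikit-learn does not provide a DPC estimator, so the function below is a hand-written implementation.
# It relies on sklearn.metrics.pairwise.euclidean_distances, which is imported at the top of the file.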
def plotDensityPeaksClustering(clusters_num):
    """Cluster the module-level X with a density-peaks procedure and plot it.

    Steps:
      1. rho: Gaussian-kernel local density of every point.
      2. delta: distance from every point to the nearest point of higher
         density (the largest distance in its row for the densest point).
      3. The clusters_num points with the largest rho * delta become the
         cluster centers.
      4. Every point is assigned to its nearest center.

    NOTE(review): the cut-off distance dc is 0 when at least 2% of the
    pairwise distances (zero diagonal included) are 0, which is always the
    case for fewer than 50 points; the division by dc then produces nan/inf.

    Args:
        clusters_num: number of cluster centers to select.
    """
    # Pairwise Euclidean distance matrix.
    distances = euclidean_distances(X, X)

    # Cut-off distance dc: 2nd percentile of all pairwise distances.
    dc = np.percentile(distances, 2)
    # Local density; subtracting 1 removes each point's own contribution exp(0).
    rho = np.sum(np.exp(-(distances / dc) ** 2), axis=1) - 1

    # Distance to the nearest point of strictly higher density.
    delta = np.zeros_like(rho)
    for i, rho_i in enumerate(rho):
        denser = np.flatnonzero(rho > rho_i)
        if denser.size:
            delta[i] = distances[i, denser].min()
        else:
            delta[i] = distances[i].max()

    # Indices (rows of X) of the selected cluster centers.
    cluster_centers = np.argsort(-rho * delta)[:clusters_num]

    # Assign every point to its nearest center. The position of that center
    # inside cluster_centers IS the cluster label; it must not be used as an
    # index into the label array.
    labels = np.argmin(distances[:, cluster_centers], axis=1)
    # A center always keeps its own label, even if a duplicate point is
    # also a center and comes first in cluster_centers.
    labels[cluster_centers] = np.arange(len(cluster_centers))

    labels_dpc = labels
    centroids_dpc = X[cluster_centers]  # coordinates of the selected centers
    save_path = f'C:/Users/张腾森/Desktop/modelV/cluster/DensityPeaksClustering/{clusters_num}.png'
    plot_clusters_with_centroids_and_circles(X, labels_dpc, centroids_dpc, circle_centers, radii, f'Density Peaks Clustering, cluster nums: {clusters_num}', save_path)
# Cluster counts to sweep for the algorithms that take a fixed number of
# clusters (k-means, Agglomerative, ...).
cluster_list = [1, 3, 5, 10, 15, 20, 25, 30]
# To run a different algorithm, uncomment its call below and comment out
# the one that is currently active.
for cluster_nums in cluster_list:
    # plotK_Means(cluters_nums=cluster_nums)
    # plotAgglomerative(cluters_nums=cluster_nums)
    # plotKMedoids(clusters_num=cluster_nums)
    # plotKMeansPlusPlus(clusters_num=cluster_nums)
    # plotDensityPeaksClustering(clusters_num=cluster_nums)
    # plotGaussianMixture(clusters_num=cluster_nums)
    # plotSpectralClustering(clusters_num=cluster_nums)
    plotKModes(clusters_num=cluster_nums)

# 示例 DBSCAN 参数组合
# Data domain: [-1, 1] x [-1, 1]
# eps_list = [0.1, 0.15, 0.2, 0.3]
# min_samples_list = [2, 3, 5, 10, 15, 20, 25]
# for min_samples in min_samples_list:
#     for eps in eps_list:
#         # plotDBSCAN(eps=eps, min_samples=min_samples)
#         pass  # Since DBSCAN is commented out, this will not execute
#     plotOPTICS(min_samples=min_samples)
#     plotHDBSCAN(min_cluster_size=min_samples)
# bandwidth_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.6]
# for bandwidth in bandwidth_list:
#     plotMeanShift(bandwidth)
# tau_list =[0.1,0.5,1,2]
# clusterL = [3]
# for tau in tau_list:
#     for c in clusterL:
#         plotQuickShift(tau=tau,cluster_nums=c)
# 1. K-means 聚类 kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
# 2. 层次聚类 (Hierarchical Clustering),hierarchical = AgglomerativeClustering(n_clusters=3).fit(X)
# 3. DBSCAN (Density-Based Spatial Clustering of Applications with Noise): dbscan = DBSCAN(eps=0.1, min_samples=5).fit(X)
# 4. Mean Shift 聚类 from sklearn.cluster import MeanShift,# Mean Shift 聚类,meanshift = MeanShift().fit(X),labels = meanshift.labels_,centers = meanshift.cluster_centers_
# 5. 高斯混合模型 (Gaussian Mixture Model, GMM),from sklearn.mixture import GaussianMixture,gmm = GaussianMixture(n_components=3, random_state=0).fit(X),labels = gmm.predict(X)
# 6. 评估自组织映射 (Self-Organizing Map, SOM),from minisom import MiniSom,som = MiniSom(x=10, y=10, input_len=3, sigma=0.3, learning_rate=0.5),som.random_weights_init(X),som.train_random(X, 100),labels = np.array([som.winner(x) for x in X])
# 7. Spectral Clustering,from sklearn.cluster import SpectralClustering,spectral = SpectralClustering(n_clusters=3, affinity='nearest_neighbors').fit(X),labels = spectral.labels_
# 8. Expectation-Maximization clustering: from sklearn.mixture import GaussianMixture; em = GaussianMixture(n_components=3, random_state=0).fit(X); labels = em.predict(X)

# 基于密度的
# 1. DBSCAN (Density-Based Spatial Clustering of Applications with Noise)，通过识别密度高的区域来形成簇。
# 2. OPTICS (Ordering Points To Identify the Clustering Structure)，与DBSCAN类似，但能够识别不同密度的簇。
    # from sklearn.cluster import OPTICS，optics = OPTICS(min_samples=5).fit(X)，labels = optics.labels_
# 3. HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise),是DBSCAN的改进版本，能够自动确定簇的数量，并且更好地处理不同密度的簇。
    # pip install hdbscan，import hdbscan，hdb = hdbscan.HDBSCAN(min_cluster_size=5).fit(X)，labels = hdb.labels_
# 4. Quickshift 通过找到数据点之间的密度峰值来形成簇。
    # pip install quickshift-tsne,from quickshift import quickshift,bandwidth = 0.1  # 设定带宽参数,cluster_ids = quickshift(X, kernel='gaussian', bandwidth=bandwidth)

# 基于峰值
# 1.Mean Shift,通过将数据点向密度最大的位置（均值）移动来形成簇。
# 2.K-medoids 聚类 (Partitioning Around Medoids PAM) 通过选择实际数据点作为簇中心来进行聚类。它对于处理噪声和异常值更加鲁棒。
    # pip install scikit-learn scikit-learn-extra; from sklearn_extra.cluster import KMedoids; kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X); labels = kmedoids.labels_; medoids = kmedoids.cluster_centers_
# 3 K-modes, K-means 的变体，专门用于处理分类数据（即离散数据）。与 K-means 使用均值作为质心不同，K-modes 使用众数作为质心。
# 4. K-means++ ,主要解决了初始簇中心选择的问题，通过更智能的初始化方法提高了聚类效果。
# 5. Density Peaks Clustering,首先计算每个点的局部密度，然后根据局部密度的峰值来形成簇。
    # from sklearn.metrics.pairwise import euclidean_distances
    # Output sub-folders (as used in the save_path strings above): Agglomerative, DBSCAN, DensityPeaksClustering, GaussianMixture, k_means, KMeansPlusPlus, KMedoids, MeanShift, OPTICS, SpectralClustering

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 598.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 598.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 598.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 598.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 598.0
Best run was number 1
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 594.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 594.0
Init: initializing centroids
Init: initia