In [None]:
import json
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer
import hdbscan
import numpy as np
import torch

# Use the GPU when torch reports one is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load the input JSON; the loop below iterates over data['groups'].
with open('improved_similarity_analysis_results.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Load the sentence-embedding model on the selected device.
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Global statistics: number of noise points and their summed frequency.
total_noise_count = 0
total_noise_frequency = 0

# Every (name, frequency) pair that was labelled as noise, across all groups.
all_noise_details = []

# Clustering result of each processed group, keyed by group name.
group_results = {}

# Cluster the type names of every group independently.
for group_name, group_data in data['groups'].items():
    # Accumulate frequencies per cleaned name, so names that differ only by
    # hyphens are merged into a single entry.
    type_dict = {}
    if 'types' in group_data:
        for type_item in group_data['types']:
            if 'name' in type_item and 'frequency' in type_item:
                cleaned_name = type_item['name'].replace('-', '')
                type_dict[cleaned_name] = type_dict.get(cleaned_name, 0) + type_item['frequency']

    # Nothing to cluster: skip the group (it gets no entry in group_results).
    if not type_dict:
        print(f"Group '{group_name}' has no types to cluster.")
        continue

    type_names = list(type_dict)
    # (name, frequency) pairs in the same order as type_names.
    type_details = list(type_dict.items())

    print(f"\nProcessing group: {group_name}")

    # Embed the names with the progress bar disabled.
    embeddings = model.encode(type_names, batch_size=32, show_progress_bar=False, convert_to_numpy=True)

    # Pairwise cosine distances, cast to float64.
    # NOTE(review): the cast is presumably required by the precomputed-metric
    # path of HDBSCAN — confirm against the hdbscan documentation.
    distance_matrix = cosine_distances(embeddings).astype(np.float64)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,  # can be tuned as needed
        min_samples=1,       # more permissive
        cluster_selection_epsilon=0,
        metric='precomputed'
    )
    cluster_labels = clusterer.fit_predict(distance_matrix)

    # Bucket the (name, frequency) pairs by label. Labels are cast to plain
    # int so the stored cluster ids can be written by json without conversion.
    clusters = {}
    for detail, lbl in zip(type_details, cluster_labels):
        clusters.setdefault(int(lbl), []).append(detail)

    # "noise" is always a dict, so every group exposes the same schema whether
    # or not any of its points were labelled as noise.
    group_results[group_name] = {
        "clusters": [],
        "noise": {"count": 0, "total_frequency": 0, "details": []},
    }

    # Record and print the result for this group; label -1 is treated as noise.
    for cid, items in clusters.items():
        bucket_frequency = sum(freq for _, freq in items)

        if cid == -1:
            group_noise_count = len(items)
            total_noise_count += group_noise_count
            total_noise_frequency += bucket_frequency
            all_noise_details.extend(items)

            group_results[group_name]["noise"] = {
                "count": group_noise_count,
                "total_frequency": bucket_frequency,
                "details": items
            }

            print(f"Noise (Total Frequency: {bucket_frequency}, {group_noise_count} items):")
        else:
            group_results[group_name]["clusters"].append({
                "cluster_id": cid,
                "total_frequency": bucket_frequency,
                "types": items
            })

            print(f"Cluster {cid} (Total Frequency: {bucket_frequency}):")

        for name, freq in items:
            print(f"  {name} (frequency: {freq})")

# Print the noise statistics accumulated over all groups.
print("\n=== Global Noise Summary ===")
print(f"Total Noise Points: {total_noise_count}")
print(f"Total Noise Frequency: {total_noise_frequency}")
print("All Noise Details:")
for name, freq in all_noise_details:
    print(f"  {name} (frequency: {freq})")

# Recursive converter from numpy-flavoured containers to plain Python objects.
def convert_to_native(obj):
    """Return a copy of *obj* in which numpy values are replaced by built-ins.

    Dicts, lists and tuples are walked recursively (dict keys are converted
    as well, because json rejects numpy scalar keys). numpy booleans,
    integers and floats become ``bool``, ``int`` and ``float``; numpy arrays
    become nested lists via ``tolist()``. Any other object is returned
    unchanged.

    Tuples stay tuples, so callers that compare or unpack the result see the
    same structure they passed in.
    """
    if isinstance(obj, dict):
        return {convert_to_native(key): convert_to_native(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_to_native(item) for item in obj]
    if isinstance(obj, tuple):
        # e.g. the (name, frequency) pairs stored in the clustering results
        return tuple(convert_to_native(item) for item in obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Convert everything in group_results to plain Python types before dumping.
# group_results is a dict, so int(group_results) would raise TypeError; the
# recursive converter defined above is the intended call. The results may
# hold numpy scalars (e.g. cluster ids taken from the label array).
group_results_native = convert_to_native(group_results)

# Save the per-group clustering results to a JSON file.
output_file = "group_clustering_results.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(group_results_native, file, indent=4, ensure_ascii=False)

print(f"\nGroup clustering results saved to '{output_file}'")



Using device: cuda





Processing group: object image
Cluster 318 (Total Frequency: 157):
  object image (frequency: 139)
  general image (frequency: 12)
  common object image (frequency: 1)
  object mesh image (frequency: 1)
  object instance image (frequency: 1)
  co occurring object image (frequency: 1)
  object model image (frequency: 1)
  general object image (frequency: 1)
Cluster 399 (Total Frequency: 160):
  rgb image (frequency: 106)
  rgbd image (frequency: 50)
  srgb image (frequency: 3)
  rgb color image (frequency: 1)
Cluster 410 (Total Frequency: 123):
  face image (frequency: 105)
  facial image (frequency: 14)
  human face image (frequency: 1)
  real face image (frequency: 1)
  visual face image (frequency: 1)
  face features image (frequency: 1)
Noise (Total Frequency: 887, 580 items):
  scene image (frequency: 78)
  pedestrian image (frequency: 25)
  color image (frequency: 15)
  remote sensing image (frequency: 15)
  object tracking sequence (frequency: 14)
  scene text image (frequency: 

In [18]:
import json
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer
import hdbscan
import numpy as np
import torch

# Use the GPU when torch reports one is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load the input JSON; the loop below iterates over data['groups'].
with open('improved_similarity_analysis_results.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Load the sentence-embedding model on the selected device.
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Per-group output: primary clusters, secondary clusters and leftover noise.
final_results = {}

for group_name, group_data in data['groups'].items():
    # Accumulate frequencies per cleaned name, so names that differ only by
    # hyphens are merged into a single entry.
    type_dict = {}
    if 'types' in group_data:
        for type_item in group_data['types']:
            if 'name' in type_item and 'frequency' in type_item:
                cleaned_name = type_item['name'].replace('-', '')
                type_dict[cleaned_name] = type_dict.get(cleaned_name, 0) + type_item['frequency']

    # A group without usable types still gets an (empty) entry in the output.
    if not type_dict:
        print(f"Group '{group_name}' has no types to cluster.")
        final_results[group_name] = {
            "clusters": [],
            "secondary_clusters": [],
            "remaining_noise": {
                "total_frequency": 0,
                "types": []
            }
        }
        continue

    type_names = list(type_dict)
    # (name, frequency) pairs in the same order as type_names.
    type_details = list(type_dict.items())

    # First pass: embed the names and cluster on pairwise cosine distance.
    embeddings = model.encode(type_names, batch_size=32, show_progress_bar=False, convert_to_numpy=True)
    distance_matrix = cosine_distances(embeddings).astype(np.float64)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.1,
        metric='precomputed'
    )
    cluster_labels = clusterer.fit_predict(distance_matrix)

    # Bucket the (name, frequency) pairs by label; label -1 is treated as noise.
    clusters = {}
    for detail, lbl in zip(type_details, cluster_labels):
        clusters.setdefault(int(lbl), []).append(detail)

    noise = clusters.get(-1, [])

    # Rank the real clusters by descending total frequency (the sort is
    # stable, so ties keep their first-seen order) and renumber them from 0.
    ranked_primary = sorted(
        (items for cid, items in clusters.items() if cid != -1),
        key=lambda items: -sum(freq for _, freq in items)
    )
    if not ranked_primary:
        print(f"Group '{group_name}' has no valid clusters, all points are noise.")
    sorted_clusters = [
        {
            "cluster_id": new_id,
            "total_frequency": sum(freq for _, freq in items),
            "types": items
        }
        for new_id, items in enumerate(ranked_primary)
    ]

    secondary_clusters = []
    remaining_noise = []

    # Second pass: try to cluster the first-pass noise with a looser epsilon.
    if noise:
        print(f"\nRe-clustering noise for group: {group_name}")

        if len(noise) == 1:
            # A single point cannot form a cluster; keep it as leftover noise.
            print(f"Only one noise point in group '{group_name}', skipping re-clustering.")
            remaining_noise.extend(noise)
        else:
            noise_names = [name for name, _ in noise]
            noise_embeddings = model.encode(noise_names, batch_size=32, show_progress_bar=False, convert_to_numpy=True)
            noise_distance_matrix = cosine_distances(noise_embeddings).astype(np.float64)

            secondary_clusterer = hdbscan.HDBSCAN(
                min_cluster_size=2,
                min_samples=1,
                cluster_selection_epsilon=0.2,
                metric='precomputed'
            )
            secondary_labels = secondary_clusterer.fit_predict(noise_distance_matrix)

            secondary_groups = {}
            for detail, lbl in zip(noise, secondary_labels):
                secondary_groups.setdefault(int(lbl), []).append(detail)

            # Points that are still noise after the second pass.
            remaining_noise.extend(secondary_groups.pop(-1, []))

            ranked_secondary = sorted(
                secondary_groups.values(),
                key=lambda items: -sum(freq for _, freq in items)
            )

            # Secondary ids continue after the last primary id so the two
            # lists never share a cluster_id. Previously the offset was added
            # to the raw labels and then lost when the clusters were
            # renumbered from 0.
            offset = len(sorted_clusters)
            for new_id, items in enumerate(ranked_secondary, start=offset):
                secondary_clusters.append({
                    "cluster_id": new_id,
                    "total_frequency": sum(freq for _, freq in items),
                    "types": items
                })

    # Store this group's result in the final output.
    final_results[group_name] = {
        "clusters": sorted_clusters,
        "secondary_clusters": secondary_clusters,
        "remaining_noise": {
            "total_frequency": sum(freq for _, freq in remaining_noise),
            "types": remaining_noise
        }
    }

# Fallback encoder for json.dump: maps numpy values to built-in equivalents.
def convert_numpy(obj):
    """Convert a numpy value that json cannot encode into a built-in one.

    numpy arrays are turned into (nested) lists, numpy integers into ``int``
    and numpy floats into ``float``.

    Raises:
        TypeError: if *obj* is none of the numpy kinds handled here.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    for numpy_kind, caster in ((np.integer, int), (np.floating, float)):
        if isinstance(obj, numpy_kind):
            return caster(obj)
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Write final_results to a JSON file. convert_numpy is passed as the `default`
# hook, so json calls it for any value it cannot encode natively.
output_file = "final_clustering_results.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(final_results, file, indent=4, ensure_ascii=False, default=convert_numpy)

print(f"\nFinal clustering results saved to '{output_file}'")


Using device: cuda





Re-clustering noise for group: object image

Re-clustering noise for group: dataset description

Re-clustering noise for group: annotation

Re-clustering noise for group: imu data

Re-clustering noise for group: sentence

Re-clustering noise for group: question

Re-clustering noise for group: egocentric video

Re-clustering noise for group: depth map

Re-clustering noise for group: label

Re-clustering noise for group: speech recording

Re-clustering noise for group: social media post

Re-clustering noise for group: multilingual text

Re-clustering noise for group: 3d point cloud

Re-clustering noise for group: news article

Re-clustering noise for group: indoor scene

Re-clustering noise for group: metadata

Re-clustering noise for group: demographic information

Re-clustering noise for group: research paper

Re-clustering noise for group: action sequence

Re-clustering noise for group: segmentation mask

Re-clustering noise for group: caption

Re-clustering noise for group: annotate

In [20]:
from keybert import KeyBERT

# Build a KeyBERT extractor around `model`, which must already be defined
# (it is not created in this block).
kw_model = KeyBERT(model)

# Read the clustering results that will be annotated with keywords.
input_file = "final_clustering_results.json"
with open(input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

# For every group, annotate the primary clusters first and the secondary
# clusters second; both sections are handled identically.
for group_name, group_data in data.items():
    for section in ("clusters", "secondary_clusters"):
        if section not in group_data:
            continue
        for cluster in group_data[section]:
            # Join all type names of the cluster into one text.
            type_names_in_cluster = [type_name for type_name, _ in cluster["types"]]
            cluster_text = " ".join(type_names_in_cluster)
            # Extract keywords with KeyBERT and store only the keyword text
            # (the second element of each returned pair is dropped).
            keywords = kw_model.extract_keywords(cluster_text, keyphrase_ngram_range=(1, 2), top_n=3)
            cluster["keywords"] = [keyword for keyword, _ in keywords]

# Save the annotated results to a new JSON file.
output_file = "final_clustering_results_with_keywords.json"
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print(f"\nUpdated JSON with keywords saved to '{output_file}'")



Updated JSON with keywords saved to 'final_clustering_results_with_keywords.json'


In [1]:
import json
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer
import hdbscan
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

# Load the keyword-annotated clustering results into `data`; the code below
# iterates over it as a mapping of group name -> group result.
with open('final_clustering_results_with_keywords.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Summarise one group's clusters, secondary clusters and remaining noise.
def count_clusters_types_frequency(group):
    """Return count and frequency statistics for a single group.

    *group* is a mapping that may contain ``'clusters'`` and
    ``'secondary_clusters'`` (lists of cluster mappings with optional
    ``'types'`` and ``'total_frequency'``) and ``'remaining_noise'`` (a
    mapping with optional ``'types'`` and ``'total_frequency'``). Missing
    entries count as empty / zero.

    Returns a dict with eight keys: cluster count, type count and summed
    frequency for the primary and the secondary clusters, plus the count and
    frequency of the remaining noise.
    """
    def tally(section):
        # (number of clusters, number of types, summed total_frequency)
        entries = group.get(section, [])
        type_total = 0
        frequency_total = 0
        for entry in entries:
            type_total += len(entry.get('types', []))
            frequency_total += entry.get('total_frequency', 0)
        return len(entries), type_total, frequency_total

    primary_clusters, primary_types, primary_frequency = tally('clusters')
    extra_clusters, extra_types, extra_frequency = tally('secondary_clusters')
    leftover = group.get('remaining_noise', {})

    return {
        'clusters_count': primary_clusters,
        'types_count': primary_types,
        'total_frequency': primary_frequency,
        'secondary_clusters_count': extra_clusters,
        'secondary_types_count': extra_types,
        'secondary_total_frequency': extra_frequency,
        'remaining_noise_count': len(leftover.get('types', [])),
        'remaining_noise_frequency': leftover.get('total_frequency', 0)
    }

# Report the statistics of every group: a header line, one line per counter,
# then a blank separator line.
for group_name, group_data in data.items():
    stats = count_clusters_types_frequency(group_data)
    report_lines = [
        f"Group: {group_name}",
        f"Clusters count: {stats['clusters_count']}",
        f"Types count: {stats['types_count']}",
        f"Total frequency: {stats['total_frequency']}",
        f"Secondary clusters count: {stats['secondary_clusters_count']}",
        f"Secondary types count: {stats['secondary_types_count']}",
        f"Secondary total frequency: {stats['secondary_total_frequency']}",
        f"Remaining noise count: {stats['remaining_noise_count']}",
        f"Remaining noise frequency: {stats['remaining_noise_frequency']}",
    ]
    for line in report_lines:
        print(line)
    print()

Group: object image
Clusters count: 430
Types count: 1340
Total frequency: 3242
Secondary clusters count: 112
Secondary types count: 327
Secondary total frequency: 598
Remaining noise count: 251
Remaining noise frequency: 285

Group: dataset description
Clusters count: 69
Types count: 195
Total frequency: 1145
Secondary clusters count: 2
Secondary types count: 132
Secondary total frequency: 272
Remaining noise count: 7
Remaining noise frequency: 7

Group: annotation
Clusters count: 118
Types count: 339
Total frequency: 973
Secondary clusters count: 26
Secondary types count: 74
Secondary total frequency: 102
Remaining noise count: 148
Remaining noise frequency: 198

Group: imu data
Clusters count: 126
Types count: 351
Total frequency: 640
Secondary clusters count: 41
Secondary types count: 139
Secondary total frequency: 172
Remaining noise count: 103
Remaining noise frequency: 115

Group: sentence
Clusters count: 84
Types count: 217
Total frequency: 590
Secondary clusters count: 14
Seco

In [3]:
import json

# Load the keyword-annotated clustering results into `data`; only its values
# (one result mapping per group) are used by the aggregation below.
with open('final_clustering_results_with_keywords.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Module-level running totals, updated by count_clusters_types_frequency().
total_clusters_count = total_types_count = total_frequency = 0
total_secondary_clusters_count = total_secondary_types_count = total_secondary_frequency = 0
total_remaining_noise_count = total_remaining_noise_frequency = 0


# Add one group's statistics to the module-level totals.
def count_clusters_types_frequency(group):
    """Accumulate the statistics of *group* into the global counters.

    *group* is a mapping that may contain ``'clusters'`` and
    ``'secondary_clusters'`` (lists of cluster mappings with optional
    ``'types'`` and ``'total_frequency'``) and ``'remaining_noise'`` (a
    mapping with optional ``'types'`` and ``'total_frequency'``). Missing
    entries count as empty / zero. Nothing is returned; the eight
    ``total_*`` globals are increased in place.
    """
    global total_clusters_count, total_types_count, total_frequency
    global total_secondary_clusters_count, total_secondary_types_count, total_secondary_frequency
    global total_remaining_noise_count, total_remaining_noise_frequency

    def tally(section):
        # (number of clusters, number of types, summed total_frequency)
        entries = group.get(section, [])
        type_total = 0
        frequency_sum = 0
        for entry in entries:
            type_total += len(entry.get('types', []))
            frequency_sum += entry.get('total_frequency', 0)
        return len(entries), type_total, frequency_sum

    # Compute every figure first so the totals are only touched once all of
    # them are available.
    primary = tally('clusters')
    secondary = tally('secondary_clusters')
    leftover = group.get('remaining_noise', {})
    leftover_count = len(leftover.get('types', []))
    leftover_frequency = leftover.get('total_frequency', 0)

    total_clusters_count += primary[0]
    total_types_count += primary[1]
    total_frequency += primary[2]
    total_secondary_clusters_count += secondary[0]
    total_secondary_types_count += secondary[1]
    total_secondary_frequency += secondary[2]
    total_remaining_noise_count += leftover_count
    total_remaining_noise_frequency += leftover_frequency

# Feed every group into the accumulator; the totals live in module-level
# counters that count_clusters_types_frequency() updates.
for group_data in data.values():
    count_clusters_types_frequency(group_data)

# Print the aggregated statistics, one line per counter.
summary_lines = [
    f"Total clusters count: {total_clusters_count}",
    f"Total types count: {total_types_count}",
    f"Total frequency: {total_frequency}",
    f"Total secondary clusters count: {total_secondary_clusters_count}",
    f"Total secondary types count: {total_secondary_types_count}",
    f"Total secondary frequency: {total_secondary_frequency}",
    f"Total remaining noise count: {total_remaining_noise_count}",
    f"Total remaining noise frequency: {total_remaining_noise_frequency}",
]
for line in summary_lines:
    print(line)

Total clusters count: 1879
Total types count: 5537
Total frequency: 13314
Total secondary clusters count: 434
Total secondary types count: 1859
Total secondary frequency: 3247
Total remaining noise count: 2015
Total remaining noise frequency: 3029
