In [None]:
import csv
import os
import numpy as np
import pandas as pd
import json
from sklearn.metrics.pairwise import cosine_similarity

# Read the annotated query records (a JSON document) from disk into `data`.
file_url = './data/20250905_AIMI线上标注正确数据4459549_9070879.json'
with open(file_url, mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

def deduplicate_dicts(dict_list, key):
    """Drop dicts whose ``key`` value was already seen, keeping the first one.

    Args:
        dict_list: Iterable of dicts to filter.
        key: Key whose value identifies duplicates. Dicts that lack the key
            are all treated as sharing the value ``None``.

    Returns:
        A new list with the surviving dicts (the same objects, not copies),
        in their original relative order.
    """
    first_by_value = {}
    for entry in dict_list:
        # setdefault only stores the entry when the value is new, so the first
        # occurrence wins; dict insertion order preserves the input order.
        first_by_value.setdefault(entry.get(key), entry)
    return list(first_by_value.values())

# Keep only the first record for every distinct query text.
data = deduplicate_dicts(data, 'query')

# Pull the two text columns out through a DataFrame.
df = pd.DataFrame(data)
query_list = df['query'].tolist()
rewritten_list = df['rewritten'].tolist()

# Pair each query with its rewritten form, one small dict per record.
final_data = [
    dict(query=original_text, rewritten=rewritten_text)
    for original_text, rewritten_text in zip(query_list, rewritten_list)
]
len(final_data)

In [None]:
from embeddingtools import get_embeddings

# Embed every deduplicated query with the project's embedding helper.
# NOTE(review): presumably returns one vector per query, in input order — confirm in embeddingtools.
embeddings = get_embeddings(query_list)


In [None]:
# ======================
# K-Means聚类分析模块
# ======================
from embeddingtools import embedding_L2_normalization
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
import os
# Limit the number of OpenMP threads.
# NOTE(review): this is assigned after the numeric libraries were imported —
# confirm the setting still takes effect at this point.
os.environ["OMP_NUM_THREADS"] = "1"
# Use a CJK-capable font and keep the minus sign renderable with it.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Normalise the embeddings before clustering.
embeddings_array = embedding_L2_normalization(embeddings)

# Elbow method: fit K-Means for every k in 1..max_clusters and record the
# within-cluster sum of squares (inertia) of each fit.
max_clusters = 50
wcss = []
for n_clusters in range(1, max_clusters + 1):
    print("进行聚簇数量为", n_clusters, "的测试")
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10)
    wcss.append(kmeans.fit(embeddings_array).inertia_)

# Drop in WCSS gained by going from k-1 to k clusters (k = 2..max_clusters).
diffs = [previous - current for previous, current in zip(wcss, wcss[1:])]

elbow_fig = plt.figure(figsize=(16, 6))

# Left panel: the elbow curve itself.
ax1 = elbow_fig.add_subplot(1, 2, 1)
ax1.plot(range(1, max_clusters + 1), wcss, marker='o', linestyle='--')
ax1.set(xlabel='簇数量', ylabel='WCSS', title='肘部法则确定最佳聚类数')

# Right panel: how much WCSS each additional cluster removes.
ax2 = elbow_fig.add_subplot(1, 2, 2)
ax2.bar(range(2, max_clusters + 1), diffs, color='orange')
ax2.set(xlabel='簇数量', ylabel='WCSS差值', title='相邻簇数WCSS变化量')

plt.tight_layout()
plt.show()

In [None]:
# ======================
# K-Means聚类分析模块
# ======================
from embeddingtools import embedding_L2_normalization
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
import os
# Limit the number of OpenMP threads.
# NOTE(review): this is assigned after the numeric libraries were imported —
# confirm the setting still takes effect at this point.
os.environ["OMP_NUM_THREADS"] = "1"
# Use a CJK-capable font and keep the minus sign renderable with it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# The cluster count is chosen interactively after inspecting the elbow plot.
selected_clusters = int(input("请输入您选择的聚类数量："))
embeddings_array = embedding_L2_normalization(embeddings)

# Labels are written back onto the records by position, so both sequences must
# line up exactly; fail loudly before the expensive fit if they do not.
if len(embeddings_array) != len(data):
    raise ValueError(
        f"embeddings ({len(embeddings_array)}) and records ({len(data)}) differ in length"
    )

# Fit the final K-Means model and get one label per embedding.
final_kmeans = KMeans(n_clusters=selected_clusters,
                      init='k-means++', random_state=42, n_init=10)
cluster_labels = final_kmeans.fit_predict(embeddings_array)

# Attach the label to each record as a plain Python int: numpy integer scalars
# cannot be serialised by the json module if the records are written back out.
for record, label in zip(data, cluster_labels):
    record['cluster'] = int(label)

# Visualise the clusters on the first two principal components.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings_array)

plt.figure(figsize=(12, 8))
scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                      c=cluster_labels, cmap='tab20', alpha=0.7)
plt.title(f'文本聚类可视化（{selected_clusters}个簇）')
plt.xlabel('主成分1')
plt.ylabel('主成分2')
plt.colorbar(scatter, label='簇编号')
plt.grid(True)
plt.show()

# 保存聚类结果
# df.to_csv('data/clustered_results.csv', index=False)

In [None]:
# Visualise the clusters on the first three principal components.
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings_array)
pc1, pc2, pc3 = embeddings_3d.T  # one row per principal component

fig_3d = plt.figure(figsize=(12, 10))
ax = fig_3d.add_subplot(projection='3d')
scatter = ax.scatter(pc1, pc2, pc3,
                     c=cluster_labels, cmap='tab20', alpha=0.7, s=20)
ax.set_title(f'文本聚类可视化（{selected_clusters}个簇）')
ax.set_xlabel('主成分1')
ax.set_ylabel('主成分2')
ax.set_zlabel('主成分3')
fig_3d.colorbar(scatter, ax=ax, label='簇编号')
ax.grid(True)
plt.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Plot at most 5000 points so the interactive figure stays responsive.
# The sample is drawn from a seeded generator (same seed as the K-Means fits)
# so the figure is identical on every run instead of changing with each draw.
sample_size = min(5000, len(embeddings_array))
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(embeddings_array), size=sample_size, replace=False)
sample_embeddings = embeddings_array[sample_indices]
sample_labels = cluster_labels[sample_indices]
sample_queries = [query_list[i] for i in sample_indices]

# Project only the sampled vectors onto two principal components.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(sample_embeddings)

# Interactive scatter plot; labels are cast to str so plotly treats the
# cluster id as a category, and the query text is shown on hover.
fig = px.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
                 color=sample_labels.astype(str),
                 hover_data={'query': sample_queries},
                 title=f'交互式聚类可视化（{selected_clusters}个簇）',
                 labels={'x': '主成分1', 'y': '主成分2', 'color': '簇编号'})

fig.update_traces(marker=dict(size=4, opacity=0.6))
fig.show()

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Count the samples in each cluster and order the (cluster id, count) pairs by id.
cluster_counts = Counter(cluster_labels)
sorted_counts = sorted(cluster_counts.items(), key=lambda pair: pair[0])

# Number of clusters that actually received samples.
print(f'总簇数: {len(sorted_counts)}')

bar_names = [f'簇{cluster_id}' for cluster_id, _ in sorted_counts]
bar_heights = [n_samples for _, n_samples in sorted_counts]

# Bar chart of cluster sizes.
plt.figure(figsize=(12, 6))
plt.bar(bar_names, bar_heights)
plt.xlabel('簇编号')
plt.ylabel('样本数量')
plt.title('用户原始问题的聚类分布')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Write the exact count slightly above each bar.
for position, n_samples in enumerate(bar_heights):
    plt.text(position, n_samples + 5, str(n_samples), ha='center')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import jieba
import wordcloud
from wordyun.getStop import get_stopwords

# One word-cloud panel per cluster, five panels per row. The grid is sized
# from the fitted model instead of a fixed 8x5 layout, so every cluster gets
# a panel whatever cluster count was chosen (40 clusters still gives 8x5).
n_wordclouds = final_kmeans.n_clusters
n_cols = 5
n_rows = max(1, -(-n_wordclouds // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows),
                         squeeze=False)
axes = axes.flatten()

# Group the query texts by the cluster each one was assigned to.
erased_query_list = [item['query'] for item in data]
cluster_queries = {}
for cluster_id, query in zip(cluster_labels, erased_query_list):
    cluster_queries.setdefault(cluster_id, []).append(query)

stopwords = get_stopwords()

for i, ax in enumerate(axes):
    ax.axis('off')
    cluster_query = cluster_queries.get(i)
    if not cluster_query:
        # Spare grid cell, or a cluster that received no query: leave the
        # panel blank rather than failing on a missing key.
        continue

    # Segment the concatenated queries with jieba and re-join the tokens with
    # spaces before handing the text to WordCloud.
    s = ' '.join(cluster_query)
    ls = jieba.lcut(s)
    text = ' '.join(ls)

    wc = wordcloud.WordCloud(
        font_path='./wordyun/FZKTK.TTF',
        width=400,   # small canvas so the cloud fits a grid panel
        height=300,
        background_color='white',
        max_words=50,  # cap the vocabulary to keep small panels legible
        stopwords=stopwords
    )

    wc.generate(text)
    ax.imshow(wc, interpolation='bilinear')
    ax.set_title(f'簇{i}')

plt.tight_layout()
plt.show()

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd

def _read_query_csv(csv_path, banner):
    """Read one CSV, print ``banner`` with its row count, return (frame, queries)."""
    frame = pd.read_csv(csv_path)
    queries = frame['query'].tolist()
    print(banner, len(queries))
    return frame, queries


# Load the three SFT query sets one after another.
file_url = 'data/now/jiaoyi.csv'
df_jy, query_list_jy = _read_query_csv(file_url, "交易数据集数目")

file_url = 'data/now/shangyehua.csv'
df_syh, query_list_syh = _read_query_csv(file_url, "商业数据集数目")

file_url = 'data/now/shequ.csv'
df_sq, query_list_sq = _read_query_csv(file_url, "社区数据集数目")

# Concatenate the three query lists into one.
sft_query_list = [*query_list_jy, *query_list_syh, *query_list_sq]


In [None]:
# Print the positions of missing queries. The lists come from pandas, which
# represents empty CSV cells as NaN (a float) rather than None, so an
# `is None` test never matches; pd.isna covers both None and NaN.
for idx, query in enumerate(sft_query_list):
    if pd.isna(query):
        print(idx)

In [None]:
%load_ext autoreload
%autoreload 2
from embeddingtools import get_embeddings
# Embed the combined SFT queries with the same helper used for the original queries.
# NOTE(review): presumably returns one vector per query, in input order — confirm in embeddingtools.
sft_embeddings = get_embeddings(sft_query_list)

In [None]:
# final_kmeans was fitted on L2-normalised vectors, so new vectors must go
# through the same normalisation before they are assigned to a centroid;
# otherwise distances are computed in a different scale than the fit.
sft_labels = final_kmeans.predict(embedding_L2_normalization(sft_embeddings))


In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Count the SFT samples in each cluster and order the (cluster id, count) pairs by id.
cluster_counts = Counter(sft_labels)
sorted_counts = sorted(cluster_counts.items(), key=lambda pair: pair[0])

# Number of clusters that actually received SFT samples.
print(f'总簇数: {len(sorted_counts)}')

bar_names = [f'簇{cluster_id}' for cluster_id, _ in sorted_counts]
bar_heights = [n_samples for _, n_samples in sorted_counts]

# Bar chart of cluster sizes.
plt.figure(figsize=(12, 6))
plt.bar(bar_names, bar_heights)
plt.xlabel('簇编号')
plt.ylabel('样本数量')
plt.title('SFT数据问题的聚类分布')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Write the exact count slightly above each bar.
for position, n_samples in enumerate(bar_heights):
    plt.text(position, n_samples + 5, str(n_samples), ha='center')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Absolute per-cluster sample counts: original queries vs. SFT queries.
cluster_counts = Counter(cluster_labels)
sft_counts = Counter(sft_labels)

# Take the cluster id range from the fitted model rather than assuming 40, so
# the comparison stays complete whatever cluster count was selected.
n_clusters = final_kmeans.n_clusters
all_clusters = range(n_clusters)
cluster_axis_label = f'簇编号 (0-{n_clusters - 1})'

# Clusters that received no sample in one of the sets count as 0.
cluster_values = [cluster_counts.get(cluster, 0) for cluster in all_clusters]
sft_values = [sft_counts.get(cluster, 0) for cluster in all_clusters]

# Grouped bar chart, one pair of bars per cluster.
plt.figure(figsize=(15, 8))
bar_width = 0.35
x_pos = np.arange(len(all_clusters))

plt.bar(x_pos - bar_width/2, cluster_values, bar_width,
        label='Cluster Labels', alpha=0.7, color='blue')
plt.bar(x_pos + bar_width/2, sft_values, bar_width,
        label='SFT Labels', alpha=0.7, color='orange')

plt.xlabel(cluster_axis_label)
plt.ylabel('样本数量')
plt.title('Cluster Labels vs SFT Labels 分布对比')
plt.xticks(x_pos, all_clusters)
plt.legend()
plt.grid(True, alpha=0.3)

# Write the count above every bar.
for i, v in enumerate(cluster_values):
    plt.text(i - bar_width/2, v + 0.5, str(v),
             ha='center', va='bottom', fontsize=8)

for i, v in enumerate(sft_values):
    plt.text(i + bar_width/2, v + 0.5, str(v),
             ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

# Per-cluster absolute difference between the two sets.
print("分布差异统计:")
for cluster, (cluster_n, sft_n) in enumerate(zip(cluster_values, sft_values)):
    diff = abs(cluster_n - sft_n)
    print(f"簇 {cluster}: Cluster Labels={cluster_n}, SFT Labels={sft_n}, 差异={diff}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Share of samples per cluster: original queries vs. SFT queries.
cluster_counts = Counter(cluster_labels)
sft_counts = Counter(sft_labels)

# Totals used as denominators for the percentages.
total_cluster = len(cluster_labels)
total_sft = len(sft_labels)

# Take the cluster id range from the fitted model rather than assuming 40, so
# every cluster is included and the percentages add up to 100.
n_clusters = final_kmeans.n_clusters
all_clusters = range(n_clusters)
cluster_axis_label = f'簇编号 (0-{n_clusters - 1})'

# Percentage of each set that falls into each cluster (0 when absent).
cluster_percent = [cluster_counts.get(cluster, 0) / total_cluster * 100
                   for cluster in all_clusters]
sft_percent = [sft_counts.get(cluster, 0) / total_sft * 100
               for cluster in all_clusters]

# Grouped bar chart of the two percentage distributions.
plt.figure(figsize=(16, 8))
bar_width = 0.35
x_pos = np.arange(len(all_clusters))

plt.bar(x_pos - bar_width/2, cluster_percent, bar_width,
        label=f'Cluster Labels (n={total_cluster})', alpha=0.7, color='blue')
plt.bar(x_pos + bar_width/2, sft_percent, bar_width,
        label=f'SFT Labels (n={total_sft})', alpha=0.7, color='orange')

plt.xlabel(cluster_axis_label)
plt.ylabel('百分比分布 (%)')
plt.title('Cluster Labels vs SFT Labels 百分比分布对比')
plt.xticks(x_pos, all_clusters)
plt.legend()
plt.grid(True, alpha=0.3)

# Write the percentage above every bar.
for i, v in enumerate(cluster_percent):
    plt.text(i - bar_width/2, v + 0.5,
             f'{v:.1f}%', ha='center', va='bottom', fontsize=8)

for i, v in enumerate(sft_percent):
    plt.text(i + bar_width/2, v + 0.5,
             f'{v:.1f}%', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

# Per-cluster absolute difference in percentage points.
print("百分比分布差异统计:")
for cluster, (cluster_pct, sft_pct) in enumerate(zip(cluster_percent, sft_percent)):
    diff_pct = abs(cluster_pct - sft_pct)
    print(f"簇 {cluster}: Cluster={cluster_pct:.1f}%, SFT={sft_pct:.1f}%, 差异={diff_pct:.1f}个百分点")

# Signed difference chart: red where the original queries are over-represented
# relative to the SFT set, green otherwise.
plt.figure(figsize=(15, 6))
differences = [cluster_pct - sft_pct
               for cluster_pct, sft_pct in zip(cluster_percent, sft_percent)]

plt.bar(all_clusters, differences, color=[
        'red' if diff > 0 else 'green' for diff in differences], alpha=0.7)
plt.xlabel(cluster_axis_label)
plt.ylabel('百分比点差异 (Cluster - SFT)')
plt.title('Cluster Labels 与 SFT Labels 百分比点差异')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.grid(True, alpha=0.3)

# Label each bar with its signed value, above positive bars and below the rest.
for i, v in enumerate(differences):
    plt.text(i, v + (0.1 if v > 0 else -0.5), f'{v:+.1f}',
             ha='center', va='bottom' if v > 0 else 'top', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Load the batch-evaluation report and embed the text of its 'Query' column.
test_file_url = './data/batch_evaluate_report_11937.xlsx'
test_data = pd.read_excel(test_file_url)
test_query = test_data['Query'].tolist()
test_embedding = get_embeddings(test_query)


In [None]:
# final_kmeans was fitted on L2-normalised vectors, so the test vectors must go
# through the same normalisation before they are assigned to a centroid.
test_labels = final_kmeans.predict(embedding_L2_normalization(test_embedding))


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Share of samples per cluster: original queries vs. the test-report queries.
cluster_counts = Counter(cluster_labels)
test_data_counts = Counter(test_labels)

# Totals used as denominators for the percentages.
total_cluster = len(cluster_labels)
total_test = len(test_labels)

# Take the cluster id range from the fitted model rather than assuming 40, so
# every cluster is included and the percentages add up to 100.
n_clusters = final_kmeans.n_clusters
all_clusters = range(n_clusters)
cluster_axis_label = f'簇编号 (0-{n_clusters - 1})'

# Percentage of each set that falls into each cluster (0 when absent).
cluster_percent = [cluster_counts.get(cluster, 0) / total_cluster * 100
                   for cluster in all_clusters]
test_data_percent = [test_data_counts.get(cluster, 0) / total_test * 100
                     for cluster in all_clusters]

# Grouped bar chart of the two percentage distributions.
plt.figure(figsize=(16, 8))
bar_width = 0.35
x_pos = np.arange(len(all_clusters))

plt.bar(x_pos - bar_width/2, cluster_percent, bar_width,
        label=f'Cluster Labels (n={total_cluster})', alpha=0.7, color='blue')
# The second series is the test data: report its own sample size (total_test,
# not the SFT total) and name it accordingly in the legend.
plt.bar(x_pos + bar_width/2, test_data_percent, bar_width,
        label=f'Test Data (n={total_test})', alpha=0.7, color='orange')

plt.xlabel(cluster_axis_label)
plt.ylabel('百分比分布 (%)')
plt.title('Cluster Labels vs Test Data 百分比分布对比')
plt.xticks(x_pos, all_clusters)
plt.legend()
plt.grid(True, alpha=0.3)

# Write the percentage above every bar.
for i, v in enumerate(cluster_percent):
    plt.text(i - bar_width/2, v + 0.5,
             f'{v:.1f}%', ha='center', va='bottom', fontsize=8)

for i, v in enumerate(test_data_percent):
    plt.text(i + bar_width/2, v + 0.5,
             f'{v:.1f}%', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

# Per-cluster absolute difference in percentage points.
print("百分比分布差异统计:")
for cluster, (cluster_pct, test_data_pct) in enumerate(zip(cluster_percent, test_data_percent)):
    diff_pct = abs(cluster_pct - test_data_pct)
    print(f"簇 {cluster}: Cluster={cluster_pct:.1f}%, Test={test_data_pct:.1f}%, 差异={diff_pct:.1f}个百分点")

# Signed difference chart: red where the original queries are over-represented
# relative to the test data, green otherwise.
plt.figure(figsize=(15, 6))
differences = [cluster_pct - test_data_pct
               for cluster_pct, test_data_pct in zip(cluster_percent, test_data_percent)]

plt.bar(all_clusters, differences, color=[
        'red' if diff > 0 else 'green' for diff in differences], alpha=0.7)
plt.xlabel(cluster_axis_label)
plt.ylabel('百分比点差异 (Cluster - Test)')
plt.title('Cluster Labels 与 Test Data 百分比点差异')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.grid(True, alpha=0.3)

# Label each bar with its signed value, above positive bars and below the rest.
for i, v in enumerate(differences):
    plt.text(i, v + (0.1 if v > 0 else -0.5), f'{v:+.1f}',
             ha='center', va='bottom' if v > 0 else 'top', fontsize=8)

plt.tight_layout()
plt.show()