In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

# 定义测试集1
categories1 = ['comp.graphics', 'rec.sport.baseball', 'sci.med']
data1 = fetch_20newsgroups(categories=categories1, subset='all', shuffle=True, random_state=42)

# 定义测试集2
categories2 = ['talk.politics.misc', 'rec.autos', 'sci.space']
data2 = fetch_20newsgroups(categories=categories2, subset='all', shuffle=True, random_state=42)

# 提取特征向量
vectorizer = TfidfVectorizer()
X1 = vectorizer.fit_transform(data1.data)
X2 = vectorizer.transform(data2.data)

# K-means聚类
k = len(data1.target_names)  # 类别数
kmeans1 = KMeans(n_clusters=k, random_state=42)
kmeans2 = KMeans(n_clusters=k, random_state=42)
kmeans1.fit(X1)
kmeans2.fit(X2)

# 预测聚类标签
labels1 = kmeans1.labels_
labels2 = kmeans2.labels_

# 显示聚类结果
print("Test Set 1 Clustering Result:")
for i in range(k):
    cluster_docs = [data1.data[j] for j in range(len(data1.data)) if labels1[j] == i]
    print(f"Cluster {i}:")
    for doc in cluster_docs[:5]:  # 打印每个聚类簇中的前5个文档
        print(doc)
    print()

print("Test Set 2 Clustering Result:")
for i in range(k):
    cluster_docs = [data2.data[j] for j in range(len(data2.data)) if labels2[j] == i]
    print(f"Cluster {i}:")
    for doc in cluster_docs[:5]:  # 打印每个聚类簇中的前5个文档
        print(doc)
    print()

# 计算聚类准确度
accuracy1 = accuracy_score(data1.target, labels1)
accuracy2 = accuracy_score(data2.target, labels2)

# 打印聚类准确度
print("Test Set 1 Clustering Accuracy:", accuracy1)
print("Test Set 2 Clustering Accuracy:", accuracy2)


Test Set 1 Clustering Result:
Cluster 0:
From: geb@cs.pitt.edu (Gordon Banks)
Subject: Re: OB-GYN residency
Reply-To: geb@cs.pitt.edu (Gordon Banks)
Organization: Univ. of Pittsburgh Computer Science
Lines: 28

In article <1993Apr12.231544.5990@cnsvax.uwec.edu> nyeda@cnsvax.uwec.edu (David Nye) writes:

> 
>I believe it is illegal for a residency to discriminate against FMGs.  I


Is that true?  I know some that won't even interview FMGs.  
Most programs discriminate, in that given an FMG equally
qualified as an American they will take the American.  What
rights do they actually have?  Does it matter if they are
US citizens (most are not)?  We have had good luck with FMGs
and bad luck.  SOme of our very best residents have been FMGs.
Also, our very worst.  As it turns out, the worst FMGs are often
US citizens that studied in off-shore medical schools.  Of the
5 residents fired for incompetence in the 12 years I've been here 
in my department, all have been FMGs.  3 were US citizens who

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

# 定义测试集1
categories1 = ['comp.graphics', 'rec.sport.baseball', 'sci.med']
data1 = fetch_20newsgroups(categories=categories1, subset='all', shuffle=True, random_state=42)

# 定义测试集2
categories2 = ['talk.politics.misc', 'rec.autos', 'sci.space']
data2 = fetch_20newsgroups(categories=categories2, subset='all', shuffle=True, random_state=42)

# 提取特征向量
vectorizer = TfidfVectorizer()
X1 = vectorizer.fit_transform(data1.data)
X2 = vectorizer.transform(data2.data)

# K-means聚类
k = len(data1.target_names)  # 类别数
kmeans1 = KMeans(n_clusters=k, random_state=42)
kmeans2 = KMeans(n_clusters=k, random_state=42)
kmeans1.fit(X1)
kmeans2.fit(X2)

# 预测聚类标签
labels1 = kmeans1.labels_
labels2 = kmeans2.labels_

# 显示聚类结果和文档标签
print("Test Set 1")
print("--------------")
for i in range(len(data1.data)):
    print(f"Document: {i+1}\tCluster: {labels1[i]}\tLabel: {data1.target[i]}")

print("\nTest Set 2")
print("--------------")
for i in range(len(data2.data)):
    print(f"Document: {i+1}\tCluster: {labels2[i]}\tLabel: {data2.target[i]}")

# 计算聚类准确度
accuracy1 = accuracy_score(data1.target, labels1)
accuracy2 = accuracy_score(data2.target, labels2)

# 打印聚类准确度
print("Test Set 1 Clustering Accuracy:", accuracy1)
print("Test Set 2 Clustering Accuracy:", accuracy2)

Test Set 1
--------------
Document: 1	Cluster: 0	Label: 2
Document: 2	Cluster: 2	Label: 0
Document: 3	Cluster: 1	Label: 1
Document: 4	Cluster: 0	Label: 2
Document: 5	Cluster: 1	Label: 2
Document: 6	Cluster: 1	Label: 0
Document: 7	Cluster: 2	Label: 1
Document: 8	Cluster: 1	Label: 2
Document: 9	Cluster: 1	Label: 2
Document: 10	Cluster: 2	Label: 2
Document: 11	Cluster: 1	Label: 1
Document: 12	Cluster: 2	Label: 2
Document: 13	Cluster: 1	Label: 0
Document: 14	Cluster: 1	Label: 1
Document: 15	Cluster: 1	Label: 2
Document: 16	Cluster: 1	Label: 0
Document: 17	Cluster: 1	Label: 0
Document: 18	Cluster: 1	Label: 2
Document: 19	Cluster: 1	Label: 1
Document: 20	Cluster: 1	Label: 2
Document: 21	Cluster: 1	Label: 0
Document: 22	Cluster: 2	Label: 0
Document: 23	Cluster: 1	Label: 2
Document: 24	Cluster: 1	Label: 0
Document: 25	Cluster: 2	Label: 1
Document: 26	Cluster: 1	Label: 2
Document: 27	Cluster: 2	Label: 1
Document: 28	Cluster: 1	Label: 0
Document: 29	Cluster: 1	Label: 1
Document: 30	Cluster: 1	La

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

# 定义测试集1
categories1 = ['comp.graphics', 'rec.sport.baseball', 'sci.med']
data1 = fetch_20newsgroups(categories=categories1, subset='all', shuffle=True, random_state=42)

# 定义测试集2
categories2 = ['talk.politics.misc', 'rec.autos', 'sci.space']
data2 = fetch_20newsgroups(categories=categories2, subset='all', shuffle=True, random_state=42)

# 提取特征向量
vectorizer = TfidfVectorizer()
X1 = vectorizer.fit_transform(data1.data)
X2 = vectorizer.transform(data2.data)

# K-means聚类
k = len(data1.target_names)  # 类别数
kmeans1 = KMeans(n_clusters=k, random_state=42)
kmeans2 = KMeans(n_clusters=k, random_state=42)
kmeans1.fit(X1)
kmeans2.fit(X2)

# 预测聚类标签
labels1 = kmeans1.labels_
labels2 = kmeans2.labels_

# 显示聚类结果
print("测试集1聚类结果：")
for i in range(k):
    cluster_docs_indices = [j for j in range(len(data1.data)) if labels1[j] == i]
    print(f"聚类 {i}:")
    print("前5个文档序号：", cluster_docs_indices[:5])  # 打印每个聚类簇中的前5个文档序号
    print()

print("测试集2聚类结果：")
for i in range(k):
    cluster_docs_indices = [j for j in range(len(data2.data)) if labels2[j] == i]
    print(f"聚类 {i}:")
    print("前5个文档序号：", cluster_docs_indices[:5])  # 打印每个聚类簇中的前5个文档序号
    print()

# 计算聚类准确度
accuracy1 = accuracy_score(data1.target, labels1)
accuracy2 = accuracy_score(data2.target, labels2)

# 打印聚类准确度
print("测试集1聚类准确度：", accuracy1)
print("测试集2聚类准确度：", accuracy2)


测试集1聚类结果：
聚类 0:
前5个文档序号： [0, 3, 31, 37, 70]

聚类 1:
前5个文档序号： [2, 4, 5, 7, 8]

聚类 2:
前5个文档序号： [1, 6, 9, 11, 21]

测试集2聚类结果：
聚类 0:
前5个文档序号： [0, 2, 5, 7, 8]

聚类 1:
前5个文档序号： [1, 4, 10, 11, 12]

聚类 2:
前5个文档序号： [3, 6, 17, 28, 34]

测试集1聚类准确度： 0.30503889076766993
测试集2聚类准确度： 0.24091569767441862
