In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import datetime as dt

In [16]:
# Fetch the NLTK resources this notebook needs.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/notyoursmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/notyoursmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/notyoursmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Load the data.
# Only the first 10,000 rows are analysed, so let the parser stop there
# (nrows) instead of reading the entire CSV and discarding the rest.
# `data` is then an independent frame rather than a head() slice of a
# larger one, which the later cells modify in place.
data = pd.read_csv('abcnews-date-text.csv', nrows=10000)

In [18]:
# Preprocessing
# Drop rows whose headline text repeats an earlier row.
data = data.drop_duplicates(subset='headline_text')

In [19]:
# Parse the YYYYMMDD publish date once, then derive the year and month columns from it.
parsed_dates = pd.to_datetime(data['publish_date'], format='%Y%m%d')
data['publish_date'] = parsed_dates
data['year'] = parsed_dates.dt.year
data['month'] = parsed_dates.dt.month

In [20]:
# Shared helpers for text cleaning: a WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [21]:
def preprocess_text(text):
    """Tokenize a headline, drop English stop words and lemmatize the rest.

    Args:
        text: Headline string to clean.

    Returns:
        The remaining tokens, lemmatized and joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    # NLTK's stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words ("The", "And") slip through the filter.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in stop_words
    ]
    return ' '.join(kept)

# Replace each headline with its cleaned form.
data['headline_text'] = data['headline_text'].apply(preprocess_text)

In [22]:
# Text features: TF-IDF representation of the cleaned headlines
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
X_text = vectorizer.fit_transform(data['headline_text'])

# Time features: year and month rescaled with MinMaxScaler
scaler = MinMaxScaler()
time_columns = data[['year', 'month']]
X_time = scaler.fit_transform(time_columns)

# Densify the TF-IDF matrix and append the time columns on the right
X = np.concatenate((X_text.toarray(), X_time), axis=1)

In [23]:
# Choose the number of clusters with the highest silhouette score.
best_k = 0
best_score = -1  # lowest possible silhouette value, so the first k tried normally wins
for k in range(2, 11):
    # n_init is pinned explicitly: relying on the default raises a
    # FutureWarning on scikit-learn 1.2/1.3, and the default changed from 10
    # to 'auto' in 1.4, which would make the scores depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    score = silhouette_score(X, kmeans.labels_)
    if score > best_score:
        best_score = score
        best_k = k
    print(f'K: {k}, Score: {score}')



K: 2, Score: 0.06955977366975778




K: 3, Score: 0.05851680420064857




K: 4, Score: 0.027116269843513962




K: 5, Score: 0.026864440143792138




K: 6, Score: 0.02740730493174471




K: 7, Score: 0.028107710189636956




K: 8, Score: 0.02840465920427161




K: 9, Score: 0.027553172391741574




K: 10, Score: 0.02594783368829999


In [24]:
# Final K-means fit with the selected number of clusters.
# n_init is pinned explicitly because scikit-learn's default changed from 10
# to 'auto' in 1.4 (with a FutureWarning in 1.2/1.3); leaving it implicit
# makes the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Attach each row's cluster label to the frame.
data['cluster'] = clusters

# Print up to ten example headlines for every cluster.
for i in range(best_k):
    print(f'Cluster {i}:')
    # Single .loc selection instead of chained boolean-then-column indexing.
    print(data.loc[data['cluster'] == i, 'headline_text'].head(10))
    print('\n')



Cluster 0:
2180        30 million landmines destroyed worldwide
2181       adelaide international film festival kick
2182                    airpark planned port douglas
2183              alinghi march delayed postponement
2184                     alinghi poised historic win
2185                   flight grounded tokyo airport
2186    amnesty international accuses solomon police
2187          angler asked fish different type catch
2188         anglican church representative meet nth
2189       annan warns cyrus one last chance restore
Name: headline_text, dtype: object


Cluster 1:
0          aba decides community broadcasting licence
1              act fire witness must aware defamation
2             g call infrastructure protection summit
3                   air nz staff aust strike pay rise
4           air nz strike affect australian traveller
5                    ambitious olsson win triple jump
6               antic delighted record breaking barca
7    aussie qualifier stosur wast