In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import os
import jieba
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
import numpy as np
from sklearn.cluster import KMeans

In [None]:
stop_words_path = "../../stop_words.txt"
# stop_words_path = "../../old_stop_words.txt"
# Read the whole stop-word file; the `with` block closes the handle even if
# reading fails part-way.
# NOTE(review): the file is opened with the platform default encoding, exactly
# as before — confirm the file's real encoding and pass `encoding=` explicitly
# if the notebook must run on machines with a different default.
with open(stop_words_path, 'r') as stop_words_f:
    stop_words_content = stop_words_f.read()
# Convert the stop-word table into a list, one entry per line.
stop_words_list = stop_words_content.splitlines()

In [None]:
folder = "../../news/new_process"
corpus = []
# distribution[k] counts the accepted articles whose file name starts with the
# digit k + 1 (six classes, digits 1-6).
distribution = [0] * 6
# Read every article in `folder` (in sorted file-name order), segment it with
# jieba and store it as one string of tokens separated by single spaces.
for file in sorted(os.listdir(folder)):
    filepath = os.path.join(folder, file)
    try:
        # The first character of the file name encodes the 1-based class id.
        file_class = int(file[0]) - 1
        # Reject ids outside 1..6 explicitly: a leading "0" would otherwise
        # become index -1 and be silently counted as the last class.
        if not 0 <= file_class < len(distribution):
            continue
        # `with` closes the handle even when reading/decoding fails.
        with open(filepath, 'r') as f:
            article = f.read()
    except (ValueError, OSError):
        # ValueError: non-digit prefix or undecodable bytes;
        # OSError: unreadable entry (e.g. a sub-directory). Skip the file,
        # but let anything else (including KeyboardInterrupt) propagate.
        continue
    segment_list = jieba.cut(article, cut_all=False)
    splitted = " ".join(segment_list)
    distribution[file_class] += 1
    corpus.append(splitted)

In [None]:
"""
Build TF-IDF using CountVectorizer, splitting words by spaces
"""
# Step 1: bag-of-words vectorizer. Each run of word characters is one token;
# tokens listed in `stop_words_list` are dropped.
vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b', stop_words=stop_words_list)
# Step 2: learn the vocabulary and build the document-term count matrix
# in a single pass over the corpus.
X = vectorizer.fit_transform(corpus)
print(X.shape)
# Step 3: TF-IDF weighting with the default settings.
# tfidf_transformer = TfidfTransformer(norm='l1')
tfidf_transformer = TfidfTransformer()
# Step 4: feed the sparse counts directly. The transformer hands back a sparse
# matrix either way, so densifying X first only cost memory.
tfidf = tfidf_transformer.fit_transform(X)
print(tfidf.shape)
# Optional: inspect the learned IDF weight of every vocabulary term.
# for idx, word in enumerate(vectorizer.get_feature_names()):
#     print("{}\t{}".format(word, tfidf_transformer.idf_[idx]))
# Step 5: dense copy of the TF-IDF matrix for code that needs a plain array.
ndarray = tfidf.toarray()
print(ndarray.shape)

## K-means

In [None]:
# Hard-coded per-class document counts, converted in place into cumulative
# totals: afterwards distribution[k] is the number of documents in classes
# 0..k, i.e. the exclusive end index of class k when documents are ordered
# by class.
# NOTE(review): this overwrites the `distribution` counted while loading the
# corpus — presumably these are the same numbers; confirm they still match.
distribution = [1875, 1498, 1944, 1595, 1786, 1944]
# Start at 1: at i == 0, distribution[i-1] would wrap around to the LAST
# element and add it to every cumulative total.
for i in range(1, len(distribution)):
    distribution[i] += distribution[i-1]

In [None]:
def getGroup(x):
    """Map a 0-based document index to its 0-based class index.

    Reads the module-level `distribution`, interpreted as cumulative class
    sizes: entry k is the exclusive end index of class k, so class k covers
    the indices distribution[k-1] <= x < distribution[k].

    Parameters:
        x: position of the document in the corpus.

    Returns:
        The class index, which equals the number of class boundaries that
        `x` has reached. The last boundary is ignored so that any index at or
        past the total is clamped to the last class; 0 is returned when
        `distribution` is empty.
    """
    # Each boundary reached moves the document one class further. Counting
    # with `>=` (not `>`) keeps the first document of a class in that class.
    return sum(x >= end for end in distribution[:-1])

In [None]:
# Run K-means for several cluster counts and report, for each run, the cluster
# sizes, the size-weighted purity and the SSE (inertia).
# NOTE(review): `data` is not defined in any cell visible here — it has to be
# the document-feature matrix (rows in corpus order); confirm where it comes
# from before running on a fresh kernel.
n_clusters = [6, 10, 15, 20, 25, 30]
for n in n_clusters:
    # `n_jobs` is no longer passed: scikit-learn deprecated it for KMeans in
    # 0.23 and removed it in 1.0, where supplying it raises TypeError.
    kmeans = KMeans(n_clusters=n, random_state=34312, n_init=10).fit(data)
    labels = kmeans.labels_
    # dic[cluster][group] = number of documents of class `group` that were
    # assigned to `cluster`.
    dic = {c: {g: 0 for g in range(len(distribution))} for c in range(n)}
    for i, label in enumerate(labels):
        dic[label][getGroup(i)] += 1
    # size[cluster] = number of documents in the cluster.
    size = {c: sum(counts.values()) for c, counts in dic.items()}
    # percents[cluster] = share of the cluster held by its dominant class;
    # -1 marks an empty cluster (it contributes 0 below because its size is 0).
    percents = {
        c: (max(dic[c].values()) / size[c] if size[c] != 0 else -1)
        for c in dic
    }
    print("------Num of Clusters: {}-------".format(n))
    sum_weighted_purity = 0
    for k in percents:
        sum_weighted_purity += percents[k] * size[k]
    weighted_purity = sum_weighted_purity / len(labels)
    print("------Cluster Distribution------")
    print(size)
    print("------Weighted Purity: {}-------".format(weighted_purity))
    print("------SSE: {}-------".format(kmeans.inertia_))