In [53]:
import json
import random
import re

import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the user dictionary into jieba up front, so it is in place before any
# later cell calls the segmenter.
jieba.load_userdict('./mydict/mydict.txt')

In [54]:
# Read the source text in 1024-character chunks.
with open('./file/test.txt', 'r', encoding="utf-8") as fileLine:
    fileAllLines = list(iter(lambda: fileLine.read(1024), ''))

# Re-assemble the chunks with an EMPTY separator. The chunk boundaries are
# arbitrary, so joining with a space would inject a spurious blank every 1024
# characters and split any word that happens to straddle a boundary.
new_line = ''.join(fileAllLines)


# Read the stop words (one entry per line).
with open(file='./mydict/stop_words.txt', mode='r', encoding='utf-8') as file:
    stop_words = file.read().split('\n')

# The text contains many line breaks; register them as stop words so that
# they are filtered out together with the regular stop words.
stop_words.append('\n')
stop_words.append('\n\n')
#stop_words




In [55]:
# Matches every character outside the range U+4E00-U+9FA5, i.e. anything that
# is not a Chinese ideograph in that range (Latin letters, digits, symbols,
# whitespace, ...).
rule = re.compile(r"[^\u4e00-\u9fa5]")

# A set gives O(1) membership tests; the stop-word list is scanned once here
# instead of once per token.
stop_word_set = set(stop_words)

# Segment the text, then clean token by token: strip the non-Chinese
# characters matched by `rule`, and keep the token only if something is left
# and neither the raw nor the cleaned form is a stop word.
# Cleaning happens AFTER segmentation so that jieba still sees the original
# punctuation as word boundaries.
seg_words_list = jieba.lcut(new_line)
seg_stop_words_list = []
for term in seg_words_list:
    cleaned_term = rule.sub('', term)
    if not cleaned_term:
        continue
    if term in stop_word_set or cleaned_term in stop_word_set:
        continue
    seg_stop_words_list.append(cleaned_term)
#seg_stop_words_list

In [56]:
class KMeans:
    """K-means clustering with K-means++ seeding, implemented on NumPy arrays.

    The class holds no state: every method works only on its arguments.
    Randomness comes from the standard-library ``random`` module, so call
    ``random.seed(...)`` beforehand for reproducible results.
    """

    def cal_dist(self, p0, p1):
        """Return the Euclidean distance between two points.

        Args:
            p0: NumPy array holding the first point.
            p1: NumPy array holding the second point (same shape as ``p0``).

        Returns:
            ``sqrt(sum((p0 - p1) ** 2))`` as a NumPy scalar.
        """
        return np.sqrt(np.sum((p0 - p1) ** 2))

    def nearest_cluster_center(self, point, cluster_centers):
        """Return the distance from ``point`` to its closest cluster center.

        Note that, despite the method name, the *distance* is returned, not
        the center itself.

        Args:
            point: NumPy array holding one data point.
            cluster_centers: 2-D NumPy array with one center per row.

        Returns:
            The smallest Euclidean distance between ``point`` and any row of
            ``cluster_centers``; ``inf`` if ``cluster_centers`` has no rows.
        """
        min_dist = float("inf")
        for center in cluster_centers:
            d = self.cal_dist(point, center)
            if d < min_dist:
                min_dist = d
        return min_dist

    def get_centroids(self, datapoints, k):
        """Pick initial cluster centers with the K-means++ seeding scheme.

        The first center is a uniformly random data point. Every further
        center is drawn with probability proportional to the squared distance
        between a point and its nearest already-chosen center.

        Args:
            datapoints: 2-D NumPy array, one data point per row.
            k: Number of centers wanted. Values below 2 yield a single center.

        Returns:
            2-D NumPy array whose rows are the chosen data points.

        Raises:
            ValueError: If ``datapoints`` is empty (raised by
                ``random.randrange``).
        """
        n_points = len(datapoints)
        # Index explicitly instead of random.choice(): some Python versions
        # evaluate the truthiness of the sequence there, which raises for a
        # multi-element NumPy array.
        clusters = np.array([datapoints[random.randrange(n_points)]])
        weights = np.zeros(n_points)

        for _ in range(k - 1):
            # K-means++ weights each point by D(x)^2, the squared distance to
            # its nearest existing center.
            for j, point in enumerate(datapoints):
                weights[j] = self.nearest_cluster_center(point, clusters) ** 2

            # Roulette-wheel selection over the weights.
            threshold = weights.sum() * random.random()
            # Fallback: if rounding keeps the running total above zero, use
            # the last point so that exactly one center is added per round.
            chosen = n_points - 1
            for j, weight in enumerate(weights):
                threshold -= weight
                if threshold <= 0:
                    chosen = j
                    break
            clusters = np.append(clusters, [datapoints[chosen]], axis=0)

        return clusters

    def kmeans_plus_plus(self, datapoints, k=2, max_iterations=100):
        """Cluster ``datapoints`` with K-means, seeded by K-means++.

        Iterates assignment and center-update steps until the assignment no
        longer changes or ``max_iterations`` rounds have been performed.

        Args:
            datapoints: 2-D array-like, one data point per row.
            k: Number of clusters.
            max_iterations: Upper bound on the number of assignment/update
                rounds.

        Returns:
            1-D float NumPy array with the cluster index of every data point
            (cast with ``int(...)`` before using a label as an index).
        """
        datapoints = np.asarray(datapoints)
        n_points = datapoints.shape[0]

        cluster = np.zeros(n_points)
        # Start different from `cluster` so the first round always runs.
        prev_cluster = np.ones(n_points)

        # Work on float centers: with integer input the mean updates would
        # otherwise be truncated when written back.
        cluster_centers = self.get_centroids(datapoints, k).astype(float)

        iteration = 0
        # Stop on convergence OR when the iteration budget is used up.
        while iteration < max_iterations and not np.array_equal(cluster, prev_cluster):
            iteration += 1
            prev_cluster = cluster

            # Assignment step: distance of every point to every center,
            # computed one center at a time to keep temporaries small.
            distances = np.stack(
                [
                    np.sqrt(np.sum((datapoints - center) ** 2, axis=1))
                    for center in cluster_centers
                ],
                axis=1,
            )
            # argmin returns the first minimum, so ties go to the lower index.
            cluster = np.argmin(distances, axis=1).astype(float)

            # Update step: move each center to the mean of its members.
            for center_idx in range(len(cluster_centers)):
                members = datapoints[cluster == center_idx]
                # An empty cluster keeps its previous center instead of
                # collapsing to the origin.
                if len(members) > 0:
                    cluster_centers[center_idx] = members.mean(axis=0)

        return cluster

In [57]:

# Vectorize with TF-IDF. Every segmented token is passed as its own document,
# so row i of `tfidf` corresponds to seg_stop_words_list[i].
# The vectorizer's default token_pattern only keeps tokens of two or more
# word characters, which would turn every single-character Chinese word into
# an all-zero row; accept tokens of one or more characters instead.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf = tfidf_vectorizer.fit_transform(seg_stop_words_list)
tfidf = tfidf.toarray()

In [63]:
# Seed the stdlib RNG used by KMeans so that "Restart & Run All" reproduces
# the same clusters.
random.seed(42)

k = 2
Kmeans_cluster = KMeans()
speech_cluster_result = Kmeans_cluster.kmeans_plus_plus(tfidf, k)

# Group the tokens by the cluster label assigned to their TF-IDF row.
cluster = [[] for _ in range(k)]
for idx, c in enumerate(speech_cluster_result):
    # Labels come back as floats; cast before using them as list indices.
    cluster[int(c)].append(seg_stop_words_list[idx])

# Print a preview of each cluster, truncated to 200 characters per line
# (the "Cluster N: " prefix counts towards the limit).
for c, result in enumerate(cluster):
    print('Cluster {}: {}'.format(c, ' '.join(result))[0:200])

Cluster 0: 第一次 發文 手機 排版 請見諒 患有 憂鬱症 存在 感到 餘 不知道 活著 意義是 每天 行屍 走 肉 活著 不斷 嘲笑 發音 問題 長 大些 嘲笑 長 相 想 結束 站 陽台 吹 吹 風 不行 認為 想 跳下去 然後就 抓 回來 打算 站 陽台 圍牆 想 跳下去 是否 死 想到 會嚇 許多人 作罷 話 想 吞藥來 結束 心態 扭曲 造成 可逆 心理 傷害 憂鬱症 恐慌 症
Cluster 1: 我四歲
