<a href="https://colab.research.google.com/github/Wang-Chiawei/Wang-Chiawei.github.io/blob/main/MaintainCategory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import pandas as pd
import jieba
import jieba.posseg as pseg
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [61]:
# Point jieba at the Traditional Chinese dictionary file, then load the custom
# user dictionary on top of it. Both paths are relative, so the files must be
# present in the kernel's working directory.
# NOTE(review): 'maintain_dict.txt' presumably holds maintenance-domain terms — confirm.
jieba.set_dictionary('dict.txt.big')
jieba.load_userdict('maintain_dict.txt')


Building prefix dict from /content/dict.txt.big ...
DEBUG:jieba:Building prefix dict from /content/dict.txt.big ...
Loading model from cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
Loading model cost 1.325 seconds.
DEBUG:jieba:Loading model cost 1.325 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [62]:
def load_stopwords(path):
    """Read a stop-word list from a text file, one word per line.

    Args:
        path: Path to a UTF-8 encoded text file (with or without a BOM).

    Returns:
        set[str]: The distinct, whitespace-stripped, non-empty lines.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise stay glued to the first stop word and keep it from
    # ever matching a token.
    with open(path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines would otherwise add '' to the set; skip them.
        return {word for word in stripped_lines if word}

In [63]:
def preprocess_chinese(text, stopwords):
    """Reduce a phrase to its verbs using jieba part-of-speech tagging.

    Args:
        text: The phrase to process. Missing values (None / NaN) and
            non-string values such as numbers are accepted.
        stopwords: Collection of words to drop even when they are tagged
            as verbs.

    Returns:
        str: The kept verbs joined by single spaces. If no verb survives the
        filter, the input text itself (as a string) is returned instead.
        Missing input yields ''.
    """
    # Empty spreadsheet cells arrive as NaN/None rather than str, and the
    # segmenter cannot handle them; treat them as an empty phrase.
    if pd.isna(text):
        return ''
    # Numeric cells and the like: segment their textual form.
    if not isinstance(text, str):
        text = str(text)

    verbs = [
        word
        for word, flag in pseg.cut(text)
        # Tags beginning with 'v' are treated as verbs.
        if flag.startswith('v') and word not in stopwords
    ]
    # Fall back to the whole phrase so the row is not left without content.
    return ' '.join(verbs) if verbs else text

In [64]:
def load_data(filepath, sheet_name="離子植入機_動作", usecols=("用詞",)):
    """Load the phrase column(s) from one sheet of an Excel workbook.

    Args:
        filepath: Path to the workbook.
        sheet_name: Worksheet to read. Defaults to "離子植入機_動作".
        usecols: Columns to keep, forwarded to ``pd.read_excel``. Defaults to
            the single column "用詞".

    Returns:
        The DataFrame produced by ``pd.read_excel`` for that sheet and
        column selection.
    """
    # The default is a tuple so it cannot be mutated between calls; it is
    # turned into a list before being handed to read_excel. Any other value
    # the caller supplies is forwarded untouched.
    columns = list(usecols) if isinstance(usecols, tuple) else usecols
    return pd.read_excel(filepath, sheet_name=sheet_name, usecols=columns)


In [65]:
def vectorize_data(data, stopwords, token_pattern=r"(?u)\b\w+\b"):
    """Extract verbs from each phrase and build a TF-IDF matrix from them.

    Side effect: adds (or overwrites) the column '提取動詞' on ``data`` with the
    output of ``preprocess_chinese`` for every row, so the extracted text is
    kept alongside the original phrase.

    Args:
        data: DataFrame with a '用詞' column of phrases.
        stopwords: Stop words forwarded to ``preprocess_chinese``.
        token_pattern: Regular expression TfidfVectorizer uses to pick tokens
            out of the space-joined text. The default keeps tokens of any
            length.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform`` over the
        '提取動詞' column.
    """
    # Store the verbs (or the fallback original text) in a new column.
    data['提取動詞'] = data['用詞'].apply(lambda x: preprocess_chinese(x, stopwords))
    # scikit-learn's default token_pattern only matches tokens of two or more
    # characters, which silently discards single-character Chinese verbs.
    vectorizer = TfidfVectorizer(token_pattern=token_pattern)
    tfidf_matrix = vectorizer.fit_transform(data['提取動詞'])
    return tfidf_matrix

In [66]:
def cluster_data(tfidf_matrix, num_clusters=5, random_state=42):
    """Cluster the rows of a TF-IDF matrix with K-Means.

    Args:
        tfidf_matrix: Feature matrix, one row per sample.
        num_clusters: Number of clusters to form.
        random_state: Seed for centroid initialisation, so repeated runs on
            the same input give the same labels. Pass None for an unseeded
            run.

    Returns:
        The fitted model's ``labels_``: one cluster label per row.
    """
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; pinning it keeps results stable across versions.
    km = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    km.fit(tfidf_matrix)
    return km.labels_


In [67]:
def cluster_data_with_dbscan(tfidf_matrix, eps=0.5, min_samples=5):
    """Cluster the rows of a TF-IDF matrix with DBSCAN.

    Args:
        tfidf_matrix: Feature matrix, one row per sample.
        eps: Forwarded to ``DBSCAN`` as its ``eps`` setting.
        min_samples: Forwarded to ``DBSCAN`` as its ``min_samples`` setting.

    Returns:
        Whatever ``DBSCAN.fit_predict`` returns for the matrix (the per-row
        cluster labels).
    """
    settings = {"eps": eps, "min_samples": min_samples}
    return DBSCAN(**settings).fit_predict(tfidf_matrix)


In [68]:
# End-to-end run: load inputs, vectorise the phrases, cluster them, and write
# the labelled table to a new workbook. All paths are relative to the working
# directory.
stopwords = load_stopwords('stop_words.txt')  # load the stop-word list
data = load_data("標準語結果v7_第一道處理_離子植入機_20240416.xlsx")  # read the phrases from the source workbook
tfidf_matrix = vectorize_data(data, stopwords)  # feature extraction; also adds the '提取動詞' column to `data`
# Alternative kept for reference: K-Means with a fixed number of clusters.
# data['分類結果'] = cluster_data(tfidf_matrix, num_clusters=7)
# NOTE(review): with min_samples=1 DBSCAN should never label a row as noise
# (every point qualifies as a core point on its own) — confirm this is intended.
data['分類結果'] = cluster_data_with_dbscan(tfidf_matrix, eps=0.5, min_samples=1) # DBSCAN clustering
data.to_excel("classified_data.xlsx", index=False)  # save to a new Excel file, without the index column