In [1]:
import os
import glob
import pandas as pd

#打包後，只需要丟入檔案路徑dn即可
def get_data(dn):
    data = {
        "news":[],
        "ans":[]    
    }
    # 用glob模組找尋檔案路徑
    dirs = glob.glob(os.path.join(dn,"*"))
    for d in dirs:
        # 找到底下所有副檔名為txt檔
        # 如果是mac的話，則用".[tT][xX][tT]"找尋
        # 因為windows會不分大小寫自動搜尋，也不會去找隱藏檔
        pat = os.path.join(d, "*.txt")
        fs = glob.glob(pat)
        for fp in fs:
            with open(fp, "r", encoding="utf-8") as f:
                news = f.read()
            ans = os.path.split(d)[-1] # 切割完後的路徑，取最後檔案名稱
            data["news"].append(news)
            data["ans"].append(ans)
    df = pd.DataFrame(data, columns=["news", "ans"])
    return df

In [2]:
# 丟入dn測試抓不抓得到資料
train_df = get_data("chinese_news_trans")
test_df = get_data("chinese_news_test")

In [3]:
# 準備答案，轉化成數字編號
# unique 排序
# value_counts 算次數

# kind = train_df["ans"].value_counts()
kind = train_df["ans"].unique()

# enumerate 加上編號
trains = {k:i for i,k in enumerate(kind)}
reverse_trains = {i:k for i,k in enumerate(kind)}

In [4]:
y_train = train_df["ans"].replace(trains)
y_test = test_df["ans"].replace(trains)

In [5]:
y_test

0      0
1      0
2      0
3      0
4      0
      ..
96     9
97     9
98     9
99     9
100    9
Name: ans, Length: 101, dtype: int64

In [6]:
import os
from urllib.request import urlretrieve
DICT_PATH = "dict.txt.big"
if not os.path.exists(DICT_PATH):
    print("字典不存在")
    url = "https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big"
    urlretrieve(url, DICT_PATH)

In [7]:
import jieba
# apple
def newscut(news):
    # 對每個斷詞都加上空格
    s = " ".join(jieba.cut(news))
    return s.replace("\r", "").replace("\n", "")
x_train_raw = train_df["news"].apply(newscut)
X_test_raw = test_df["news"].apply(newscut)
# print(train)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 0.693 seconds.
Prefix dict has been built successfully.


In [8]:
# 計算次數
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
x_train = vec.fit_transform(x_train_raw)
x_test = vec.transform(X_test_raw)

In [9]:
# 語言請用單純貝氏
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# 預測正確率
from sklearn.metrics import accuracy_score
pre = clf.predict(x_test)
accuracy_score(pre, y_test)

1.0

In [11]:
from sklearn.metrics import  confusion_matrix
mat = confusion_matrix(y_test, pre)
col = ["{}(預測)".format(s) for s in kind]
row = ["{}(正確)".format(s) for s in kind]
pd.DataFrame(mat, columns=col, index=row)

Unnamed: 0,交通(預測),政治(預測),教育(預測),環境(預測),經濟(預測),藝術(預測),計算機(預測),軍事(預測),醫藥(預測),體育(預測)
交通(正確),10,0,0,0,0,0,0,0,0,0
政治(正確),0,11,0,0,0,0,0,0,0,0
教育(正確),0,0,10,0,0,0,0,0,0,0
環境(正確),0,0,0,10,0,0,0,0,0,0
經濟(正確),0,0,0,0,10,0,0,0,0,0
藝術(正確),0,0,0,0,0,10,0,0,0,0
計算機(正確),0,0,0,0,0,0,10,0,0,0
軍事(正確),0,0,0,0,0,0,0,10,0,0
醫藥(正確),0,0,0,0,0,0,0,0,10,0
體育(正確),0,0,0,0,0,0,0,0,0,10


In [22]:
n = input("請輸入新聞:")
test = vec.transform([newscut(n)])
# print(test)
pre = clf.predict(test)[0]
# print(pre)
ans = reverse_trains[pre]
print("")
print("應該是:", ans)
print("")
proba = clf.predict_proba(test)[0]
# print(proba)
proba = list(enumerate(proba))
for i,prob in sorted(proba, key=lambda x:x[1], reverse=True):
    print(reverse_trains[i], ":", round(prob*100, 2), "%")

請輸入新聞:烏克蘭車諾比核電廠附近的禁區於本月4日發生森林大火，至今已延燒11天，有知情人士透露，這場大火距核電廠與放射性廢料儲藏區僅剩2公里。對此，烏克蘭國家緊急服務處表示，雖然控制火勢的難度很大，但禁區內的關鍵地點目前還沒有受到威脅。不過有綠色和平組織成員指出，這場大火的嚴重性超過了官方估計。

應該是: 軍事

軍事 : 96.97 %
經濟 : 1.8 %
環境 : 1.22 %
體育 : 0.01 %
政治 : 0.0 %
計算機 : 0.0 %
交通 : 0.0 %
藝術 : 0.0 %
教育 : 0.0 %
醫藥 : 0.0 %


In [None]:
list(zip([1,2,3],[4,5,3]))

In [None]:
sorted([(3,"a"),(1,"b"),(5,"c")])