In [3]:
import os
import jieba
import pandas

In [4]:
train_data_dirpath = 'nlp_news_datasets/chinese_news_trans'
test_data_dirpath = 'nlp_news_datasets/chinese_news_test'

In [5]:
def readfile_flow(target_dirpath):
    rows = []
    for dirpath, dirnames, filenames in os.walk(target_dirpath):
        # print(dirpath, dirnames, filenames)
        for filename in filenames:
            if filename == '_DS_Store':
                continue
            category = dirpath.split('/')[-1]
            fp = os.path.join(dirpath, filename)
            with open(fp, 'r', encoding='utf-8') as f:
                content = f.read()
            content = ' '.join(jieba.cut(content))
            content = content.replace('\r', '').replace('\n', '')
            rows.append([category, content])
    return pandas.DataFrame(rows, columns=['category', 'content'])

In [6]:
train_df = readfile_flow(train_data_dirpath)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.857 seconds.
Prefix dict has been built succesfully.


In [34]:
train_df['category'] = train_df['category'].astype('category')
category_list = train_df['category'].astype('category').cat.categories.tolist()
category_map = {data: i for i, data in enumerate(category_list)}
train_df['category'] = train_df['category'].replace(category_map)
print(category_map)
train_df.head()

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}


Unnamed: 0,category,content
0,5,【 日 期 】 19960516 【 版 號 】 11 ...
1,5,（ 兼 晚報 ） 我國 郵集 首次 在 世界 郵...
2,5,121 . 大型 民族 音樂劇 《 劉胡蘭 的 故事 》 時間...
3,5,11 . 濟 南市 收藏 展 收藏 ， 具有 較 高 的 文化 品位 與 內涵 ...
4,5,全國 部分 革命 老區 攝 影展 今天 在 京 ...


In [31]:
category_series = train_df['category'].astype('category')
print(category_series.cat.categories)
print(category_series.cat.codes.head())

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')
0    5
1    5
2    5
3    5
4    5
dtype: int8


In [32]:
test_df = readfile_flow(test_data_dirpath)

In [33]:
test_df['category'] = test_df['category'].astype('category')
test_df['category'] = test_df['category'].replace(category_map)
test_df.head()

Unnamed: 0,category,content
0,5,2000 - 02 - 20 大型 綜藝 晚會 浪費 現象 透視 ...
1,5,文化 事業 健康 繁榮 “ 九五 ” 期間 ， 我...
2,5,臧天朔 （ 相關 作品 ） 姓名 ： 臧天朔 1964 年 3 月...
3,5,- - - - - - - - - - - - - - - - - - - - - - - ...
4,5,宣武 區 “ 十五 ” 期間 文化 事業 發展 計劃 一 、 “ 九五 ” 時期...


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
vec = TfidfVectorizer()

In [46]:
ttt_bag = vec.fit(train_df['content'])
ttt_bag

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [37]:
bag = vec.fit_transform(train_df['content'])

In [38]:
print("總共維度:", len(vec.get_feature_names()))

總共維度: 99364


In [39]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
clf = MultinomialNB(alpha=0.001)
clf.fit(bag, train_df['category'])

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)

In [41]:
test_bag = vec.transform(test_df['content'])

In [42]:
predict = clf.predict(test_bag)
predict

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6])

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
print("預測:", list(predict))
print("正確標籤:", list(test_df['category']))
print("Naive-Bayes 正確率: ", accuracy_score(test_df['category'], predict) * 100, "%")

預測: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
正確標籤: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
Naive-Bayes 正確率:  100.0 %
