# 导入数据

In [1]:
from sklearn.datasets import fetch_20newsgroups # 导入sklearn package
news = fetch_20newsgroups(subset="all")

# 划分训练集与测试集

In [2]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25) # 划分训练集与测试集

# 数据预处理
创建wordlist
* 全部转化为小写
* 去掉标点符号
* 去掉换行符
* 将数据以空格划分

In [3]:
import string
from collections import Counter
table = str.maketrans(dict.fromkeys(string.punctuation))

word_set = set()
counter_per_cat = [Counter() for i in range(len(news.target_names))]  # 创建文章类别个数的Counter()，计算每个单词在所有文档类别出现的频率 
for sample_id, sample in enumerate(x_train):
    sample_label = y_train[sample_id]
    words = str(sample).lower().translate(table).strip().split()#lower()转化为小写 translate()去掉标点符号 strip()去掉换行符 split()将数据以空格划分
    counter_per_cat[sample_label].update(words)
    word_set.update(words)
word_list = list(word_set)

# Classifier learning
* prob_mat: 类条件概率
    * size of[label_num（类别数）, word_num（Vocabulary大小）]
    * 类条件概率
* Get expected statistics from the training data
    * total_freq : total word frequency in articles with certain label（具有特定标签的文章的总词频）
    * label_prob : the class prior probabilities of training data（先验概率）
    * empty_prob : for word doesn’t appear in training data（处理没有出现在训练数据中的词）
    * prob_mat : probability of each word appears in each article label
* Use these statistics to classify articles in the test data

In [4]:
total_freq = [] # 每类文章的总词频：nj
label_prob = [] # 先验概率
empty_prob = [] # 没有出现在训练数据中的词
prior_prob = Counter(y_train)
for label_id, label in enumerate(news.target_names):
    total_freq.append(sum(counter_per_cat[label_id].values())) # 每类文章的总词频：nj
    label_prob.append(prior_prob[label_id]/len(y_train)) # 先验概率
    empty_prob.append((1)/(total_freq[label_id] + len(word_list))) # 没有出现在训练数据中的词

import numpy as np
prob_mat = np.zeros((len(news.target_names), len(word_set))) # 类条件概率
for label_id, label in enumerate(news.target_names):
    freq_list = counter_per_cat[label_id]
    for word_id, word in enumerate(word_set):
        if word not in freq_list:
            freq_label = 0
        else:
            freq_label = freq_list[word]
        freq_all = total_freq[label_id]
        prob = (freq_label + 1) / (freq_all + len(word_set)) # 计算类条件概率
        prob_mat[label_id, word_id] = prob

# News article classification
* The steps for testing:
    * Tokenize the articles in test data (数据预处理)
    * Calculate the probability of the article belongs to each label（计算文章属于每一类的概率）
    * Classify it with the highest one（将文章分类为上面计算概率最大的类）
* Issue:
    * To avoid the product of probabilities getting too close to0, we use log likelihood equation: Convert product to addition

In [None]:
import math
from tqdm import tqdm
label_prob = np.log(np.array(label_prob))
empty_prob = np.log(np.array(empty_prob))
prob_mat = np.log(np.array(prob_mat))
predict_labels = np.zeros(len(x_test), dtype = int)
for sample_id, sample in tqdm(enumerate(x_test), total = len(x_test)):
    probs = np.zeros(len(news.target_names))
    words = str(sample).lower().translate(table).strip().split() # Tokenize
    sample_len = len(words)
    word_freq = Counter(words)
    words = list(set(words).intersection(set(word_list))) # 返回words和word_list的交集
    for label_id, label in enumerate(news.target_names): # Calculate the probality for each category
        prob_label = label_prob[label_id]
        len_a = 0
        for word in words:
            word_id = word_list.index(word)
            prob_cur = prob_mat[label_id, word_id]
            prob_label += word_freq[word] * (prob_cur)
            len_a += word_freq[word]
        len_b = sample_len - len_a
        if len_b > 0:
            prob_label += len_b * (empty_prob[label_id])
        probs[label_id] = prob_label
    predict_label = np.argmax(probs) # Category with the highest probality
    predict_labels[sample_id] = predict_label

  5%|███▉                                                                         | 238/4712 [21:04<8:37:40,  6.94s/it]

# Results
* Calculate the accuracy of the prediction among all of the categories

In [None]:
# accuracy of the prediction among all of the categories
comp = predict_labels - np.array(y_test)
accuracy = (len(np.where(comp == 0)[0])) / len(x_test)
print(accuracy)

# Detailed accuracy
y_test = np.array(y_test)
for i in range(len(news.target_names)):
    cat_data_num = (np.where(y_test == i))[0].shape[0]
    pred_cat_tmp = predict_labels[np.where(y_test == i)]
    pred_correct = np.where(pred_cat_tmp == i)[0].shape[0]
    accuracy_per_cat = pred_correct/cat_data_num
    print(news.target_names[i])
    print("total data number of this category: " + str(cat_data_num))
    print("number of correctly prediction: " + str(pred_correct))
    print("predicition accuracy for this category: " + str(accuracy_per_cat))