In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from sklearn.model_selection import train_test_split


# Set the inline backend's figure format to SVG
%config InlineBackend.figure_format = 'svg'

# Matplotlib text setup: use the Kaiti typeface for sans-serif text and
# turn off the Unicode minus sign on axes.
plt.rcParams.update({
    'font.sans-serif': ['Kaiti'],
    'axes.unicode_minus': False,
})

# Text category classification

In [None]:
# Load one CSV per news category and drop every row that has a missing value.
df_technology = pd.read_csv(r'technology_news.csv', encoding='utf-8').dropna()

df_car = pd.read_csv(r'car_news.csv', encoding='utf-8').dropna()

df_entertainment = pd.read_csv(r'entertainment_news.csv', encoding='utf-8').dropna()

df_military = pd.read_csv(r'military_news.csv', encoding='utf-8').dropna()

df_sports = pd.read_csv(r'sports_news.csv', encoding='utf-8').dropna()

In [None]:
# 导入分词库
import jieba

In [None]:
# cut_all=True selects full mode: every possible word is emitted
jieba.lcut('今天天气真不错', cut_all=True, HMM=True)

In [None]:
# Precise mode (cut_all=False is the default)
jieba.lcut('今天天气真不错', cut_all=False)

In [None]:
# Keep the precise-mode token list for the next cell
a = jieba.lcut('今天天气真不错', cut_all=False)

In [None]:
# Join the tokens with spaces and wrap the resulting string in a one-element list
[' '.join(a)]

In [None]:
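#  jieba.load_userdict('../resources/文档/data/中文停用词库.txt') # NOTE: load_userdict registers a custom dictionary (its words are kept together as single tokens); it does not load a stop-word list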

In [None]:
# Segment the `content` column of the DataFrame into words

In [None]:
# Preview the segmentation on the first few rows only: each article is split
# with jieba and its tokens are re-joined with single spaces. The result is
# only displayed, never stored, so segmenting the whole column here would be
# wasted work.
df_entertainment.content.head().apply(lambda text: ' '.join(jieba.lcut(text)))

In [None]:
# Pull the `content` column of every category out as a plain Python list
car = df_car['content'].tolist()
entertainment = df_entertainment['content'].tolist()
sports = df_sports['content'].tolist()
technology = df_technology['content'].tolist()
military = df_military['content'].tolist()

In [None]:
# Map each category label to its list of raw article texts. An explicit dict
# replaces the former eval() lookup of variables by name.
corpora = {
    'car': car,
    'entertainment': entertainment,
    'sports': sports,
    'technology': technology,
    'military': military,
}
datas = list(corpora)  # category labels, in the order listed above
samples = []
for data, texts in corpora.items():
    for text in texts:
        # Segment the article and keep only tokens longer than one character
        tokens = [token for token in jieba.lcut(text) if len(token) > 1]
        # Store the space-joined tokens together with the category label
        samples.append((" ".join(tokens), data))

In [None]:
# Shuffle the samples in place (optional step)
import random

# A dedicated, seeded generator makes the order reproducible across runs
# without touching the module-level random state.
random.Random(42).shuffle(samples)

In [None]:
# Show a handful of (text, label) pairs instead of dumping the whole corpus
samples[:5]

In [None]:
X, y = zip(*samples)  # unzip the (text, label) pairs: X receives the texts, y the labels

In [None]:
# 划分数据集
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split (and every score computed from it) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 词频特征向量

from sklearn.feature_extraction.text import CountVectorizer     # 词频矩阵

In [None]:
# Stop-word list, read as a single `stopword` column from a tab-separated file
# (presumably one word per line; verify against stopwords.txt).
stopwords_df = pd.read_csv(
    r'stopwords.txt',
    index_col=False,
    quoting=3,        # csv.QUOTE_NONE: quote characters are treated as plain text
    sep='\t',
    names=['stopword'],
    encoding='utf-8',
    dtype=str,        # every entry stays a string
    na_filter=False,  # keep words such as "null" or "NA" as text instead of NaN
)
stopwords = stopwords_df['stopword'].values.tolist()

In [None]:
# Build the bag-of-words vocabulary from the training texts with the stop
# words removed. max_features is not set, so the default (all terms) applies;
# pass e.g. max_features=6000 to cap the vocabulary.
vec = (
    CountVectorizer(stop_words=stopwords)
    .fit(raw_documents=x_train)
)


In [None]:
# Vectorise the training and test texts with the fitted CountVectorizer
x_train_, x_test_ = (vec.transform(texts) for texts in (x_train, x_test))

In [None]:
# Display the vectorised training data
x_train_

In [None]:
# 朴素贝叶斯,不同的用法不一样
from sklearn.naive_bayes import MultinomialNB   # 多项式
from sklearn.naive_bayes import BernoulliNB     # 伯努利
from sklearn.naive_bayes import GaussianNB      # 高斯
from sklearn.naive_bayes import ComplementNB    # 补集

In [None]:
# Fit a multinomial naive Bayes classifier on the vectorised training data
nb = MultinomialNB().fit(X=x_train_, y=y_train)

In [None]:
# Score of the count-based model on the training split
nb.score(X=x_train_, y=y_train)

In [None]:
# Score of the count-based model on the held-out test split
nb.score(X=x_test_, y=y_test)

In [None]:
# Try the classifier on a new sentence

In [None]:

# Segment a new sentence and wrap the space-joined tokens in a one-element
# list, ready to be passed to the vectoriser
s = '特斯拉降价了，真烦人'
a = [' '.join(jieba.lcut(s, cut_all=False))]

In [None]:
# Vectorise the segmented sentence with the fitted CountVectorizer
a_ = vec.transform(raw_documents=a)

In [None]:
# Predicted category label for the sentence
nb.predict(X=a_)

In [None]:
# 使用TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectoriser on the training texts, excluding the stop words
tf_idf = (
    TfidfVectorizer(stop_words=stopwords)
    .fit(raw_documents=x_train)
)

In [None]:
# Vectorise the training and test texts with the fitted TF-IDF vectoriser
x_train_tf, x_test_tf = (tf_idf.transform(texts) for texts in (x_train, x_test))

In [None]:
# Fit a second multinomial naive Bayes classifier, this time on TF-IDF features
nb_tf = MultinomialNB().fit(X=x_train_tf, y=y_train)

In [None]:
# Score of the TF-IDF model on the training split
nb_tf.score(X=x_train_tf, y=y_train)

In [None]:
# Score of the TF-IDF model on the held-out test split
nb_tf.score(X=x_test_tf, y=y_test)

In [None]:
# Predicted labels for the test split
nb_tf.predict(X=x_test_tf)

In [None]:
# Probability estimates for each test sample, one value per class
nb_tf.predict_proba(X=x_test_tf)