In [103]:
import re
import pandas as pd
import numpy as np

In [104]:
news_path = "data/sqlResult_1558435.txt"

In [105]:
def pretreat_news(news_file):
    '''
    对于新闻语料库进行预处理，提取出新闻正文以及给正文打标签
    '''
    file = open(news_file,encoding='gbk').read()
    
    ret = re.compile(r'"\d+",".*?","(.*?)","([\s\S]*?)"')
    
    for g in ret.finditer(file):
        
        publisher = g.group(1)
        if '新华社' in publisher:
            tag = 1
        else:
            tag = 0
        
        news_content = g.group(2)
        news_content = news_content.replace('\n','')
        news_content = news_content.replace(" ","")
        news_content = news_content.replace("\s","")
        news_content = news_content.replace("\t","")
        news_content = news_content.replace("\u3000","")
        news_content = news_content.replace("\b","").strip()
        
        yield tag,news_content

In [106]:
def save_file(news_path,save_path):
    '''
    将处理好的新闻数据进行保存 方便之后使用
    '''
    output = open(save_path,"w+",encoding="utf-8")
    for tag,news in pretreat_news(news_path):
        if len(news) == 0 :
            continue
        else:
            output.write(str(tag) + " " + news + "\n")
    output.close()

In [107]:
save_file(news_path,"data/news_data")

In [109]:
# 使用pandas加载新闻数据
data = pd.read_csv("data/news_data",sep=" ",names=["分类","正文"])

In [110]:
data.head()

Unnamed: 0,分类,正文
0,0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...
1,0,骁龙835作为唯一通过Windows10桌面平台认证的ARM处理器，高通强调，不会因为只考虑...
2,0,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。至于电池...
3,1,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄
4,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）@深圳交警微博称：昨日清晨交警发...


In [111]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87054 entries, 0 to 87053
Data columns (total 2 columns):
分类    87054 non-null int64
正文    87054 non-null object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


### 对于新闻正文进行分词处理

In [112]:
import jieba

In [113]:
jieba.enable_parallel() # 开启并行分词
data['文本分词'] = data['正文'].apply(lambda i: jieba.cut(i))
data['文本分词'] = [' '.join(i) for i in data['文本分词']]
data.head()

Unnamed: 0,分类,正文,文本分词
0,0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,此外 ， 自 本周 （ 6 月 12 日 ） 起 ， 除 小米 手机 6 等 15 款 机型...
1,0,骁龙835作为唯一通过Windows10桌面平台认证的ARM处理器，高通强调，不会因为只考虑...,骁龙 835 作为 唯一 通过 Windows10 桌面 平台 认证 的 ARM 处理器 ，...
2,0,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。至于电池...,此前 的 一加 3T 搭载 的 是 3400mAh 电池 ， DashCharge 快充 规...
3,1,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄,这是 6 月 18 日 在 葡萄牙 中部 大 佩德罗 冈 地区 拍摄 的 被 森林 大火 烧...
4,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）@深圳交警微博称：昨日清晨交警发...,（ 原 标题 ： 44 岁 女子 跑 深圳 约会 网友 被 拒 ， 暴雨 中 裸身 奔走 …...


Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/liyehong/anaconda3/envs/tensoflow/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/liyehong/anaconda3/envs/tensoflow/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/liyehong/anaconda3/envs/tensoflow/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/liyehong/anaconda3/envs/tensoflow/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/liyehong/anaconda3/envs/tensoflow/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/Users/liyehong/anaconda3/envs/tensoflow/lib/python3.7/multiprocessing/pool.py", line 1

### 将数据集划分成训练集和测试集

In [114]:
y = data['分类']

In [117]:
from sklearn.model_selection import train_test_split

In [119]:
xtrain,xvalid,ytrain,yvalid = train_test_split(data.文本分词.values,y,
                                               stratify=y,random_state=42,test_size=0.1,shuffle=True)

In [120]:
print(xtrain.shape)
print(ytrain.shape)
print(xvalid.shape)
print(yvalid.shape)

(78348,)
(78348,)
(8706,)
(8706,)


### 使用TF-IDF和逻辑回归构建模型

In [128]:
def number_normalizer(tokens):
    '''
    将所有数据标记映射为一个占位符(Placeholder)。
    将所有的数字都表示成同一个符号已达到降维的目的。
    '''
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [130]:
class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer,self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))

In [132]:
stopwords = [line.strip() for line in open('data/哈工大停用词表.txt','r',encoding='utf-8').readlines()]

In [133]:
tfv = NumberNormalizingVectorizer(min_df=3,max_df=0.5,max_features=None,ngram_range=(1,2),
                            use_idf=True,smooth_idf=True,stop_words=stopwords)

In [169]:
# 使用TF-IDF来fit训练集
tfv.fit(list(xtrain))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [135]:
from sklearn.linear_model import LogisticRegression

In [170]:
#利用提取的TFIDF特征来fit一个简单的Logistic Regression
clf = LogisticRegression(C=1.0,solver='liblinear',multi_class='ovr')
clf.fit(xtrain_tfv,ytrain)
predictions = clf.predict_proba(xvalid_tfv)

In [171]:
predictions

array([[0.95109223, 0.04890777],
       [0.04382422, 0.95617578],
       [0.14913136, 0.85086864],
       ...,
       [0.04151485, 0.95848515],
       [0.00139134, 0.99860866],
       [0.0092382 , 0.9907618 ]])

In [172]:
def get_y_predict(predictions):
    return [1 if prob[1] >= 0.5 else 0 for prob in predictions]

In [173]:
y_predict_TFidf = get_y_predict(predictions)

### 对模型进行评估

In [150]:
from sklearn.metrics import accuracy_score

In [154]:
y_actual = list(yvalid)

In [174]:
print("测试集准确率：{}".format(accuracy_score(y_actual,y_predict_TFidf)))

测试集准确率：0.9640477831380657


### 使用CountVectorizer和Logistics Regression构建模型

In [None]:
ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2),
                      stop_words = stwlist)

# 使用Count Vectorizer来fit训练集和测试集（半监督学习）
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

#利用提取的word counts特征来fit一个简单的Logistic Regression 

clf = LogisticRegression(C=1.0,solver='lbfgs',multi_class='multinomial')
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [157]:
from sklearn.feature_extraction.text import CountVectorizer

In [158]:
ctv = CountVectorizer(min_df=3,max_df=0.5,ngram_range=(1,2),stop_words=stopwords)

In [164]:
ctv.fit(list(xtrain))
xtrain_ctv =  ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)

In [177]:
clf = LogisticRegression(C=1.0,solver='liblinear',multi_class='ovr')
clf.fit(xtrain_ctv, ytrain)
predictions_CV = clf.predict_proba(xvalid_ctv)

In [178]:
y_predict_CV = get_y_predict(predictions_CV)

In [179]:
print("测试集准确率：{}".format(accuracy_score(y_actual,y_predict_CV)))

测试集准确率：0.9886285320468642


比使用TF-idf高了2%，效果更好

### 使用TF-IDF和朴素贝叶斯

In [181]:
from sklearn.naive_bayes import MultinomialNB

In [182]:
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions_NB = clf.predict_proba(xvalid_tfv)

In [183]:
y_predict_NB = get_y_predict(predictions_NB)

In [184]:
print("测试集准确率：{}".format(accuracy_score(y_actual,y_predict_NB)))

测试集准确率：0.9223524006432345


### 使用CountVectorizer和朴素贝叶斯

In [185]:
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions_NB = clf.predict_proba(xvalid_ctv)

In [186]:
y_predict_NB = get_y_predict(predictions_NB)

In [187]:
print("测试集准确率：{}".format(accuracy_score(y_actual,y_predict_NB)))

测试集准确率：0.9249942568343671


### 使用CountVectorizer和SVM

在使用SVM之前先进行数据的标准化

In [195]:
from sklearn.decomposition import TruncatedSVD 
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.svm import SVC

In [190]:
#使用SVD进行降维，components设为120，对于SVM来说，SVD的components的合适调整区间一般为120~200 
svd = TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

In [192]:
#对从SVD获得的数据进行缩放
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [196]:
# 调用下SVM模型
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

In [197]:
y_predict_SVM = get_y_predict(predictions)

In [198]:
print("测试集准确率：{}".format(accuracy_score(y_actual,y_predict_SVM)))

测试集准确率：0.9406156673558466


SVM的效果也还算可以