In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import punkt
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


In [2]:
def get_subject(data):
    data["subject"].value_counts()

In [3]:
def pure_dataset(data):
    '''
    数据清洗
    :param data:
    :return:
    '''
    new_data = []
    pattern = "[^a-zA-Z]"
    lemma = nltk.WordNetLemmatizer()
    for txt in data:
        txt = re.sub(pattern, " ", txt)
        txt = txt.lower() #大小写不做区分，同一个单词
        txt = nltk.word_tokenize(txt)
        txt = [lemma.lemmatize(word) for word in txt]
        for t in txt:
            if len(t) < 4:
                txt.remove(t)
        txt = " ".join(txt)
        new_data.append(txt)
    return new_data

In [4]:
class news_classification(nn.Module):
    def __init__(self):
        super(news_classification, self).__init__()
        self.linear1 = nn.Linear(5008, 2000)
        self.relu1 = nn.ReLU() #激活函数

        self.linear2 = nn.Linear(2000, 500)
        self.relu2 = nn.ReLU()

        self.linear3 = nn.Linear(500, 100)
        self.relu3 = nn.ReLU()

        self.linear4 = nn.Linear(100, 20)
        self.relu4 = nn.ReLU()

        self.linear5 = nn.Linear(20, 2)

    def forward(self, x):
        out = self.linear1(x)
        out = self.relu1(out)
        out = self.linear2(out)
        out = self.relu2(out)
        out = self.linear3(out)
        out = self.relu3(out)
        out = self.linear4(out)
        out = self.relu4(out)
        out = self.linear5(out)
        return out

In [5]:
'''
data set from kaggle, true & false
'''
true_data = pd.read_csv("archive/True.csv")
fake_data = pd.read_csv("archive/Fake.csv")


In [6]:
'''
为数据集添加标签
'''
true_data["label"] = np.ones(len(true_data),dtype=int)
fake_data["label"] = np.zeros(len(fake_data), dtype=int)


In [7]:
'''
数据合并
'''
data = pd.concat((true_data,fake_data),axis=0)
print("--------start puring data--------")
new_text = pure_dataset(data.text)
new_title = pure_dataset(data.title)
print("--------data pured!--------")
data = data.sample(frac=1)


--------start puring data--------
--------data pured!--------


In [8]:
data = pd.get_dummies(data,columns=["subject"])
data.head()

Unnamed: 0,title,text,date,label,subject_Government News,subject_Middle-east,subject_News,subject_US_News,subject_left-news,subject_politics,subject_politicsNews,subject_worldnews
17298,Russia's Putin signs decree imposing restricti...,MOSCOW (Reuters) - Russian President Vladimir ...,"October 16, 2017",1,0,0,0,0,0,0,0,1
18037,"Philippine president sees biggest ratings dip,...",MANILA (Reuters) - Trust and satisfaction in P...,"October 8, 2017",1,0,0,0,0,0,0,0,1
2004,Trump Staff Hangs Absurdly Offensive Abraham ...,"A few years ago, teenage pop star Justin Biebe...","March 25, 2017",0,0,0,1,0,0,0,0,0
1230,Trump strikes blow at Iran nuclear deal in maj...,WASHINGTON (Reuters) - U.S. President Donald T...,"October 13, 2017",1,0,0,0,0,0,0,1,0
20254,U.S. diplomatic tiff with Russia should not be...,WASHINGTON (Reuters) - U.S. Secretary of State...,"September 12, 2017",1,0,0,0,0,0,0,0,1


In [9]:
data = data.drop("date",axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 17298 to 17972
Data columns (total 11 columns):
title                      44898 non-null object
text                       44898 non-null object
label                      44898 non-null int64
subject_Government News    44898 non-null uint8
subject_Middle-east        44898 non-null uint8
subject_News               44898 non-null uint8
subject_US_News            44898 non-null uint8
subject_left-news          44898 non-null uint8
subject_politics           44898 non-null uint8
subject_politicsNews       44898 non-null uint8
subject_worldnews          44898 non-null uint8
dtypes: int64(1), object(2), uint8(8)
memory usage: 1.7+ MB


In [10]:
'''
将数据转化为矩阵的形式
并
'''
print("--------vetorizing data--------")
vectorizer_title = CountVectorizer(stop_words="english",max_features=1000)
vectorizer_text = CountVectorizer(stop_words="english",max_features=4000)

title_matrix = vectorizer_title.fit_transform(new_title).toarray()
text_matrix = vectorizer_text.fit_transform(new_text).toarray()


--------vetorizing data--------


In [11]:
data.head()

Unnamed: 0,title,text,label,subject_Government News,subject_Middle-east,subject_News,subject_US_News,subject_left-news,subject_politics,subject_politicsNews,subject_worldnews
17298,Russia's Putin signs decree imposing restricti...,MOSCOW (Reuters) - Russian President Vladimir ...,1,0,0,0,0,0,0,0,1
18037,"Philippine president sees biggest ratings dip,...",MANILA (Reuters) - Trust and satisfaction in P...,1,0,0,0,0,0,0,0,1
2004,Trump Staff Hangs Absurdly Offensive Abraham ...,"A few years ago, teenage pop star Justin Biebe...",0,0,0,1,0,0,0,0,0
1230,Trump strikes blow at Iran nuclear deal in maj...,WASHINGTON (Reuters) - U.S. President Donald T...,1,0,0,0,0,0,0,1,0
20254,U.S. diplomatic tiff with Russia should not be...,WASHINGTON (Reuters) - U.S. Secretary of State...,1,0,0,0,0,0,0,0,1


In [12]:
data.drop(["title","text"],axis=1,inplace=True)
y = data.label
x = np.concatenate((np.array(data.drop("label",axis=1)),title_matrix,text_matrix),axis=1)
X_train,X_test,Y_train,Y_test = train_test_split(x,np.array(y),test_size=0.25,random_state=1)
# train test分别为训练集和检测集
print("--------train set split finished!--------")
data.info()

--------train set split finished!--------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 17298 to 17972
Data columns (total 9 columns):
label                      44898 non-null int64
subject_Government News    44898 non-null uint8
subject_Middle-east        44898 non-null uint8
subject_News               44898 non-null uint8
subject_US_News            44898 non-null uint8
subject_left-news          44898 non-null uint8
subject_politics           44898 non-null uint8
subject_politicsNews       44898 non-null uint8
subject_worldnews          44898 non-null uint8
dtypes: int64(1), uint8(8)
memory usage: 1.0 MB


In [13]:
'''
建立新闻分类模型
'''
model = news_classification()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
error = nn.CrossEntropyLoss()


In [None]:

'''
模型训练
'''
X_train = torch.Tensor(X_train)
Y_train = torch.LongTensor(Y_train)

X_test = torch.Tensor(X_test)
Y_test = torch.Tensor(Y_test)

loss_list = []
rate_list = []

epoch = 30 #迭代20次
print("--------start training--------")
for e in range(epoch):
    optimizer.zero_grad() #清空梯度
    fout = model(X_train) #foward prop
    loss = error(fout, Y_train) #evaluate loss
    loss.backward() #backward prop
    optimizer.step() #update param
    print("epoch {}: loss {}".format(e, loss))
    loss_list.append(loss)
    # prediction and test
    y_head = model(X_test)
    y_pred = torch.max(y_head, 1)[1]
    ascore = accuracy_score(y_pred,Y_test)
    print("rate of good prediction: ", ascore)
    rate_list.append(ascore)
print("--------end of training--------")

edata = [i for i in range(e+1)]
plt.plot(edata,loss_list,color='red',linewidth=2.0,linestyle='--')
plt.plot(edata,rate_list,color='blue',linewidth=3.0,linestyle='-.')
plt.legend(labels=['Loss','Accuracy'])
plt.show()

--------start training--------
epoch 0: loss 0.4626637101173401
rate of good prediction:  0.7835189309576838
epoch 1: loss 0.24454858899116516
rate of good prediction:  0.7608017817371938
epoch 2: loss 0.28707796335220337
rate of good prediction:  0.7770155902004454
epoch 3: loss 0.24912941455841064
rate of good prediction:  0.8005345211581292
epoch 4: loss 0.22422780096530914
rate of good prediction:  0.7998218262806236
epoch 5: loss 0.2270127683877945
rate of good prediction:  0.8226280623608018
epoch 6: loss 0.16395416855812073
rate of good prediction:  0.8349220489977728
epoch 7: loss 0.16253311932086945
rate of good prediction:  0.8362583518930957
epoch 8: loss 0.1901250034570694
rate of good prediction:  0.8472160356347439
epoch 9: loss 0.15009337663650513
rate of good prediction:  0.8685077951002227
epoch 10: loss 0.08465636521577835
rate of good prediction:  0.8548775055679287
epoch 11: loss 0.1309114545583725
rate of good prediction:  0.8808017817371938
epoch 12: loss 0.069701