In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import punkt
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


In [2]:
def get_subject(data):
    '''
    Count how many rows carry each value of the "subject" column.

    :param data: table indexable by "subject" whose column supports
                 value_counts() (e.g. a pandas DataFrame)
    :return: the result of data["subject"].value_counts()
    '''
    # The counts were previously computed and then discarded; return them.
    return data["subject"].value_counts()

In [3]:
def pure_dataset(data):
    '''
    Clean raw text entries so they can be fed to a bag-of-words vectorizer.

    For every entry: each character that is not an ASCII letter is replaced
    by a space, the text is lower-cased, tokenized with nltk.word_tokenize,
    each token is lemmatized with nltk.WordNetLemmatizer, tokens shorter
    than four characters are dropped, and the survivors are joined with
    single spaces.

    :param data: iterable of strings (e.g. a pandas Series of article texts)
    :return: list of cleaned strings, one per input entry, in input order
    '''
    pattern = "[^a-zA-Z]"
    lemma = nltk.WordNetLemmatizer()
    new_data = []
    for txt in data:
        txt = re.sub(pattern, " ", txt)
        txt = txt.lower()  # case-insensitive: "News" and "news" are the same word
        lemmas = (lemma.lemmatize(word) for word in nltk.word_tokenize(txt))
        # Build a new filtered list instead of calling remove() on the list
        # being iterated: removing in place shifts the remaining items left,
        # so the token right after every removed one was never checked and
        # short tokens leaked into the output.
        kept = [word for word in lemmas if len(word) >= 4]
        new_data.append(" ".join(kept))
    return new_data

In [4]:
class news_classification(nn.Module):
    '''
    Fully connected classifier: five linear layers with a ReLU after each of
    the first four.

    Layer widths are input_dim -> 2000 -> 500 -> 100 -> 20 -> num_classes.
    The last layer has no activation, so forward() returns raw scores.

    :param input_dim: width of each input feature vector. Defaults to 5008,
                      the width this notebook builds (8 subject indicator
                      columns + 1000 title counts + 4000 text counts).
    :param num_classes: number of output scores per sample. Defaults to 2.
    '''
    def __init__(self, input_dim=5008, num_classes=2):
        super().__init__()
        # Attribute names are kept as linear1..linear5 / relu1..relu4 so the
        # state_dict keys stay the same as before.
        self.linear1 = nn.Linear(input_dim, 2000)
        self.relu1 = nn.ReLU()  # activation function

        self.linear2 = nn.Linear(2000, 500)
        self.relu2 = nn.ReLU()

        self.linear3 = nn.Linear(500, 100)
        self.relu3 = nn.ReLU()

        self.linear4 = nn.Linear(100, 20)
        self.relu4 = nn.ReLU()

        self.linear5 = nn.Linear(20, num_classes)

    def forward(self, x):
        '''
        Run the input through the five layers.

        :param x: float tensor whose last dimension equals input_dim
        :return: tensor of raw class scores whose last dimension equals
                 num_classes
        '''
        out = self.relu1(self.linear1(x))
        out = self.relu2(self.linear2(out))
        out = self.relu3(self.linear3(out))
        out = self.relu4(self.linear4(out))
        return self.linear5(out)

In [5]:
# Load the Kaggle news data set: one CSV of true articles, one of fake ones.
fake_data = pd.read_csv("archive/Fake.csv")
true_data = pd.read_csv("archive/True.csv")


In [6]:
# Attach the class label to every row: 0 marks a fake article, 1 a true one.
fake_data["label"] = np.full(len(fake_data), 0, dtype=int)
true_data["label"] = np.full(len(true_data), 1, dtype=int)


In [7]:
# Merge the true and fake articles into one frame, shuffle it, then clean
# the text and title columns.
data = pd.concat((true_data,fake_data),axis=0)
# Shuffle BEFORE cleaning. new_text / new_title are plain lists that are
# later vectorized and stacked next to the label and subject columns of
# `data` by position; shuffling `data` after building them put the text
# features in a different row order than the labels.
# random_state makes the shuffle reproducible across kernel restarts.
data = data.sample(frac=1, random_state=1)
print("--------start puring data--------")
new_text = pure_dataset(data.text)
new_title = pure_dataset(data.title)
print("--------data pured!--------")


--------start puring data--------
--------data pured!--------


In [8]:
# One-hot encode the categorical "subject" column; the displayed output shows
# it being replaced by eight subject_* indicator columns.
# `data` is rebound without its "subject" column, so this cell is meant to be
# run once per fresh `data`.
# NOTE(review): in the five rows displayed below, every label-1 row is
# subject_worldnews and every label-0 row has a different subject — check
# whether subject leaks the label before using it as a feature.
data = pd.get_dummies(data,columns=["subject"])
data.head()

Unnamed: 0,title,text,date,label,subject_Government News,subject_Middle-east,subject_News,subject_US_News,subject_left-news,subject_politics,subject_politicsNews,subject_worldnews
14481,Greeks march to mark 1973 student revolt again...,ATHENS (Reuters) - Greek police clashed with h...,"November 17, 2017",1,0,0,0,0,0,0,0,1
14177,"A $50,000 FREE-FOR-ALL FOR MINORITIES In The B...",PIGFORD has to be the biggest scam and fraud E...,"Apr 3, 2016",0,0,0,0,0,0,1,0,0
2935,Protesters Just Trolled Trump’s ‘Deploraball’...,Trump supporters sure were in for a surprise a...,"January 20, 2017",0,0,0,1,0,0,0,0,0
20347,RINO MITCH McCONNELL Praises Hillary Day After...,Is there a law against waterboarding Republica...,"Jun 30, 2016",0,0,0,0,0,1,0,0,0
15255,Syria's Eastern Ghouta faces 'complete catastr...,"GENEVA (Reuters) - The 400,000 civilians besie...","November 9, 2017",1,0,0,0,0,0,0,0,1


In [9]:
# Drop the "date" column (it is not part of the feature matrix built later),
# then print the frame's column/dtype summary.
data = data.drop("date",axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 14481 to 13893
Data columns (total 11 columns):
title                      44898 non-null object
text                       44898 non-null object
label                      44898 non-null int64
subject_Government News    44898 non-null uint8
subject_Middle-east        44898 non-null uint8
subject_News               44898 non-null uint8
subject_US_News            44898 non-null uint8
subject_left-news          44898 non-null uint8
subject_politics           44898 non-null uint8
subject_politicsNews       44898 non-null uint8
subject_worldnews          44898 non-null uint8
dtypes: int64(1), object(2), uint8(8)
memory usage: 1.7+ MB


In [10]:
# Turn the cleaned titles and article bodies into dense bag-of-words count
# matrices: English stop words are ignored and the vocabulary is capped at
# 1000 terms for titles and 4000 terms for bodies.
print("--------vetorizing data--------")

vectorizer_title = CountVectorizer(max_features=1000, stop_words="english")
title_matrix = vectorizer_title.fit_transform(new_title).toarray()

vectorizer_text = CountVectorizer(max_features=4000, stop_words="english")
text_matrix = vectorizer_text.fit_transform(new_text).toarray()


--------vetorizing data--------


In [11]:
# Preview the first rows again: "date" is gone and the subject_* indicator
# columns are in place.
data.head()

Unnamed: 0,title,text,label,subject_Government News,subject_Middle-east,subject_News,subject_US_News,subject_left-news,subject_politics,subject_politicsNews,subject_worldnews
14481,Greeks march to mark 1973 student revolt again...,ATHENS (Reuters) - Greek police clashed with h...,1,0,0,0,0,0,0,0,1
14177,"A $50,000 FREE-FOR-ALL FOR MINORITIES In The B...",PIGFORD has to be the biggest scam and fraud E...,0,0,0,0,0,0,1,0,0
2935,Protesters Just Trolled Trump’s ‘Deploraball’...,Trump supporters sure were in for a surprise a...,0,0,0,1,0,0,0,0,0
20347,RINO MITCH McCONNELL Praises Hillary Day After...,Is there a law against waterboarding Republica...,0,0,0,0,0,1,0,0,0
15255,Syria's Eastern Ghouta faces 'complete catastr...,"GENEVA (Reuters) - The 400,000 civilians besie...",1,0,0,0,0,0,0,0,1


In [12]:
# Assemble the feature matrix and split it into training and test sets.
# The raw string columns are dropped in place; their information now lives
# in title_matrix / text_matrix.
data.drop(labels=["title", "text"], axis="columns", inplace=True)
y = data["label"]
# Column layout of x: subject indicators | title counts | text counts.
x = np.hstack([
    data.drop(labels="label", axis="columns").values,
    title_matrix,
    text_matrix,
])
# 75% of the rows go to training, 25% to testing; fixed seed for the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y.values, test_size=0.25, random_state=1
)
print("--------train set split finished!--------")
data.info()

--------train set split finished!--------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 14481 to 13893
Data columns (total 9 columns):
label                      44898 non-null int64
subject_Government News    44898 non-null uint8
subject_Middle-east        44898 non-null uint8
subject_News               44898 non-null uint8
subject_US_News            44898 non-null uint8
subject_left-news          44898 non-null uint8
subject_politics           44898 non-null uint8
subject_politicsNews       44898 non-null uint8
subject_worldnews          44898 non-null uint8
dtypes: int64(1), uint8(8)
memory usage: 1.0 MB


In [13]:
# Build the news classifier together with its loss function and optimizer.
# NOTE(review): the recorded training output shows the loss jumping from
# about 0.69 to about 11.04 after the first update — lr=0.01 may be too high.
error = nn.CrossEntropyLoss()
model = news_classification()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)


In [None]:

# Train the model with full-batch gradient steps, evaluating on the test set
# after every step, then plot loss and accuracy per epoch.

# as_tensor accepts both NumPy arrays and tensors, so the cell can be re-run.
# Labels are integer class indices on both splits (Y_test used to be float).
X_train = torch.as_tensor(X_train, dtype=torch.float32)
Y_train = torch.as_tensor(Y_train, dtype=torch.long)

X_test = torch.as_tensor(X_test, dtype=torch.float32)
Y_test = torch.as_tensor(Y_test, dtype=torch.long)

loss_list = []
rate_list = []

epoch = 30  # number of training iterations (each one uses the whole training set)
print("--------start training--------")
for e in range(epoch):
    model.train()
    optimizer.zero_grad()  # clear gradients left from the previous step
    fout = model(X_train)  # forward pass
    loss = error(fout, Y_train)  # evaluate loss
    loss.backward()  # backward pass
    optimizer.step()  # update parameters
    # Store a plain float: appending the tensor itself kept every epoch's
    # autograd graph alive and made plt.plot fail when converting to NumPy.
    loss_value = loss.item()
    print("epoch {}: loss {}".format(e, loss_value))
    loss_list.append(loss_value)
    # prediction and test — no gradients are needed for evaluation
    model.eval()
    with torch.no_grad():
        y_head = model(X_test)
    y_pred = torch.max(y_head, 1)[1]  # index of the highest score = predicted class
    # sklearn's signature is accuracy_score(y_true, y_pred)
    ascore = accuracy_score(Y_test.cpu().numpy(), y_pred.cpu().numpy())
    print("rate of good prediction: ", ascore)
    rate_list.append(ascore)
print("--------end of training--------")

# Derive the x axis from the recorded history rather than the leaked loop variable.
edata = list(range(len(loss_list)))
plt.plot(edata,loss_list,color='red',linewidth=2.0,linestyle='--')
plt.plot(edata,rate_list,color='blue',linewidth=3.0,linestyle='-.')
plt.legend(labels=['Loss','Accuracy'])
plt.xlabel("epoch")
plt.show()

--------start training--------
epoch 0: loss 0.6935341954231262
rate of good prediction:  0.4710913140311804
epoch 1: loss 11.03582763671875


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Confusion matrix of the last epoch's test predictions, drawn as an
# annotated heatmap.
# The result is stored as `cm`: assigning it to `confusion_matrix` replaced
# the sklearn function with an array, so the cell could not be run twice.
cm = confusion_matrix(
    y_true=np.asarray(Y_test).astype(int),
    y_pred=np.asarray(y_pred).astype(int),
)

# Plain matplotlib is used because `sns` is never imported in this notebook.
fig,ax = plt.subplots(figsize=(6,6))
im = ax.imshow(cm)
fig.colorbar(im, ax=ax)
for (row, col), count in np.ndenumerate(cm):
    # Dark text on bright cells, light text on dark cells, for readability.
    ax.text(col, row, format(count, "0.1f"), ha="center", va="center",
            color="black" if count > cm.max() / 2 else "white")
ax.set_xticks(range(cm.shape[1]))
ax.set_yticks(range(cm.shape[0]))
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()