# Playing with real & fake news data from Kaggle

- 承接[newsExplore](../DataExploring/newsExplore.ipynb)中内容
- 进行一部分特征工程，依赖其结果尝试一些简单模型
- 使用GRU与BERT，根据新闻正文内容进行预测

## 特征提取

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv


In [2]:
real_news = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake_news = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

### 移除Real News特有的，指名新闻消息来源的前置文本。
- 如：`WASHINGTON (Reuters) - `

In [3]:
import re 
def removePrefix(text):
    pattern = r"^([A-Z]).*?-\s"
    text = re.sub(pattern, '', text)
    return text

real_news.text = real_news.text.apply(lambda x : removePrefix(x))
real_news.text[1]

'Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after President Donald Trump’s administration decided not to appeal rulings that blocked his transgender ban. Two federal appeals courts, one in Washington and one in Virginia, last week rejected the administration’s request to put on hold orders by lower court judges requiring the military to begin accepting transgender recruits on Jan. 1. A Justice Department official said the administration will not challenge those rulings. “The Department of Defense has announced that it will be releasing an independent study of these issues in the coming weeks. So rather than litigate this interim appeal before that occurs, the administration has decided to wait for DOD’s study and will continue to defend the president’s lawful authority in District Court in the meantime,” the official said, speaking on condition of anonymity. In Septem

### 虽然info中给出数据各列都未非空，实际上有空文本存在（仅含空格），对其进行清理
- 同时还要注意清理重复新闻
- 注意清理后真假新闻两类数目的不平衡会进一步扩大

In [4]:
real_news['valid'] = 1
fake_news['valid'] = 0

news_source = pd.concat([real_news, fake_news], axis=0)
news_source = news_source[news_source[['text', 'title', 'date']].duplicated() == False]
news_source.text = news_source.text.apply(lambda x: np.nan if len(x.strip()) < 1 else x)
news_source = news_source.dropna()
news_source.drop(columns=['subject'], inplace=True)
news_source.duplicated().sum()

0

In [5]:
news_source.to_csv('./Source.csv')

In [6]:
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import nltk
import string
import pandas as pd

news_source = pd.read_csv('./Source.csv')
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

### 将两类新闻中前十的高频词取出，由之前的分析可看出，两类新闻有着不同的高频话题倾向
- 故将各条新闻中所含的，此两类高频词的数目作为特征提出

In [7]:
def token_freq(df, feature, valid):
    tar_texts = df[df.valid == valid][feature].values
    texts = ' '.join(tar_texts).lower()
    tokens = ''.join(char for char in texts if char not in punctuations).split()
    tokens_cleaned = [word for word in tokens if word not in stop_words]
    return pd.DataFrame(nltk.FreqDist(tokens_cleaned).most_common(10))[0]

real_title_freq = token_freq(news_source, 'title', 1)
real_text_freq = token_freq(news_source, 'text', 1)
fake_title_freq = token_freq(news_source, 'title', 0)
fake_text_freq = token_freq(news_source, 'text', 0)

In [8]:
def count_freq_token(text, freq_df):
    text = text.lower()
    tokens = ''.join(char for char in text if char not in punctuations).split()
    count = 0
    for token in tokens:
        if token in freq_df.values:
            count += 1
    return count

In [9]:
news_source['fake_title_token_freq'] = news_source.title.apply(lambda x : count_freq_token(x, fake_title_freq))
news_source['read_title_token_freq'] = news_source.title.apply(lambda x : count_freq_token(x, real_title_freq))
news_source['fake_text_token_freq'] = news_source.title.apply(lambda x : count_freq_token(x, fake_text_freq))
news_source['read_text_token_freq'] = news_source.title.apply(lambda x : count_freq_token(x, real_text_freq))

### 已知假新闻中会使用大量带有感情色彩的符号来增强语气，故将其数目作为特征提出。

In [10]:
def countPunctuation(text):
    ques = re.subn(r"\?", "", text)[1]
    exclam = re.subn(r"\!", "", text)[1]
    return ques, exclam

In [11]:
news_source[['title_ques_num', 'title_exclam_num']] = news_source.title.apply(lambda x : pd.Series(countPunctuation(x)))
news_source[['text_ques_num', 'text_exclam_num']] = news_source.text.apply(lambda x : pd.Series(countPunctuation(x)))

### 假新闻和真新闻在平均长度上是有区别的，故将几个长度作为特征去除

In [12]:
news_source['title_len'] = news_source.title.apply(lambda x : len(x))
news_source['title_ratio'] = news_source.text.apply(lambda x : len(x))
news_source['title_ratio'] = news_source['title_len'] / news_source['title_ratio']

In [13]:
news_source.to_csv('./NewsAna.csv')

In [14]:
news_source.corr().valid

Unnamed: 0               0.120031
valid                    1.000000
fake_title_token_freq   -0.473264
read_title_token_freq    0.194594
fake_text_token_freq    -0.061342
read_text_token_freq     0.064952
title_ques_num          -0.144762
title_exclam_num        -0.238136
text_ques_num           -0.324997
text_exclam_num         -0.237945
title_len               -0.595802
title_ratio             -0.066529
Name: valid, dtype: float64

## 随机打乱，开始利用简单模型建模拟合

In [15]:
import pandas as pd
news_source = pd.read_csv('./NewsAna.csv')
news_source.drop(columns=['Unnamed: 0', 'title', 'text', 'date'], inplace=True)

In [16]:
news_source.reset_index()
news_source = news_source.sample(frac=1.)

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

train_set, test_set = train_test_split(news_source, test_size=0.2, random_state=7)
train_x = train_set[['fake_title_token_freq', 'read_title_token_freq', 'fake_text_token_freq', 
                 'read_text_token_freq', 'title_ques_num', 'title_exclam_num', 'text_ques_num', 
                 'text_exclam_num', 'title_len', 'title_ratio']]
train_y = train_set['valid']
test_x = test_set[['fake_title_token_freq', 'read_title_token_freq', 'fake_text_token_freq', 
                 'read_text_token_freq', 'title_ques_num', 'title_exclam_num', 'text_ques_num', 
                 'text_exclam_num', 'title_len', 'title_ratio']]
test_y = test_set['valid']

In [18]:
def model_report(model, tar_x, tar_y):
    pred = model.predict(tar_x)
    f1 = f1_score(tar_y, pred)
    print("f1-score: ", f1)
    acc = accuracy_score(tar_y, pred)
    print("accuracy: ", acc)
    cm = confusion_matrix(tar_y, pred)
    print("confusion matrix:\n",cm)

In [19]:
randomForest = RandomForestClassifier(random_state=7)
randomForest.fit(train_x, train_y)

RandomForestClassifier(random_state=7)

In [20]:
print("RF on training set:")
model_report(randomForest, train_x, train_y)
print("\nRF on testing set:")
model_report(randomForest, test_x, test_y)

RF on training set:
f1-score:  0.9993524459881086
accuracy:  0.999288693459213
confusion matrix:
 [[13931    21]
 [    1 16976]]

RF on testing set:
f1-score:  0.926496530636246
accuracy:  0.9191775507564981
confusion matrix:
 [[3169  343]
 [ 282 3939]]


In [21]:
from sklearn.tree import DecisionTreeClassifier

decisionTree = DecisionTreeClassifier()
decisionTree.fit(train_x, train_y)

DecisionTreeClassifier()

In [22]:
print("tree on training set:")
model_report(decisionTree, train_x, train_y)
print("\ntree on testing set:")
model_report(decisionTree, test_x, test_y)

tree on training set:
f1-score:  0.9993517208863745
accuracy:  0.999288693459213
confusion matrix:
 [[13950     2]
 [   20 16957]]

tree on testing set:
f1-score:  0.8997046662728885
accuracy:  0.8902107849476271
confusion matrix:
 [[3076  436]
 [ 413 3808]]


In [23]:
from sklearn.svm import LinearSVC

linearSVC = LinearSVC(max_iter=5000, penalty='l2')
linearSVC.fit(train_x, train_y)



LinearSVC(max_iter=5000)

In [24]:
print("LinearSVC on training set:")
model_report(linearSVC, train_x, train_y)
print("\nLinearSVC on testing set:")
model_report(linearSVC, test_x, test_y)

LinearSVC on training set:
f1-score:  0.9178554141396885
accuracy:  0.9067218468104368
confusion matrix:
 [[11926  2026]
 [  859 16118]]

LinearSVC on testing set:
f1-score:  0.9197431781701444
accuracy:  0.9094788568472779
confusion matrix:
 [[3022  490]
 [ 210 4011]]


- 可以看出，通过数据分析提取出的部分特征，已经支持简单模型在测试集上达到90%左右的准确率和F1得分
- 可见真假新闻还是有相当多差异的，通过经典的NLP特征工程来分辨它们也能有不错的效果
- 加入更多Info Retrieve与NLP的特征应当可以进一步增强表现，如利用tf-idf值来提取标题关键词，加入词干词根提取工程等方式

## 利用预训练BERT进行真假判定

In [25]:
from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler
import torch

batch_size = 64
epoch_num = 4
max_seq_length = 128

news_source = pd.read_csv('./NewsAna.csv')
news_source.reset_index()
news_source = news_source.sample(frac=1.)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model_path = "bert-base-uncased"

### 定义模型，将输出维度调整到最终判定需要的维度

In [26]:
class NewsModel(torch.nn.Module):
    def __init__(self, bert_model=bert_model_path, num_class=1, add_feature_num=0):
        super(NewsModel, self).__init__()
        # 加载预训练模型(from huggingface)
        self.bert_layer = AutoModel.from_pretrained(pretrained_model_name_or_path=bert_model)
        # 或许可以补入一点特征数据，来求取最终结果
        self.bert_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=bert_model)
        self.mid_dim = self.bert_config.hidden_size + add_feature_num
        # 进行最终分类
        self.output = torch.nn.Sequential(
            torch.nn.Linear(self.mid_dim, self.mid_dim//2),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(self.mid_dim//2, self.mid_dim),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(self.mid_dim, num_class),
            torch.nn.Sigmoid()
        )
    def forward(self, input_ids, attn_mask=None, add_features=[]):
        bert_out = self.bert_layer(input_ids=input_ids, attention_mask=attn_mask)[1]
        # 加入特征也应当为：[batch_size, add_feature_num]的形状
        if len(add_features) > 0:
            add_features = torch.tensor(add_features, dtype=torch.float)
            if add_features.shape[0] == bert_out.shape[0]:
                bert_out = torch.cat(bert_out, add_features)
        output = self.output(bert_out)
        return output

### 以Tensor的格式给出tokenize后的文本
- huggingface的tokenizer工作方式可以[看看这个](./HuggingfaceNote.ipynb)
- 以Tensor格式返回的好处就是能直接塞进TensorDataset中了，不需要再继承一个Dataset类

In [27]:
def covertTokenFormat(df, bert_model_path, max_seq_len):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=bert_model_path)
    titles = df.text.tolist()
    title_tokens = tokenizer(titles, padding='max_length', max_length=max_seq_len, truncation=True, return_tensors="pt")
    labels = torch.tensor(df.valid.values, dtype=torch.float)
    return title_tokens, labels

In [28]:
news_model = NewsModel(bert_model=bert_model_path).to(device)
optimiser = AdamW(news_model.parameters(), lr=1e-5)
train_set, test_set = train_test_split(news_source, test_size=0.2, random_state=7)

text_tokens, labels = covertTokenFormat(train_set, bert_model_path, max_seq_length)
train_data = TensorDataset(text_tokens.input_ids, text_tokens.attention_mask, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [29]:
def news_bert_report(pred, label):
    tar_y = label.squeeze()
    pred_y = []
    for item in pred.squeeze():
        if item >= 0.5:
            pred_y.append(1)
        else:
            pred_y.append(0)
    f1 = f1_score(tar_y, pred_y)
    print("f1-score: ", f1)
    acc = accuracy_score(tar_y, pred_y)
    print("accuracy: ", acc)

### 进行训练
- 实际上根据在training set和testing set上的表现制定一套早停策略更好
- BERT非常强大，快速的在训练集上逼近了99.95+%

In [30]:
from torch.nn import functional as F
news_model = news_model.to(device)
news_model.train()
for epoch in range(epoch_num): 
    epoch_loss = 0
    pred_lis = torch.Tensor()
    label_lis = torch.Tensor()
    for batch, (token_ids, attn_mask, label) in enumerate(train_dataloader):
        # keep all the parameters in the same device
        token_ids = token_ids.to(device)
        attn_mask = attn_mask.to(device)
        label = label.to(device)
        # the output will be in the same device with the model
        outputs = news_model(token_ids, attn_mask)
        loss = F.binary_cross_entropy(outputs.squeeze(), label)
        # do the backprop and update the parameters
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        epoch_loss += loss.cpu().data.numpy()
        pred_lis = torch.cat([pred_lis, outputs.cpu().squeeze()])
        label_lis = torch.cat([label_lis, label.cpu().squeeze()])
        if batch % 50 == 0:
            print("Current batch loss :", loss.cpu().data.numpy())
    print("Now epoch :", epoch+1, " Total epoch loss is: ", epoch_loss)
    news_bert_report(pred_lis.detach().numpy(), label_lis.detach().numpy())

Current batch loss : 0.69488025
Current batch loss : 0.3863423
Current batch loss : 0.19158298
Current batch loss : 0.09455851
Current batch loss : 0.031617053
Current batch loss : 0.109588556
Current batch loss : 0.02393411
Current batch loss : 0.084877014
Current batch loss : 0.022406645
Current batch loss : 0.01850613
Now epoch : 1  Total epoch loss is:  61.19065617257729
f1-score:  0.9732358680140957
accuracy:  0.9702867858643991
Current batch loss : 0.005929833
Current batch loss : 0.008962766
Current batch loss : 0.04420775
Current batch loss : 0.08559545
Current batch loss : 0.053183675
Current batch loss : 0.0019030205
Current batch loss : 0.001381191
Current batch loss : 0.10759225
Current batch loss : 0.0022155712
Current batch loss : 0.0036287848
Now epoch : 2  Total epoch loss is:  5.632405258889776
f1-score:  0.9969665734059786
accuracy:  0.9966697921044974
Current batch loss : 0.00081905833
Current batch loss : 0.08888218
Current batch loss : 0.0011167352
Current batch lo

In [31]:
torch.save(news_model, './news_model_24_03_01.pkl')

### Kaggle的GPU也塞不下了，不过至少测试机集中这一百条数据全部正确。

In [32]:
news_model.eval()
test_tokens, test_labels = covertTokenFormat(test_set[:100], bert_model_path, max_seq_length)
test_ids = test_tokens.input_ids.to(device)
test_masks = test_tokens.attention_mask.to(device)
test_pred = news_model(test_ids, test_masks)
news_bert_report(test_pred.cpu().squeeze().detach().numpy(), test_labels)

torch.cuda.empty_cache()

f1-score:  1.0
accuracy:  1.0


## 利用GLOVE预训练词向量和GRU进行真假新闻判定
- 将会忽略正文中所有stopwords和标点符号
- 利用了golve 6b 50d的预训练词向量

In [33]:
import torch
import string
import torch.nn.utils.rnn as rnn_utils
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import pandas as pd
news_source = pd.read_csv('./NewsAna.csv')
news_source.reset_index()
news_source = news_source.sample(frac=1.)

stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)
cache_dir = './glove'
glove = GloVe(name='6B', dim=50, cache=cache_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

epoch_num = 30
batch_size = 128
lr = .001
input_dim = 50
output_dim = 1
gru_num_layers = 2

./glove/glove.6B.zip: 862MB [03:30, 4.10MB/s]                               
100%|█████████▉| 399999/400000 [00:14<00:00, 27930.42it/s]


### 定义GRU网络，使用预训练词向量的话，可以略过Embedding层
- 由于处理的是变长的数据，因此利用RNN的pad和unpad操作，捕捉每个输入句子的真正结束位置
- 指定了`batch_first=True`，故数据x进入rnn前不必交换batch（第0维）和sentence_length（第一维）位置

In [34]:
class NewsGRUModel(torch.nn.Module):
    def __init__(self, input_dim, output_dim, vocab_size=0, gru_num_layers=1, bidirectional=False, dropout=.3, hidden_layers = [128, 64, 128]):
        super(NewsGRUModel, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = input_dim // 2 if bidirectional else input_dim
        self.output_dim = output_dim
        self.gru_num_layers = gru_num_layers
        # Embedding
        if not vocab_size == 0:
            self.embed = torch.nn.Embedding(vocab_size, input_dim)
        # GRUs
        self.gru_layer = torch.nn.GRU(
            input_size=self.input_dim, 
            hidden_size=self.hidden_dim, 
            num_layers=self.gru_num_layers, 
            bidirectional=bidirectional, 
            batch_first=True,
            dropout=dropout
        )
        # The FFN to adjust the outputs
        if hidden_layers and not len(hidden_layers) == 0:
            # the dim is not changed through the two GRU layer
            hidden_list = [torch.nn.Linear(self.input_dim, hidden_layers[0])]
            for idx in range(len(hidden_layers) - 1):
                hidden_list.append(torch.nn.Linear(hidden_layers[idx], hidden_layers[idx + 1]))
            self.hidden_layer_list = torch.nn.ModuleList(hidden_list)
            # init the weights
            for layer in self.hidden_layer_list: 
                torch.nn.init.kaiming_normal_(layer.weight.data)
            self.hidden_out_dim = hidden_layers[-1]
        else:
            self.hidden_layer_list = []
            self.hidden_out_dim = self.input_dim
        # Output layer
        self.output = torch.nn.Linear(self.hidden_out_dim, self.output_dim)
        torch.nn.init.kaiming_normal_(self.output.weight.data)
        # Other functions
        self.activate = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, x, x_len, pretrained_embed=False):
        if not pretrained_embed:
            x = self.embed(x)
        # pack padded seq
        x = rnn_utils.pack_padded_sequence(x, x_len, batch_first=True, enforce_sorted=False)
        # GRU layer
        output, hidden_info = self.gru_layer(x)
        # pad packed seq
        output, length = rnn_utils.pad_packed_sequence(output, batch_first=True)
        # gather the output from the last unpad token
        fin_cell_outputs = []
        for idx in range(len(length)):
            fin_cell_outputs.append(output[idx][length[idx]-1])
        # stack the output
        output = torch.stack(fin_cell_outputs)
        # ffn process
        for layer in self.hidden_layer_list:
            output = layer(output)
            output = self.activate(output)
            output = self.dropout(output)
        # output layer, get logits
        output = self.output(output)
        return output

In [35]:
# 将句子转化为tokens，用了Glove的预训练词向量
def covertTextToGolveVec(df):
    golve_vecs = []
    titles = df.text.values
    for title in titles:
        tokens = word_tokenize(title.lower())
        for token in tokens:
            if token in stop_words or token in punctuations:
                tokens.remove(token)
        golve_vecs.append(glove.get_vecs_by_tokens(tokens))
    return golve_vecs

### 定义Dataset和collate_fn方法：
- Dataset用于数据存储，我们希望从Dataset中每次取出的批量数据是Tensor格式的，即可以直接放入GPU中进行批处理
- collate_fn方法定义了每次从Dataset中取出一批数据的方法，也就是说，在这里对变长的句子进行pad，并保存每个句子的实际长度
- 故如果能将Dataset中的数据按长度升序或降序排列可以进一步减少计算开销（此处未进行）

In [36]:
class NewsDataset(Dataset):
    def __init__(self, train_x, train_y):
        self.train_x = train_x
        self.train_y = train_y
    def __len__(self):
        return len(self.train_y)
    def __getitem__(self, idx):
        idx -= 1
        return self.train_x[idx], self.train_y[idx]
    
def collate_fn(train_data):
    (train_data, train_label) = zip(*train_data)
    data_length = [len(data) for data in train_data]
    train_data = rnn_utils.pad_sequence(train_data, batch_first=True, padding_value=0)
    train_label = torch.Tensor(train_label)
    return train_data, train_label, data_length

In [37]:
train_set, test_set = train_test_split(news_source, test_size=0.2, random_state=7)
news_vecs = covertTextToGolveVec(train_set)
label_vecs = train_set.valid.values

train_dataset = NewsDataset(news_vecs, label_vecs)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

news_gru = NewsGRUModel(input_dim=input_dim, output_dim=output_dim, gru_num_layers=gru_num_layers).to(device)
optimiser = torch.optim.Adam(news_gru.parameters(), lr=lr)
loss_func = torch.nn.MSELoss()

In [38]:
def news_gru_report(pred, label):
    tar_y = label.squeeze()
    pred_y = []
    for item in pred.squeeze():
        if item >= 0.5:
            pred_y.append(1)
        else:
            pred_y.append(0)
    f1 = f1_score(tar_y, pred_y)
    print("f1-score: ", f1)
    acc = accuracy_score(tar_y, pred_y)
    print("accuracy: ", acc)
    cm = confusion_matrix(tar_y, pred_y)
    print("confusion matrix:\n",cm)

### 看来这份数据集中的真假新闻差异确实不小，GRU模型各项指标也是迅速潘升到99.8+%
- 两个模型的训练过程是很草率的，想要一个细致可接受的结果的话还要加入更多训练策略
- 不过用BERT的话，要跑K-Fold实在是太吃力了（指资源）

In [39]:
news_gru = news_gru.to(device)
news_gru.train()
for epoch in range(epoch_num):
    epoch_loss = 0
    pred_lis = torch.Tensor()
    label_lis = torch.Tensor()
    for batch_idx, (data, label, length) in enumerate(train_dataloader):
        input_vec = data.to(device)
        label = label.to(device)
        pred = news_gru(input_vec, length, True)
        loss = loss_func(pred.squeeze(), label.squeeze())
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        epoch_loss += loss.cpu().data.numpy()
        pred_lis = torch.cat([pred_lis, pred.cpu().squeeze()])
        label_lis = torch.cat([label_lis, label.cpu().squeeze()])
    if (epoch + 1) % 10 == 0 or epoch == 0:
        news_gru_report(pred_lis.detach().numpy(), label_lis.detach().numpy())
        print("Current epoch: ", epoch + 1, " Total loss: ", epoch_loss)

f1-score:  0.8509674887553959
accuracy:  0.8403763458243073
confusion matrix:
 [[11897  2065]
 [ 2872 14095]]
Current epoch:  1  Total loss:  32.24637720733881
f1-score:  0.9941675503711558
accuracy:  0.9935982411329173
confusion matrix:
 [[13856   106]
 [   92 16875]]
Current epoch:  10  Total loss:  2.1509476064238697
f1-score:  0.9964060566782537
accuracy:  0.9960554819101813
confusion matrix:
 [[13895    67]
 [   55 16912]]
Current epoch:  20  Total loss:  1.5841168414335698
f1-score:  0.9982031871336418
accuracy:  0.9980277409550907
confusion matrix:
 [[13924    38]
 [   23 16944]]
Current epoch:  30  Total loss:  1.1582776526920497


In [40]:
news_gru.eval()
news_gru.to('cpu')
test_news = covertTextToGolveVec(test_set[:50])
test_labels = test_set.valid.values[:50]
test_length = [len(data) for data in test_news]
test_data = rnn_utils.pad_sequence(test_news, batch_first=True, padding_value=0)
test_pred = news_gru(test_data, test_length, True)
news_gru_report(test_pred, test_labels)

torch.cuda.empty_cache()

f1-score:  1.0
accuracy:  1.0
confusion matrix:
 [[25  0]
 [ 0 25]]
