## Download dataset

In [1]:
# !pip3 install kaggle
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c 'fake-news'
# !mv fake-news.zip ../tmp/
# !unzip ../tmp/fake-news.zip -d ../tmp/fake-news
# !rm ../tmp/fake-news.zip

## Data preprocessing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch as t
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import *

import nltk
import re
from nltk.corpus import stopwords

In [3]:
def optimal_device():
    if t.cuda.is_available():
        return t.device('cuda')
    else:
        try:
            return t.device('mps')
        except:
            return t.device('cpu')
        
# device = optimal_device()
# print(f"Device: {device}")

In [4]:
data_dir = "../tmp/fake-news/"
df = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')

df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
df = df.fillna('')
test = test.fillna('')

df['total'] = df['title']+' '+df['author']
test['total']=test['title']+' '+test['author']

In [6]:
X = df.drop('label',axis=1)
y=df['label']
print(X.shape)
print(y.shape)

(20800, 5)
(20800,)


In [7]:
#Choosing vocabulary size to be 5000 and copying data to msg for further cleaning
voc_size = 5000
msg = X.copy()
msg_test = test.copy()

In [8]:
#Downloading stopwords 
#Stopwords are the words in any language which does not add much meaning to a sentence.
#They can safely be ignored without sacrificing the meaning of the sentence.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sdfedorov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
#We will be using Stemming here
#Stemming map words to their root forms
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [10]:
#Applying stemming and some preprocessing
corpus = []
for i in range(len(msg)):
  review = re.sub('[^a-zA-Z]',' ',msg['total'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [11]:
#Applying stemming and some preprocessing for test data
corpus_test = []
for i in range(len(msg_test)):
  review = re.sub('[^a-zA-Z]',' ',msg_test['total'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus_test.append(review)

In [12]:
from typing import Union, Iterable
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

tokens = [tokenizer(doc) for doc in corpus]
tokens_test = [tokenizer(doc) for doc in corpus_test]
voc = build_vocab_from_iterator(tokens, max_tokens=voc_size, specials=["<unk>"])
voc.set_default_index(voc["<unk>"])

In [13]:
def one_hot_tokens(voc, tokens):
    voc_tokens = [t.tensor(voc(token), dtype=t.int64) for token in tokens]
    return [F.one_hot(t, num_classes = len(voc)) for t in voc_tokens]

In [14]:
one_hot = one_hot_tokens(voc, tokens)
one_hot_test = one_hot_tokens(voc, tokens_test)

In [18]:
from tqdm.auto import tqdm

max_len = 25

def padding_tensor(one_hot_t):
    embedding = []
    for i in tqdm(range(len(one_hot_t))):
        embedding.append(nn.ConstantPad2d((0, 0, max_len - one_hot_t[i].shape[0], 0), 0)(one_hot_t[i]))
    return t.stack(embedding)
    
embedded_docs = padding_tensor(one_hot)
embedded_docs_test = padding_tensor(one_hot_test)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 20800/20800 [00:07<00:00, 2756.19it/s]
100%|██████████| 5200/5200 [00:01<00:00, 3812.74it/s]


## Model description + training

In [28]:
# model = Sequential()
# model.add(Embedding(voc_size,40,input_length=25))
# model.add(Dropout(0.3))
# model.add(LSTM(100))
# model.add(Dropout(0.3))
# model.add(Dense(64,activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(1,activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# print(model.summary())

class FakeNewsClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout_1 = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dense = nn.Linear(hidden_dim, 64)
        self.dropout_3 = nn.Dropout(dropout)
        self.out = nn.Linear(64, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout_1(x)
        x, _ = self.lstm(x)
        x = self.dropout_2(x)
        x = t.relu(self.dense(x))
        x = self.dropout_3(x)
        x = t.sigmoid(self.out(x))
        return x

model = FakeNewsClassifier(voc_size, 40, 100, 2, 0.3)
model

FakeNewsClassifier(
  (embedding): Embedding(5000, 40, padding_idx=0)
  (dropout_1): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(40, 100, num_layers=2, dropout=0.3)
  (dropout_2): Dropout(p=0.3, inplace=False)
  (dense): Linear(in_features=100, out_features=64, bias=True)
  (dropout_3): Dropout(p=0.3, inplace=False)
  (out): Linear(in_features=64, out_features=1, bias=True)
)

In [24]:
class OneHotDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [31]:
# setup
batch_size = 64
train_dataset = OneHotDataset(embedded_docs, y)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

loss = nn.BCELoss()

In [32]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

AttributeError: 'Adam' object has no attribute 'Adam'