## Download dataset

In [153]:
!pip3 install kaggle
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c 'fake-news'
!mv fake-news.zip ../tmp/
!unzip ../tmp/fake-news.zip -d ../tmp/fake-news
!rm ../tmp/fake-news.zip


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Downloading fake-news.zip to /Users/sdfedorov/Documents/ITMO/Master/2 semester/Big_Data_Infrastructure_ITMO_2023/Lab1/notebooks
 97%|████████████████████████████████████▊ | 45.0M/46.5M [00:06<00:00, 10.6MB/s]
100%|██████████████████████████████████████| 46.5M/46.5M [00:06<00:00, 7.84MB/s]
Archive:  ../tmp/fake-news.zip
  inflating: ../tmp/fake-news/submit.csv  
  inflating: ../tmp/fake-news/test.csv  
  inflating: ../tmp/fake-news/train.csv  


## Data preprocessing

In [7]:
import pandas as pd

import torch as t
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import nltk
import re
from nltk.corpus import stopwords

In [8]:
def optimal_device():
    if t.cuda.is_available():
        return t.device('cuda')
    else:
        try:
            return t.device('mps')
        except:
            return t.device('cpu')

In [9]:
data_dir = "../tmp/fake-news/"
df = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')

df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [12]:
df = df.fillna('')
test = test.fillna('')

df['total'] = df['title']+' '+df['author']
test['total']=test['title']+' '+test['author']

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.drop('label', axis=1), df['label'], test_size=0.2, random_state=42)

In [19]:
#Choosing vocabulary size to be 5000 and copying data to msg for further cleaning
voc_size = 5000
X_train = X_train.copy()
X_val = X_val.copy()
X_test = test.copy()

In [20]:
#Downloading stopwords 
#Stopwords are the words in any language which does not add much meaning to a sentence.
#They can safely be ignored without sacrificing the meaning of the sentence.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sdfedorov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
#We will be using Stemming here
#Stemming map words to their root forms
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [148]:
import time

def measure_time(callable, name):
    def wrapper():
        start = time.time()
        result = callable()
        end = time.time()
        print(f"Time elapsed ({name}): {end - start}")
        return result

    return wrapper

def construct_corpus(data):
    corpus = []
    for i in tqdm(range(len(data))):
        review = re.sub('[^a-zA-Z]',' ', data['total'].iloc[i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [149]:
from typing import Union, Iterable
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

corpus = construct_corpus(X_train)
corpus_test = construct_corpus(X_test)
corpus_val = construct_corpus(X_val)

tokens = [tokenizer(doc) for doc in corpus]
tokens_val = [tokenizer(doc) for doc in corpus_val]
tokens_test = [tokenizer(doc) for doc in corpus_test]
voc = build_vocab_from_iterator(tokens + tokens_val, max_tokens=voc_size, specials=["<unk>"])
voc.set_default_index(voc["<unk>"])

voc_tokens = [t.tensor(voc(token), dtype=t.int64) for token in tokens]
voc_tokens_val = [t.tensor(voc(token), dtype=t.int64) for token in tokens_val]
voc_tokens_test = [t.tensor(voc(token), dtype=t.int64) for token in tokens_test]

  0%|          | 0/16640 [00:00<?, ?it/s]

Review len 18
Review len 20
Review len 18
Review len 20
Review len 15
Review len 16
Review len 7
Review len 6
Review len 12
Review len 19
Review len 9
Review len 16
Review len 15
Review len 0
Review len 11
Review len 12
Review len 15
Review len 17
Review len 8
Review len 17
Review len 11
Review len 14
Review len 9
Review len 11
Review len 0
Review len 15
Review len 14
Review len 23
Review len 15
Review len 17
Review len 11
Review len 19
Review len 20
Review len 21
Review len 15
Review len 15
Review len 20
Review len 8
Review len 21
Review len 9
Review len 11
Review len 17
Review len 15
Review len 12
Review len 10
Review len 16
Review len 12
Review len 14
Review len 13
Review len 16
Review len 13
Review len 15
Review len 16
Review len 16
Review len 25
Review len 21
Review len 9
Review len 19
Review len 16
Review len 17
Review len 15
Review len 17
Review len 20
Review len 17
Review len 11
Review len 16
Review len 23
Review len 9
Review len 19
Review len 18
Review len 21
Review len 23
Rev

KeyboardInterrupt: 

In [None]:
# def one_hot_tokens(voc, tokens):
#     voc_tokens = [t.tensor(voc(token), dtype=t.int64) for token in tokens]
#     return [F.one_hot(t, num_classes = len(voc)) for t in voc_tokens]
# one_hot = one_hot_tokens(voc, tokens)
# one_hot_test = one_hot_tokens(voc, tokens_test)

In [134]:
from tqdm.auto import tqdm

max_len = 25
#
# def padding_tensor(one_hot_t):
#     embedding = []
#     for i in tqdm(range(len(one_hot_t))):
#         embedding.append(nn.ConstantPad2d((0, 0, max_len - one_hot_t[i].shape[0], 0), 0)(one_hot_t[i]))
#     return t.stack(embedding)

def padding_indexes(tokens):
    embedding = []
    for token in tqdm(tokens):
        embedding.append(nn.ConstantPad1d((max_len - len(token), 0), 0)(t.tensor(token, dtype=t.int64)))
    return t.stack(embedding)
    
# embedded_docs = padding_tensor(one_hot)
# embedded_docs_test = padding_tensor(one_hot_test)

padded_tokens = padding_indexes(voc_tokens)
padded_tokens_val = padding_indexes(voc_tokens_val)
padded_tokens_test = padding_indexes(voc_tokens_test)

  0%|          | 0/16640 [00:00<?, ?it/s]

  embedding.append(nn.ConstantPad1d((max_len - len(token), 0), 0)(t.tensor(token, dtype=t.int64)))


  0%|          | 0/4160 [00:00<?, ?it/s]

  0%|          | 0/5200 [00:00<?, ?it/s]

## Model description + training

In [50]:
class FakeNewsClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)#, padding_idx=0)
        self.dropout_1 = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.dropout_2 = nn.Dropout(dropout)
        self.dense = nn.Linear(hidden_dim, 64)
        self.dropout_3 = nn.Dropout(dropout)
        self.out = nn.Linear(64, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout_1(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.dropout_2(x)
        x = t.relu(self.dense(x))
        x = self.dropout_3(x)
        x = t.sigmoid(self.out(x))
        return x

In [34]:
class TokensDataset(Dataset):
    def __init__(self, X, y, device):
        self.X = X.to(device)
        self.y = t.tensor(y).float().to(device)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [35]:
device = optimal_device()
print(f"Device: {device}")

Device: mps


In [86]:
# setup
batch_size = 64
train_dataset = TokensDataset(padded_tokens, y_train.values, device)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TokensDataset(padded_tokens_val, y_val.values, device)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

loss = nn.BCELoss()
model = FakeNewsClassifier(voc_size, 40, 100, 2, 0.3).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [87]:
def accuracy(model, test_dataloader):
    model.eval()
    with t.no_grad():
        sum_acc = 0
        for batch in test_dataloader:
            X, y = batch
            y_pred = model(X)
            y_pred = y_pred.squeeze()
            sum_acc += t.sum((y_pred > 0.5) == y)
        return sum_acc / (len(test_dataloader) * test_dataloader.batch_size)

def train(model, dataloader, test_dataloader, loss, optimizer, epochs):
    print(f"Staring acc_train: {accuracy(model, dataloader)} acc_val: {accuracy(model, test_dataloader)}")

    for epoch in range(epochs):
        model.train()
        l = 0
        for batch in dataloader:
            X, y = batch
            optimizer.zero_grad()
            y_pred = model(X)
            y_pred = y_pred.squeeze()
            l = loss(y_pred, y)
            l.backward()
            optimizer.step()

        train_acc = accuracy(model, dataloader)
        val_acc = accuracy(model, test_dataloader)

        print(f"Epoch {epoch+1} loss: {l.item()} acc_train: {train_acc} acc_val: {val_acc}")

In [88]:
train(model, train_dataloader, val_dataloader, loss, optimizer, 5)

Staring acc_train: 0.50390625 acc_val: 0.48750001192092896
Epoch 1 loss: 0.19220766425132751 acc_train: 0.9737379550933838 acc_val: 0.96875
Epoch 2 loss: 0.0719328299164772 acc_train: 0.9912259578704834 acc_val: 0.9872596263885498
Epoch 3 loss: 0.006107015535235405 acc_train: 0.9958533644676208 acc_val: 0.9908654093742371
Epoch 4 loss: 0.009487172588706017 acc_train: 0.9964542984962463 acc_val: 0.9911057949066162
Epoch 5 loss: 0.008560552261769772 acc_train: 0.997776448726654 acc_val: 0.9923076629638672


In [106]:
X_val.iloc[123], y_val.iloc[123]

(id                                                    11547
 title     Aussie Muslims Demand ’Safe Spaces’ so Followe...
 author                                           Simon Kent
 text      An Australian Islamic group wants   “safe spac...
 total     Aussie Muslims Demand ’Safe Spaces’ so Followe...
 Name: 11547, dtype: object,
 0)

In [108]:
model.eval()
model(padded_tokens_val[123].unsqueeze(0).to(device))

tensor([[0.0062]], device='mps:0', grad_fn=<SigmoidBackward0>)

In [131]:
model.eval()

# save model

model_path = "../tmp/model.pt"
t.save(model.state_dict(), model_path)

In [154]:
padded_tokens_val[123].unsqueeze(0).shape

torch.Size([1, 25])

In [None]:
model(padded_tokens_test[])