Submitted by Tarang Ranpara (202011057)

In [None]:
import os
import spacy
import logging
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# spaCy's small English pipeline; the Vocab class below uses only its tokenizer.
spacy_eng = spacy.load("en_core_web_sm")

# Timestamped INFO-level logging so long-running steps report progress.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Exploring the data

In [None]:
# Folder that contains train.csv.
# NOTE(review): looks like a mounted Google Drive path; no mount cell is visible here - confirm.
train_path = "drive/MyDrive/NLP_A5/"

In [None]:
# No earlier cell defines `data`, so load the raw CSV here to keep the notebook
# runnable from a fresh kernel. The read mirrors DatasetLoader: python engine,
# malformed rows skipped.
csv_path = os.path.join(train_path, "train.csv")
try:
    raw_data = pd.read_csv(csv_path, engine="python", on_bad_lines="skip")
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; use the legacy flags instead.
    raw_data = pd.read_csv(
        csv_path,
        engine="python",
        error_bad_lines=False,
        warn_bad_lines=False,
    )

# Keep only rows whose label is exactly 0 or 1. Comparing on the string form
# makes the filter work whether the column was parsed as text or as integers.
data = raw_data[raw_data["label"].astype(str).isin(['0', '1'])]
print("Cleaned Dataset shape:", data.shape)
data.head()

Cleaned Dataset shape: (20798, 5)


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Preview the first five rows of the cleaned frame.
data.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Split the frame into feature columns and the single target column.
feature_columns = ['id', 'title', 'author', 'text']
X = data.loc[:, feature_columns]
y = data.loc[:, ['label']]

In [None]:
# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

# Building the vocab

In [None]:
class Vocab:
    """Two-way mapping between lower-cased tokens and integer ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``, ``<EOS>``
    and ``<UNK>``. Every other token receives the next free id the first time
    it is seen by :meth:`build_vocab`.
    """

    def __init__(self):
        # itos: id -> token, stoi: token -> id. The two are kept as exact inverses.
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {t: i for i, t in self.itos.items()}

    def __len__(self):
        """Return the number of distinct entries (special tokens included)."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` with the spaCy tokenizer and lower-case each token."""
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocab(self, sentences):
        """Add every unseen token found in ``sentences`` to the vocabulary.

        Args:
            sentences: iterable of raw strings.

        A token is registered only once. Previously each occurrence received a
        new id, which left stale duplicates in ``itos`` and made ``len(self)``
        equal to the total token count instead of the number of unique tokens.
        The method may be called repeatedly; new ids continue after the
        current highest id.
        """
        logging.info("Building vocab")
        for sent in sentences:
            for word in self.tokenizer_eng(sent):
                if word in self.stoi:
                    continue
                # Next free id; keeps ids dense and itos/stoi in sync.
                idx = len(self.itos)
                self.stoi[word] = idx
                self.itos[idx] = word

        logging.info("Vocab built.")

    def vectorize(self, text):
        """Convert ``text`` to a list of ids, mapping unknown tokens to ``<UNK>``."""
        unk_idx = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_idx) for token in self.tokenizer_eng(text)]

# Dataset Loader

In [None]:
class DatasetLoader(Dataset):
    """Torch dataset over the ``title`` / ``label`` columns of ``train.csv``.

    The CSV is read from ``root_dir``, cleaned, and split into a train and a
    held-out part with a fixed random state. ``subset`` selects which part
    ``__len__`` and ``__getitem__`` expose. The vocabulary is always built from
    the training titles only.

    Args:
        root_dir: folder containing ``train.csv``.
        subset: ``"train"`` serves the training part; any other value serves
            the held-out part.
        transform: optional callable applied to the raw title string.
        test_size: fraction of rows placed in the held-out part.
    """

    def __init__(self, root_dir, subset="train", transform=None, test_size=0.1):
        self.root_dir = root_dir
        self.transform = transform
        self.test_size = test_size
        self.subset = subset

        # Loading dataset, silently skipping malformed rows.
        csv_path = os.path.join(root_dir, "train.csv")
        try:
            self.df = pd.read_csv(csv_path, on_bad_lines="skip", engine="python")
        except TypeError:
            # pandas < 1.3 has no `on_bad_lines`; the legacy flags below were
            # removed in pandas 2.0, hence the two-step attempt.
            self.df = pd.read_csv(
                csv_path,
                error_bad_lines=False,
                warn_bad_lines=False,
                engine="python")

        # Cleaning dataset
        self.__clean_data()

        # Splitting the data
        self.train_data, self.test_data = self.__train_test_split()

        # Get texts and labels
        self.texts = self.train_data["title"].values
        self.labels = self.train_data["label"].values.astype(np.int64)

        self.test_texts = self.test_data["title"].values
        self.test_labels = self.test_data["label"].values.astype(np.int64)

        self.classes = ["Non Fake", "Fake"]

        # Initialize and build vocabulary (training titles only)
        self.vocab = Vocab()
        self.vocab.build_vocab(self.texts.tolist())

    def __len__(self):
        """Number of samples in the selected subset."""
        return len(self.texts) if self.subset == "train" else len(self.test_texts)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for the sample at ``index``.

        ``token_ids`` is a Python list: ``<SOS>``, the vectorized title, ``<EOS>``.
        """
        if self.subset == "train":
            text = self.texts[index]
            label = self.labels[index]
        else:
            text = self.test_texts[index]
            label = self.test_labels[index]

        if self.transform is not None:
            text = self.transform(text)

        vectorized_text = [self.vocab.stoi["<SOS>"]]
        vectorized_text += self.vocab.vectorize(text)
        vectorized_text.append(self.vocab.stoi["<EOS>"])

        return vectorized_text, label

    def __clean_data(self):
        """Drop rows with missing values and rows whose label is not 0 or 1."""
        self.df = self.df.dropna()
        # Compare on the string form so the filter keeps valid rows whether the
        # label column was parsed as text ('0'/'1') or as integers (0/1).
        valid_label = self.df["label"].astype(str).isin(['0', '1'])
        self.df = self.df[valid_label]

    def __train_test_split(self):
        """Split the cleaned frame; the fixed seed makes every instance agree."""
        return train_test_split(self.df, test_size=self.test_size, random_state=42)

# Pad the sequences

In [None]:
class PadTextSequence:
    """Collate function that right-pads token-id lists to a common length.

    Args:
        pad_idx: id written into the padded positions.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Turn a list of ``(token_ids, label)`` pairs into two tensors.

        Returns:
            texts: int64 tensor of shape (batch, longest_sequence).
            labels: int64 tensor of shape (batch,).
        """
        texts = [torch.tensor(item[0], dtype=torch.int64) for item in batch]
        texts = pad_sequence(texts, batch_first=True, padding_value=self.pad_idx)

        # Build the label tensor directly as int64. Going through torch.Tensor
        # (float32) first would silently round integers above 2**24.
        labels = torch.tensor([int(item[1]) for item in batch], dtype=torch.int64)

        return texts, labels

# Get train/test loaders

In [None]:
def get_train_test_loader(
        root_fldr,
        transform=None,
        batch_size=32,
        shuffle=True,
        test_split=0.1
):
    """Build the train and test DataLoaders for the CSV under ``root_fldr``.

    Two DatasetLoader instances are created, one per subset, and both are
    batched with a PadTextSequence collate function that uses the training
    vocabulary's ``<PAD>`` id.

    Returns:
        (train_loader, test_loader)
    """
    datasets = {
        subset: DatasetLoader(root_fldr, subset=subset, transform=transform, test_size=test_split)
        for subset in ("train", "test")
    }
    pad_idx = datasets["train"].vocab.stoi["<PAD>"]

    def _make_loader(subset):
        # Each loader gets its own collate instance, same settings otherwise.
        return DataLoader(
            dataset=datasets[subset],
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=PadTextSequence(pad_idx=pad_idx)
        )

    return _make_loader("train"), _make_loader("test")

In [None]:
# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Setting hyperparameters

In [None]:
# Set up hyperparameters
seed = 42               # same value as the random_state used for the data split
num_layers = 1          # stacked LSTM layers
hidden_nodes = 256      # LSTM hidden-state size
embedding_dim = 300     # token embedding size
learning_rate = 0.0001  # Adam step size
batch_size = 64
num_epochs = 15

# Seed the RNGs before any loader or model is created so that batch shuffling
# and weight initialisation are repeatable across fresh-kernel runs.
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Build both loaders from the CSV folder; the embedding table is sized from
# the vocabulary attached to the training dataset.
train_loader, test_loader = get_train_test_loader(
    root_fldr=train_path,
    batch_size=batch_size,
    shuffle=True,
)
vocab_size = len(train_loader.dataset.vocab)

2021-09-15 16:39:37,732 : INFO : NumExpr defaulting to 2 threads.
2021-09-15 16:39:37,755 : INFO : Building vocab
2021-09-15 16:39:40,346 : INFO : Vocab built.
2021-09-15 16:39:42,170 : INFO : Building vocab
2021-09-15 16:39:44,115 : INFO : Vocab built.


# LSTM Model

In [None]:
class LSTM_model(nn.Module):
    """Embedding -> LSTM -> linear head producing two class logits.

    Args:
        inp_size: number of rows in the embedding table (vocabulary size).
        hidden_nodes: LSTM hidden-state size.
        num_layers: number of stacked LSTM layers.
        embedding_dim: size of each token embedding.
    """

    def __init__(
        self,
        inp_size,
        hidden_nodes=64,
        num_layers=1,
        embedding_dim=100
    ):
        super(LSTM_model, self).__init__()
        self.hidden_nodes = hidden_nodes
        self.num_layers = num_layers

        self.embedding = nn.Embedding(inp_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_nodes, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_nodes, 2)

    def forward(self, x):
        """Return raw class logits of shape (batch, 2) for a batch of token ids.

        ``x`` is indexed as (batch, time) because the LSTM is ``batch_first``.
        """
        x = self.embedding(x)

        # Let the LSTM use its default zero initial state. Random h0/c0 made
        # evaluation non-deterministic and tied the module to a global `device`.
        x, _ = self.lstm(x)

        # Consider only the output at the last time step.
        # NOTE(review): batches are right-padded by the collate function, so for
        # shorter sequences this step is a padding position - consider packing.
        #
        # Return logits, not probabilities: nn.CrossEntropyLoss applies
        # log-softmax itself, and the previous softmax over dim=0 normalised
        # across the batch rather than across the two classes.
        return self.fc(x[:, -1, :])

In [None]:
# Instantiate the classifier with the notebook's hyperparameters and move it
# to the selected device.
model = LSTM_model(
    inp_size=vocab_size,
    hidden_nodes=hidden_nodes,
    num_layers=num_layers,
    embedding_dim=embedding_dim,
).to(device)

In [None]:
# Adam over all model parameters, with a cross-entropy objective for the
# two-class output.
optimizer = Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
def get_accuracy(loader, model):
    """Evaluate ``model`` on ``loader`` and print mean loss and accuracy.

    Uses the notebook-level ``criterion`` and ``device``.

    Args:
        loader: iterable yielding ``(inputs, labels)`` tensor batches.
        model: module mapping inputs to per-class scores of shape (batch, classes).

    Returns:
        Accuracy in percent as a 0-d tensor (NaN when the loader is empty).
    """
    num_correct = torch.tensor(0, device=device)
    num_samples = 0
    total_loss = 0.0

    # Remember the current mode so evaluation does not flip a model that the
    # caller had deliberately put in eval mode.
    was_training = model.training
    model.eval()  # Switch model to evaluation mode

    with torch.no_grad():  # We don't need to compute gradients here
        for x, y in tqdm(loader, ascii="123456789=", desc="Evaluating:"):
            x = x.to(device=device)
            y = y.to(device=device)

            scores = model(x)
            batch_count = y.size(0)
            # `criterion` (mean reduction) yields a per-batch mean; weight it by
            # the batch size so the final figure is the mean over all samples
            # rather than just the last batch's loss.
            total_loss += criterion(scores, y).item() * batch_count
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += batch_count

    # Guard the division so an empty loader reports NaN instead of crashing.
    mean_loss = total_loss / num_samples if num_samples else float("nan")
    accuracy = (num_correct / num_samples) * 100
    print(f"Loss: {mean_loss}, Accuracy: {num_correct} / {num_samples} = {accuracy: .3f}%")

    # Restore whichever mode the model was in before evaluation.
    model.train(was_training)
    return accuracy

In [None]:
def train_loop():
    """Train the notebook-level ``model`` for ``num_epochs`` passes over ``train_loader``.

    Each batch is moved to ``device``, scored, and used for one optimizer step.
    The tqdm bar shows the current batch loss and the running accuracy of the
    epoch so far.
    """
    print("Training begins..")
    for epoch in range(num_epochs):
        correct_so_far = 0
        seen_so_far = 0

        progress = tqdm(enumerate(train_loader), total=len(train_loader), ascii=" 123456789=")
        for _, (inputs, labels) in progress:
            inputs = inputs.to(device=device)
            labels = labels.to(device=device)

            # Forward pass and loss
            scores = model(inputs)
            loss = criterion(scores, labels)

            # Clear stale gradients, backpropagate, then update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running tally of correct predictions within this epoch
            predictions = scores.max(1).indices
            correct_so_far += (predictions == labels).sum()
            seen_so_far += predictions.size(0)

            # Refresh the progress bar
            running_accuracy = correct_so_far / seen_so_far
            progress.set_description(f"=> Epoch {epoch + 1}/{num_epochs}")
            progress.set_postfix(loss=loss.item(), accuracy=running_accuracy.item())

In [None]:
# Run the full training schedule: num_epochs passes over train_loader.
train_loop()

Training begins..




In [None]:
# Score the trained model on the held-out loader; also prints loss and accuracy.
test_accuracy = get_accuracy(loader=test_loader, model=model)



Loss: 0.6668935418128967, Accuracy: 1640 / 1829 =  89.666%



