<a href="https://colab.research.google.com/github/TalitaAnthonio/COLING/blob/master/BiLSTM_revisions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BiLSTM with Wikihow 

*This notebook is used to run the BiLSTM experiments on the WikiHow data.*

### Read data

In [0]:
from torchtext import data 
from torchtext import datasets
import torch

# Seed PyTorch's random number generator so that repeated runs start from the same state.
# NOTE(review): Python's built-in `random` module is not seeded here; if the batch
# shuffling relies on it, batch order will still vary between runs — confirm.
SEED = 1234

torch.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [74]:
from google.colab import drive 


# Mount Google Drive inside the Colab runtime; the dataset files below live on the drive.
drive.mount('/content/drive')
# Locations of the dev / test / train JSON splits on the mounted drive.
path_to_dev = "/content/drive/My Drive/data/wikihow/dev_set_pytorch.json"
path_to_test = "/content/drive/My Drive/data/wikihow/test_set_pytorch.json"
path_to_train = "/content/drive/My Drive/data/wikihow/train_set_pytorch.json"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# define the fields 

# LABEL: the class label of each example, converted to a float tensor
# (the loss used for training, BCEWithLogitsLoss, is computed against float targets).
LABEL = data.LabelField(dtype=torch.float)
# LINE: the input sentence, tokenized with spaCy and lower-cased. include_lengths=True makes
# every batch expose `line` as a (token ids, sequence lengths) pair, which the model needs
# in order to pack the padded sequences.
LINE = data.Field(tokenize='spacy', lower=True, include_lengths=True)
#CONTEXT = data.Field()

**notes from notebook**


If the value of a JSON field is a string, the Field's tokenization is applied (the default is to split the string on spaces); if the value is a list, no tokenization is applied. It is usually a good idea to store the data already tokenized as a list: this saves time because you don't have to wait for TorchText to tokenize it.

In [0]:
# Map each JSON key to the (attribute name, Field) pair used to process it:
# 'Label' becomes `example.label`, 'Line' becomes `example.line`.
fields = {'Label': ('label', LABEL), 'Line': ('line', LINE)}

In [0]:
# Load the three JSON splits as TabularDatasets, processed according to `fields`.
# NOTE(review): the split paths are absolute, so `path = 'data'` presumably has no effect
# on where the files are read from — confirm against torchtext's `splits` implementation.
train_data, validation_data, test_data = data.TabularDataset.splits(
                            path = 'data',
                            train = path_to_train,
                            validation = path_to_dev, 
                            test = path_to_test,
                            format = 'json',
                            fields = fields
)

In [78]:
# Inspect the first validation example to check that `label` and `line` were populated.
vars(validation_data[0])

{'label': '0',
 'line': ['nothing',
  'is',
  'less',
  'tastier',
  'when',
  'you',
  'see',
  'a',
  'men',
  'with',
  'a',
  'suit',
  'to',
  'large',
  'or',
  'an',
  'outfit',
  'not',
  'in',
  'season',
  ';',
  ')',
  '.']}

## Build Vocabulary

Build vocabulary for ``LINE`` and ``LABEL``.

In [0]:
# Upper bound on the number of distinct training tokens kept in the vocabulary
# (special tokens such as <unk> and <pad> are added on top of this limit).
MAX_VOCAB_SIZE = 20000

# Build the token vocabulary from the training split only, attach 100-d GloVe vectors,
# and draw the vectors of tokens that GloVe does not cover from a normal distribution.
LINE.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

# The label vocabulary is likewise built from the training split.
LABEL.build_vocab(train_data)

In [80]:
# Sanity-check the sizes of the two vocabularies built above.
print("the vocabulary size is {0}".format(len(LINE.vocab)))
print("the number of unique labels is {0}".format(len(LABEL.vocab)))

# Dump the configuration of the LABEL field for inspection.
print(vars(LABEL))

the vocabulary size is 20002
the number of unique labels is 2
{'sequential': False, 'use_vocab': True, 'init_token': None, 'eos_token': None, 'unk_token': None, 'fix_length': None, 'dtype': torch.float32, 'preprocessing': None, 'postprocessing': None, 'lower': False, 'tokenize': <function Field.<lambda> at 0x7f81518c68c8>, 'include_lengths': False, 'batch_first': False, 'pad_token': None, 'pad_first': False, 'truncate_first': False, 'stop_words': None, 'is_target': False, 'vocab': <torchtext.vocab.Vocab object at 0x7f80edb0b160>}


## Make Batch sizes 

In [0]:
BATCH_SIZE = 32 # not 64 

# Use the GPU when one is available; the iterators place every batch on this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split. `sort_key` is the number of tokens in the line, and
# sort_within_batch=True orders each batch by that key; the model relies on this ordering
# when it packs the padded batch with pack_padded_sequence.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, validation_data, test_data), 
    batch_size = BATCH_SIZE, sort_key = lambda x : len(x.line), 
    sort_within_batch = True,
    repeat=False, 
    device = device)

In [82]:
# Number of batches each iterator yields per epoch.
print(len(train_iterator))
print(len(valid_iterator))
print(len(test_iterator))

2760
320
308


In [83]:
# Inspect the training iterator's settings (batch size, device, shuffle / sort flags).
vars(train_iterator)

{'_iterations_this_epoch': 0,
 '_random_state_this_epoch': None,
 '_restored_from_state': False,
 'batch_size': 32,
 'batch_size_fn': None,
 'dataset': <torchtext.data.dataset.TabularDataset at 0x7f80edb0be80>,
 'device': device(type='cuda'),
 'iterations': 0,
 'random_shuffler': <torchtext.data.utils.RandomShuffler at 0x7f80bf585ba8>,
 'repeat': False,
 'shuffle': True,
 'sort': False,
 'sort_key': <function __main__.<lambda>>,
 'sort_within_batch': True,
 'train': True}

## Build the model

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    """(Bi)LSTM sentence classifier that returns raw logits.

    Pipeline: embedding -> dropout -> packed multi-layer LSTM -> final hidden
    state(s) of the top layer -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between stacked
            LSTM layers and to the final hidden representation.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        # padding_idx: the embedding row at pad_idx is initialised to zeros and
        # receives no gradient updates.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # nn.LSTM only applies dropout between stacked layers, and warns when a
        # non-zero value is passed for a single layer, so disable it in that case.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """Compute logits for a padded batch.

        Args:
            text: LongTensor of token ids, shape [sent len, batch size].
            text_lengths: unpadded length of each sequence, sorted in decreasing
                order (tensor or list).

        Returns:
            Tensor of shape [batch size, output dim] holding raw logits.
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        # pack_padded_sequence expects the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        
        #pack sequence so the LSTM skips the padding positions
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        # Only the final hidden state is needed; the per-step outputs are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) 
            #hidden states of the top layer
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the top layer's final hidden state is the last entry
            final_hidden = hidden[-1,:,:]
        
        #final_hidden = [batch size, hid dim * num directions]
            
        return self.fc(self.dropout(final_hidden))

In [0]:
# EMBEDDING_DIM is 100 so that it matches the 100-d GloVe vectors loaded into the vocabulary.
INPUT_DIM = len(LINE.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True

# we use a dropout probability of 0.5
DROPOUT = 0.5
# Vocabulary index of the padding token, passed to the embedding layer as padding_idx.
PAD_IDX = LINE.vocab.stoi[LINE.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

## Use pre-trained embeddings 

In [86]:
# Pre-trained vectors aligned with the LINE vocabulary: one row per vocabulary entry.
pretrained_embeddings = LINE.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([20002, 100])


In [87]:
# Overwrite the model's embedding table in place with the pre-trained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [-0.1353, -0.1401, -0.1164,  ...,  0.3354, -0.2014, -0.5799],
        [-0.0764,  0.3016, -0.0953,  ...,  0.1170, -0.6364,  0.6872],
        [ 0.1417,  0.2383,  0.5077,  ..., -0.1551, -0.1413, -0.5351]])

In [88]:
UNK_IDX = LINE.vocab.stoi[LINE.unk_token]

# Reset the rows of the unknown and padding tokens to all zeros, replacing the values
# copied in from the vocabulary vectors.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [-0.1353, -0.1401, -0.1164,  ...,  0.3354, -0.2014, -0.5799],
        [-0.0764,  0.3016, -0.0953,  ...,  0.1170, -0.6364,  0.6872],
        [ 0.1417,  0.2383,  0.5077,  ..., -0.1551, -0.1413, -0.5351]])


## Train the model 

In [0]:
import torch.optim as optim

# No learning rate is passed, so Adam runs with its default.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy computed directly on raw logits: the model returns un-squashed
# scores and the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 correct out of 10 gives 0.8, not 8).

    `preds` holds raw logits; they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the labels `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean batch loss, mean batch accuracy).

    The model is put in training mode and its parameters are updated once per batch.
    """
    model.train()

    total_loss, total_acc = 0, 0

    for batch in iterator:
        # include_lengths=True on the field makes batch.line a pair:
        # the numericalized tensor and the true length of every sequence.
        tokens, lengths = batch.line
        targets = batch.label

        optimizer.zero_grad()

        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch of `iterator` without updating it.

    The model is switched to evaluation mode (disabling dropout) and gradients
    are not tracked.

    Returns:
        Tuple (mean batch loss, mean batch accuracy) over the iterator.
    """
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            # batch.line is a (token ids, sequence lengths) pair
            text, text_lengths = batch.line
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            # No per-batch classification report is built here: its result was never
            # used, and the helper that produces it is only defined in a later cell.

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
# Number of full passes over the training data.
N_EPOCHS = 10


for epoch in range(N_EPOCHS):
    
    # One optimisation pass over the training set, then a gradient-free pass over the validation set.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    
    # Report the per-epoch averages; accuracies are shown as percentages.
    print("Epoch {0}".format(epoch+1))
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [0]:
# Evaluate the trained model on the test split and report its average loss and accuracy.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [0]:
# this can later be used if I want 
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix 

def get_evaluation_scores(preds, y):
    """Build a scikit-learn classification report for one batch.

    Args:
        preds: raw logits, one per example; they are passed through a sigmoid
            and rounded to obtain 0/1 predictions.
        y: gold labels, one per example.

    Returns:
        Whatever `classification_report(true labels, predicted labels)` returns.
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    # convert the tensors to plain Python numbers before handing them to scikit-learn
    Ypredict = [ypred.item() for ypred in rounded_preds]
    Ytrue = [ytrue.item() for ytrue in y]
    return classification_report(Ytrue, Ypredict)