<a href="https://colab.research.google.com/github/Nub-T/Noob_Computation/blob/main/SequenceClassification_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<b>SEQUENCE CLASSIFICATION</b><br>
--
Coded by :
* <a href="https://github.com/Nub-T">Nub-T</a>
* <a href="https://github.com/Andi-Nov">Andi-Nov</a>

In [1]:
!wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz

--2020-12-09 21:32:41--  http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-12-09 21:32:42 (58.5 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
!tar -xzf aclImdb_v1.tar.gz

In [3]:
# Now import the library!
import torch
import math
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import optim
import os
from collections import namedtuple

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f2dafd7cba0>

In [4]:
train_path = "aclImdb/train/" 
test_path = "aclImdb/test/"

In [5]:
batch_size = 100
max_len = 300
embedding_size = 300
min_count = 2
device = torch.device('cuda')

In [6]:
Sentence = namedtuple('Sentence', ['index', 'tokens', 'label'])

In [7]:
def read_imdb_movie_dataset(dataset_path):
    indices = []
    text = []
    rating = []
    i = 0

    for filename in os.listdir(os.path.join(dataset_path, "pos")):
        file_path = os.path.join(dataset_path, "pos", filename)
        data = open(file_path, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(1)
        i = i + 1

    for filename in os.listdir(os.path.join(dataset_path, "neg")):
        file_path = os.path.join(dataset_path, "neg", filename)
        data = open(file_path, 'r', encoding="ISO-8859-1").read()
        indices.append(i)
        text.append(data)
        rating.append(0)
        i = i + 1

    sentences = [ Sentence(index, text.split(), rating)
                  for index, text, rating in zip(indices, text, rating)]

    return sentences

In [8]:
train_examples = read_imdb_movie_dataset(train_path)
test_examples = read_imdb_movie_dataset(test_path)

In [9]:
len(train_examples)

25000

In [10]:
len(test_examples)

25000

In [11]:
UNK = '<UNK>'
PAD = '<PAD>'
BOS = '<BOS>'
EOS = '<EOS>'

In [12]:
class VocabItem:
    def __init__(self, string, hash=None):
        self.string = string
        self.count = 0
        self.hash = hash

    def __str__(self):
        return 'VocabItem({})'.format(self.string)

    def __repr__(self):
        return self.__str__()

In [13]:
class Vocab:
    def __init__(
        self,
        min_count=0,
        no_unk=False,
        add_padding=False,
        add_bos=False,
        add_eos=False,
        unk=None):
      
        self.no_unk = no_unk
        self.vocab_items = []
        self.vocab_hash = {}
        self.word_count = 0
        self.special_tokens = []
        self.min_count = min_count
        self.add_padding = add_padding
        self.add_bos = add_bos
        self.add_eos = add_eos
        self.unk = unk

        self.UNK = None
        self.PAD = None
        self.BOS = None
        self.EOS = None

        self.index2token = []
        self.token2index = {}

        self.finished = False

    def add_tokens(self, tokens):
        if self.finished:
            raise RuntimeError('Vocabulary is finished')

        for token in tokens:
            if token not in self.vocab_hash:
                self.vocab_hash[token] = len(self.vocab_items)
                self.vocab_items.append(VocabItem(token))

            self.vocab_items[self.vocab_hash[token]].count += 1
            self.word_count += 1

    def finish(self):

        token2index = self.token2index
        index2token = self.index2token

        tmp = []

        if not self.no_unk:
            if self.unk:
                self.UNK = VocabItem(self.unk, hash=0)
                self.UNK.count = self.vocab_items[self.vocab_hash[self.unk]].count
                index2token.append(self.UNK)
                self.special_tokens.append(self.UNK)

                for token in self.vocab_items:
                    if token.string != self.unk:
                        tmp.append(token)

            else:
                self.UNK = VocabItem(UNK, hash=0)
                index2token.append(self.UNK)
                self.special_tokens.append(self.UNK)

                for token in self.vocab_items:
                    if token.count <= self.min_count:
                        self.UNK.count += token.count
                    else:
                        tmp.append(token)
        else:
            for token in self.vocab_items:
                tmp.append(token)
                
        tmp.sort(key=lambda token: token.count, reverse=True)

        if self.add_bos:
            self.BOS = VocabItem(BOS)
            tmp.append(self.BOS)
            self.special_tokens.append(self.BOS)

        if self.add_eos:
            self.EOS = VocabItem(EOS)
            tmp.append(self.EOS)
            self.special_tokens.append(self.EOS)

        if self.add_padding:
            self.PAD = VocabItem(PAD)
            tmp.append(self.PAD)
            self.special_tokens.append(self.PAD)

        index2token += tmp
        for i, token in enumerate(self.index2token):
            token2index[token.string] = i
            token.hash = i

        self.index2token = index2token
        self.token2index = token2index

        if not self.no_unk:
            print('Unknown vocab size:', self.UNK.count)

        print('Vocab size: %d' % len(self))

        self.finished = True

    def __getitem__(self, i):
        return self.index2token[i]

    def __len__(self):
        return len(self.index2token)

    def __iter__(self):
        return iter(self.index2token)

    def __contains__(self, key):
        return key in self.token2index

    def tokens2indices(self, tokens, add_bos=False, add_eos=False):
        string_seq = []
        if add_bos:
            string_seq.append(self.BOS.hash)
        for token in tokens:
            if self.no_unk:
                string_seq.append(self.token2index[token])
            else:
                string_seq.append(self.token2index.get(token, self.UNK.hash))
        if add_eos:
            string_seq.append(self.EOS.hash)
        return string_seq

    def indices2tokens(self, indices, ignore_ids=()):
        tokens = []
        for idx in indices:
            if idx in ignore_ids:
                continue
            tokens.append(self.index2token[idx].string)

        return tokens

In [14]:
# We will use one vocabulary for the input data (the sentences), and another vocabulary object for the output data, the class labels. In this way our code is generic and should work out-of-the-box for any number of output labels.
src_vocab = Vocab(min_count=min_count, add_padding=True)
tgt_vocab = Vocab(no_unk=True, add_padding=False)

In [15]:
for sentence in train_examples:
    src_vocab.add_tokens(sentence.tokens[:max_len])
    tgt_vocab.add_tokens([sentence.label])

src_vocab.finish()
tgt_vocab.finish()

Unknown vocab size: 211949
Vocab size: 65055
Vocab size: 2


In [16]:
src_vocab.tokens2indices('the movie was bad'.split())

[1, 19, 13, 97]

In [17]:
Vocabs = namedtuple('Vocabs', ['src', 'tgt'])
vocabs = Vocabs(src_vocab, tgt_vocab)

In [18]:
embeddings = nn.Embedding(
    len(src_vocab),
    embedding_size,
    padding_idx=src_vocab.PAD.hash
)

In [19]:
embeddings.weight.size()

torch.Size([65055, 300])

In [20]:
# We will combine our Batch object with a BatchTuple object that will hold data relevant to a specific input of the model.

class Batch(dict):
    def __init__(self, *args, **kwargs):
        super(Batch, self).__init__(*args, **kwargs)
        self.__dict__ = self
        self._is_torch = False

    def to_torch_(self, device):
        self._is_torch = False
        for key in self.keys():
            value = self[key]
            
            if isinstance(value, BatchTuple):
                value.to_torch_(device)
                
            if isinstance(value, np.ndarray):
                self[key] = torch.from_numpy(value).to(device)


class BatchTuple(object):
    def __init__(self, sequences, lengths, sublengths, masks):
        self.sequences = sequences
        self.lengths = lengths
        self.sublengths = sublengths
        self.masks = masks
        self._is_torch = False

    def to_torch_(self, device):
        if not self._is_torch:
            self.sequences = torch.tensor(
                self.sequences, device=device, dtype=torch.long
            )

            if self.lengths is not None:
                self.lengths = torch.tensor(
                    self.lengths, device=device, dtype=torch.long
                )

            if self.sublengths is not None:
                self.sublengths = torch.tensor(
                    self.sublengths, device=device, dtype=torch.long
                )
            if self.masks is not None:
                self.masks = torch.tensor(
                    self.masks, device=device, dtype=torch.float
                )

In [21]:
def pad_list(
    sequences,
    dim0_pad=None,
    dim1_pad=None,
    align_right=False,
    pad_value=0
):
    sequences = [np.asarray(sublist) for sublist in sequences]
    if not dim0_pad:
        dim0_pad = len(sequences)

    if not dim1_pad:
        dim1_pad = max(len(seq) for seq in sequences)

    out = np.full(shape=(dim0_pad, dim1_pad), fill_value=pad_value)
    lengths = []

    for i in range(len(sequences)):
        data_length = len(sequences[i])
        lengths.append(data_length)
        offset = dim1_pad - data_length if align_right else 0
        np.put(out[i], range(offset, offset + data_length), sequences[i])

    lengths = np.array(lengths)
    return out, lengths

In [22]:
class SequenceClassificationBatchBuilder(object):
    def __init__(self, vocabs, max_len=None):
        self.vocabs = vocabs
        self.max_len = max_len
        
    def __call__(self, examples):
        ids_batch = [int(sentence.index) for sentence in examples]

        src_examples = [
            self.vocabs.src.tokens2indices(sentence.tokens[: self.max_len])
            for sentence in examples
        ]

        tgt_examples = [
            self.vocabs.tgt.token2index[sentence.label] for sentence in examples
        ]

        src_padded, src_lengths = pad_list(
            src_examples, pad_value=self.vocabs.src.PAD.hash
        )

        src_batch_tuple = BatchTuple(src_padded, src_lengths, None, None)
        tgt_batch_tuple = BatchTuple(tgt_examples, None, None, None)

        return Batch(
            indices=ids_batch, src=src_batch_tuple, tgt=tgt_batch_tuple
        )

In [23]:
# Let's instance our batch_builder, feed it into the DataLoader object alongside the training and test examples, and let's inspect a single batch of examples.

batch_builder = SequenceClassificationBatchBuilder(
    vocabs, max_len=max_len
)

train_batches = DataLoader(
    train_examples,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=batch_builder,
)

test_batches = DataLoader(
    test_examples,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    collate_fn=batch_builder,
)

In [24]:
train_batches_iter = iter(train_batches)
train_batch = next(train_batches_iter)
train_batch.src.sequences

array([[  753,  7271,     6, ..., 65054, 65054, 65054],
       [ 8489,    25,  1171, ..., 65054, 65054, 65054],
       [  399,   283,  1491, ..., 65054, 65054, 65054],
       ...,
       [    0, 41565,     0, ..., 65054, 65054, 65054],
       [45298,     0,     3, ..., 65054, 65054, 65054],
       [   17,   128,     6, ..., 65054, 65054, 65054]])

In [25]:
def mean_pooling(batch_hidden_states, batch_lengths):
    batch_lengths = batch_lengths.float()
    batch_lengths = batch_lengths.unsqueeze(1)
    if batch_hidden_states.is_cuda:
        batch_lengths = batch_lengths.cuda()

    pooled_batch = torch.sum(batch_hidden_states, 1)
    pooled_batch = pooled_batch / batch_lengths.expand_as(pooled_batch)

    return pooled_batch

def max_pooling(batch_hidden_states):
    pooled_batch, _ = torch.max(batch_hidden_states, 1)
    return pooled_batch

def pack_rnn_input(embedded_sequence_batch, sequence_lengths):
    sequence_lengths = sequence_lengths.cpu().numpy()

    sorted_sequence_lengths = np.sort(sequence_lengths)[::-1]
    sorted_sequence_lengths = torch.from_numpy(
        sorted_sequence_lengths.copy()
    )

    idx_sort = np.argsort(-sequence_lengths)
    idx_unsort = np.argsort(idx_sort)

    idx_sort = torch.from_numpy(idx_sort)
    idx_unsort = torch.from_numpy(idx_unsort)

    if embedded_sequence_batch.is_cuda:
        idx_sort = idx_sort.cuda()
        idx_unsort = idx_unsort.cuda()

    embedded_sequence_batch = embedded_sequence_batch.index_select(
        0, idx_sort
    )

    packed_rnn_input = nn.utils.rnn.pack_padded_sequence(
        embedded_sequence_batch, 
        sorted_sequence_lengths,
        batch_first=True
    )

    return packed_rnn_input, idx_unsort

  
def unpack_rnn_output(packed_rnn_output, indices):
    encoded_sequence_batch, _ = nn.utils.rnn.pad_packed_sequence(
        packed_rnn_output, batch_first=True
    )
    encoded_sequence_batch = encoded_sequence_batch.index_select(0, indices)

    return encoded_sequence_batch

In [26]:
# Use the nn.module !

class BiLSTM(nn.Module):

    def __init__(
        self,
        embeddings,
        hidden_size,
        num_labels,
        input_dropout=0,
        output_dropout=0,
        bidirectional=True,
        num_layers=2,
        pooling='mean'
    ):

        super(BiLSTM, self).__init__()
        self.embeddings = embeddings
        self.pooling = pooling
        self.input_dropout = nn.Dropout(input_dropout)
        self.output_dropout = nn.Dropout(output_dropout)
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.num_labels = num_labels
        self.hidden_size = hidden_size
        self.input_size = self.embeddings.embedding_dim
        self.lstm = nn.LSTM(
            self.input_size,
            hidden_size,
            bidirectional=bidirectional,
            num_layers=num_layers,
            batch_first=True
        )
        self.total_hidden_size = self.hidden_size 
        if self.bidirectional:
            self.total_hidden_size += self.hidden_size
        self.output_layer = nn.Linear(
            self.total_hidden_size,
            self.num_labels)
        self.loss_function = nn.CrossEntropyLoss()

    def forward(self, src_batch, tgt_batch=None):
        src_sequences = src_batch.sequences
        src_lengths = src_batch.lengths
        embedded_sequence_batch = self.embeddings(src_sequences)
        embedded_sequence_batch = self.input_dropout(
            embedded_sequence_batch
        )

        packed_rnn_input, indices = pack_rnn_input(
            embedded_sequence_batch, src_lengths
        )

        rnn_packed_output, _ = self.lstm(packed_rnn_input)
        encoded_sequence_batch = unpack_rnn_output(
            rnn_packed_output, indices
        )

        if self.pooling == "mean":
            pooled_batch = mean_pooling(encoded_sequence_batch,
                                        src_lengths)

        elif self.pooling == "max":
            pooled_batch = max_pooling(encoded_sequence_batch)
        else:
            raise NotImplementedError

        logits = self.output_layer(pooled_batch)
        _, predictions = logits.max(1)

        if tgt_batch is not None:
            targets = tgt_batch.sequences
            loss = self.loss_function(logits, targets)
        else:
            loss = None

        return loss, predictions, logits

In [33]:
# Now we will set our hyperparameters!

epochs = 5
hidden_size = 300
log_interval = 10
num_labels = 2
input_dropout = 0.5
output_dropout = 0.5
bidirectional = True
num_layers = 2
pooling = 'mean'
lr = 0.001
gradient_clipping = 0.25

In [34]:
model = BiLSTM(
    embeddings=embeddings,
    hidden_size=hidden_size,
    num_labels=num_labels,
    input_dropout=input_dropout,
    output_dropout=output_dropout,
    bidirectional=bidirectional,
    num_layers=num_layers,
    pooling=pooling
)

model.to(device)

BiLSTM(
  (embeddings): Embedding(65055, 300, padding_idx=65054)
  (input_dropout): Dropout(p=0.5, inplace=False)
  (output_dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 300, num_layers=2, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=600, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
)

In [35]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [36]:
for epoch in range(epochs):
    epoch_correct = 0
    epoch_total = 0
    epoch_loss = 0
    i = 0
    model.train()

    for batch in train_batches:
        batch.to_torch_(device)
        ids_batch = batch.indices
        src_batch = batch.src
        tgt_batch = batch.tgt
        loss, predictions, logits = model.forward(
            src_batch,
            tgt_batch=tgt_batch
        )

        loss.backward()

        torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            gradient_clipping)

        optimizer.step()
        correct = (predictions == tgt_batch.sequences).long().sum()
        total = tgt_batch.sequences.size(0)
        epoch_correct += correct.item()
        epoch_total += total
        epoch_loss += loss.item()
        i += 1

    accuracy  = 100 * epoch_correct / epoch_total

    print('Epoch {}'.format(epoch))
    print('Train Loss: {}'.format(epoch_loss / len(train_batches)))
    print('Train Accuracy: {}'.format(accuracy))

    test_epoch_correct = 0
    test_epoch_total = 0
    test_epoch_loss = 0

    model.eval()

    for batch in test_batches:
        ids_batch = batch.indices
        src_batch = batch.src
        tgt_batch = batch.tgt
        
        batch.to_torch_(device)
        loss, predictions, logits = model.forward(
            src_batch,
            tgt_batch=tgt_batch)

        correct = (predictions == tgt_batch.sequences).long().sum()
        total = tgt_batch.sequences.size(0)
        test_epoch_correct += correct.item()
        test_epoch_total += total
        test_epoch_loss += loss.item()

    test_accuracy = 100 * test_epoch_correct / test_epoch_total

    print('\n----------------------')
    print('Test Loss: {}'.format(test_epoch_loss / len(test_batches)))
    print('Test Accuracy: {}'.format(test_accuracy))
    print('----------------------\n')

Epoch 0
Train Loss: 0.4404130145907402
Train Accuracy: 78.636

----------------------
Test Loss: 0.35737835097312926
Test Accuracy: 84.908
----------------------

Epoch 1
Train Loss: 0.302647270321846
Train Accuracy: 87.512

----------------------
Test Loss: 0.32877466711401937
Test Accuracy: 86.344
----------------------

Epoch 2
Train Loss: 0.2385834891498089
Train Accuracy: 90.132

----------------------
Test Loss: 0.3091318444907665
Test Accuracy: 87.568
----------------------

Epoch 3
Train Loss: 0.17792687624692916
Train Accuracy: 93.084

----------------------
Test Loss: 0.3724753656387329
Test Accuracy: 87.032
----------------------

Epoch 4
Train Loss: 0.14103973174095155
Train Accuracy: 94.596

----------------------
Test Loss: 0.3948624557703733
Test Accuracy: 86.92
----------------------

