<a href="https://colab.research.google.com/github/Pauls-Baby/imdb-dataset/blob/master/imdb_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import torch
from torchtext import data, datasets
import random
from sklearn.model_selection import KFold
import numpy as np
from pathlib import Path
import json
import sys
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math

In [30]:
class load_data(object):
    """Load the IMDB dataset once and serve K-fold train/validation splits.

    The official IMDB train/test splits are loaded in ``__init__``; the train
    split is later re-partitioned into folds by ``get_fold_data``.
    """

    def __init__(self, SEED=1234):
        """Seed the random number generators and load the IMDB splits.

        Args:
            SEED: Seed applied to Python's ``random``, to torch (CPU and CUDA)
                and reused as ``random_state`` for the K-fold shuffle.
        """
        # Legacy torchtext iterators draw their shuffle order from Python's
        # `random` module state, so it must be seeded too for repeatable runs.
        random.seed(SEED)
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = True

        TEXT = data.Field(tokenize='spacy')
        LABEL = data.LabelField(dtype=torch.float)

        self.train_data, self.test_data = datasets.IMDB.splits(TEXT, LABEL)
        self.SEED = SEED

    def get_fold_data(self, num_folds=3):
        """Yield one ``(TEXT, LABEL, train_dataset, val_dataset)`` tuple per fold.

        Args:
            num_folds: Number of folds the training examples are split into.

        Yields:
            The text field, the label field, and two ``data.Dataset`` objects
            holding the training and validation examples of the current fold.
            The same ``TEXT``/``LABEL`` field objects are yielded for every
            fold, so vocabularies built on them are shared unless rebuilt.
        """
        TEXT = data.Field(tokenize='spacy')
        LABEL = data.LabelField(dtype=torch.float)
        fields = [('text', TEXT), ('label', LABEL)]

        kf = KFold(n_splits=num_folds, shuffle=True, random_state=self.SEED)
        # An array (rather than a list) allows fancy indexing with the fold indices.
        train_data_arr = np.array(self.train_data.examples)

        for train_index, val_index in kf.split(train_data_arr):
            yield (
                TEXT,
                LABEL,
                data.Dataset(train_data_arr[train_index], fields=fields),
                data.Dataset(train_data_arr[val_index], fields=fields),
            )
        # Only reached once the caller has consumed every fold.
        print(f"Data for {num_folds}-Fold validation generated")

    def get_test_data(self):
        """Return the held-out IMDB test split loaded in ``__init__``."""
        return self.test_data

In [31]:
class Self_Attention(nn.Module):
    """Scaled dot-product attention of one query vector over a sequence.

    The query dimension is assumed to equal the key/value dimension.
    """

    def __init__(self, query_dim):
        super().__init__()
        # Standard 1/sqrt(d) factor applied to the raw dot products.
        self.scale = 1. / math.sqrt(query_dim)

    def forward(self, query, key, value):
        """Attend over ``value`` using the similarity between ``query`` and ``key``.

        Args:
            query: (batch_size, hidden_dim * 2) tensor.
            key: (sentence_length, batch_size, hidden_dim * 2) tensor.
            value: (sentence_length, batch_size, hidden_dim * 2) tensor.

        Returns:
            A pair ``(attention_output, attention_weight)`` with shapes
            (batch_size, hidden_dim * 2) and (batch_size, sentence_length).
        """
        query_row = query.unsqueeze(1)      # (batch_size, 1, hidden_dim * 2)
        key_columns = key.permute(1, 2, 0)  # (batch_size, hidden_dim * 2, sentence_length)

        # Batched matrix product gives one score per sentence position.
        scores = torch.bmm(query_row, key_columns)  # (batch_size, 1, sentence_length)
        # Softmax over the sentence_length axis turns scores into weights.
        attention_weight = F.softmax(scores * self.scale, dim=2)

        value_rows = value.transpose(0, 1)  # (batch_size, sentence_length, hidden_dim * 2)
        weighted_sum = torch.bmm(attention_weight, value_rows)  # (batch_size, 1, hidden_dim * 2)

        return weighted_sum.squeeze(1), attention_weight.squeeze(1)

In [32]:
class Model(nn.Module):
    """Bidirectional GRU classifier that attends over the GRU outputs.

    The final hidden states of the last GRU layer (both directions) form the
    attention query; the attended summary is projected by a linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, d_rate,
                 embedding_weights=None, embedding_trainable=True):
        """Build the embedding, GRU, projection, dropout and attention layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: GRU hidden size per direction.
            output_dim: Size of the final linear projection.
            num_layers: Number of stacked GRU layers.
            d_rate: Dropout rate used inside the GRU and by the dropout layer.
            embedding_weights: Optional tensor copied into the embedding table.
            embedding_trainable: Pass ``False`` to freeze the embedding table.
        """
        super().__init__()

        embedding = nn.Embedding(vocab_size, embedding_dim)
        if embedding_weights is not None:
            embedding.weight.data.copy_(embedding_weights)
        if embedding_trainable is False:
            embedding.weight.requires_grad = False
        self.embedding = embedding

        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            dropout=d_rate,
        )
        self.dense = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(d_rate)
        self.attention = Self_Attention(2 * hidden_dim)

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: (sentence_length, batch_size) tensor of token indices.

        Returns:
            ``(output, attention_weight)`` where ``output`` is the projection
            with dimension 1 squeezed out, and ``attention_weight`` is what the
            attention layer returned.
        """
        token_vectors = self.embedding(x)
        embedded = self.dropout(token_vectors)
        # embedded: (sentence_length, batch_size, embedding_dim)

        gru_output, hidden = self.gru(embedded)
        # gru_output: (sentence_length, batch_size, hidden_dim * 2)
        # hidden: (num_layers * 2, batch_size, hidden_dim),
        #   ordered [f_layer_0, b_layer_0, ..., f_layer_n, b_layer_n]

        # The last two entries are the top layer's forward and backward states.
        last_forward, last_backward = hidden[-2], hidden[-1]
        summary = torch.cat([last_forward, last_backward], dim=1)
        hidden = self.dropout(summary)
        # hidden: (batch_size, hidden_dim * 2)

        rescaled_hidden, attention_weight = self.attention(
            query=hidden, key=gru_output, value=gru_output
        )
        logits = self.dense(rescaled_hidden)

        return logits.squeeze(1), attention_weight

In [33]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the binary labels.

    Args:
        preds: Tensor of raw logits; a sigmoid is applied and the result is
            rounded to obtain 0/1 predictions.
        y: Tensor of 0/1 labels, comparable element-wise with ``preds``.

    Returns:
        A scalar tensor holding ``correct / total`` over all elements.
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    # Divide by the element count, not len(): len() only sees the first
    # dimension and would over-report accuracy for inputs with more than one.
    acc = correct.sum() / correct.numel()

    return acc

In [34]:
# Location of the JSON file that holds every hyperparameter read through `args`.
args_p = Path("/content/drive/My Drive/imdb/param.json")
if not args_p.exists():
    # FileNotFoundError is still an Exception, and the message names the path.
    raise FileNotFoundError(
        f'Path not found: {args_p}. Please check path to parameters json file!'
    )
print("Hyperparameters are chosen from JSON file")

# NOTE(review): json.load already converts JSON true/false/null to Python
# values; these aliases are kept only in case another cell refers to them.
true = True
false = False
null = None

# JSON is UTF-8 by specification, so do not depend on the platform default.
with args_p.open(mode='r', encoding='utf-8') as f:
    args = json.load(f)

Hyperparameters are chosen from JSON file


In [36]:
def train_run(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Callable returning ``(output, extra)`` for ``batch.text``.
        iterator: Sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: Loss function applied to ``(output, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of batches.
    """
    model.train()
    running_loss, running_acc = 0, 0

    for batch_idx, batch in enumerate(iterator):
        optimizer.zero_grad()
        logits, _ = model(batch.text)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()
        print("Training Batch:", batch_idx)

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

In [37]:
def eval_run(model, iterator, criterion):
    """Evaluate ``model`` over ``iterator`` without computing gradients.

    Args:
        model: Callable returning ``(predictions, extra)`` for ``batch.text``.
        iterator: Sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: Loss function applied to ``(predictions, batch.label)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of batches.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(iterator):
            predictions, _ = model(batch.text)
            running_loss += criterion(predictions, batch.label).item()
            running_acc += binary_accuracy(predictions, batch.label).item()
            print("Evaluating Batch:", batch_idx)

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

In [38]:
import datetime
# Timestamp taken once per run, formatted as YYYY-MM-DD_HH-MM-SS.
run_start_time = '{:%Y-%m-%d_%H-%M-%S}'.format(datetime.datetime.today())

In [39]:
import logging

# One log file per run, named after the run's start timestamp.
logfile = '/content/drive/My Drive/log/log-{}.txt'.format(run_start_time)
# FileHandler does not create missing folders. Only the leaf "log" folder is
# created (no parents=True) so that nothing is written under an unmounted
# Drive mount point; a missing parent still raises FileNotFoundError.
Path(logfile).parent.mkdir(exist_ok=True)

# Send every INFO-and-above record both to the log file and to the cell output.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO,
                    handlers=[
                        logging.FileHandler(logfile),
                        logging.StreamHandler(sys.stdout)
                    ])
logger = logging.getLogger(__name__)

In [40]:
def _select_device(use_gpu, gpu_number):
    """Return the torch device to run on, falling back to CPU without CUDA."""
    if use_gpu is True and gpu_number is not None:
        if torch.cuda.is_available():
            torch.cuda.set_device(gpu_number)
            print("GPU used for execution")
            return torch.device('cuda')
        # A GPU was requested but cannot be used: continue on CPU, not crash.
        logger.warning('GPU requested but CUDA is not available; falling back to CPU.')
    print("CPU used for execution")
    return torch.device('cpu')


def main():
    """Run K-fold cross-validation of the attention GRU model on IMDB.

    Hyperparameters are read from the module-level ``args`` dict. For every
    fold a fresh vocabulary, model and optimizer are built, the model is
    trained for ``args['epochs']`` epochs, and it is evaluated once on the
    fold's validation split. The mean validation loss and accuracy across
    folds are logged at the end.
    """
    data_generator = load_data()
    num_folds = args['num_folds']
    _history = []  # one [val_loss, val_acc] row per fold

    folds = data_generator.get_fold_data(num_folds=num_folds)
    for fold_index, (TEXT, LABEL, train_data, val_data) in enumerate(folds):
        logger.info("***** Running Training *****")
        logger.info(f"Now fold: {fold_index + 1} / {num_folds}")

        # The vocabulary is rebuilt from this fold's training examples only.
        TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.300d")
        logger.info(f'Embedding size: {TEXT.vocab.vectors.size()}.')
        LABEL.build_vocab(train_data)  # For converting str into float labels.

        model = Model(len(TEXT.vocab), args['embedding_dim'], args['hidden_dim'],
                      args['output_dim'], args['num_layers'], args['dropout'],
                      TEXT.vocab.vectors, args["embedding_trainable"])
        print("Model is generated")
        optimizer = optim.Adam(model.parameters())
        criterion = nn.BCEWithLogitsLoss()

        device = _select_device(args['gpu'], args['gpu_number'])
        model = model.to(device)
        criterion = criterion.to(device)

        train_iterator = data.Iterator(train_data, batch_size=args['batch_size'],
                                       sort_key=lambda x: len(x.text), device=device)
        val_iterator = data.Iterator(val_data, batch_size=args['batch_size'],
                                     sort_key=lambda x: len(x.text), device=device)
        print("Training and validation Iterators are created")

        for epoch in range(args['epochs']):
            train_loss, train_acc = train_run(model, train_iterator, optimizer, criterion)
            logger.info(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

        # Validation is measured once per fold, after the last training epoch.
        val_loss, val_acc = eval_run(model, val_iterator, criterion)
        logger.info(f'Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc*100:.2f}% |')

        _history.append([val_loss, val_acc])
        print("Evaluation over")

    _history = np.asarray(_history)
    loss = np.mean(_history[:, 0])
    acc = np.mean(_history[:, 1])

    logger.info('***** Cross Validation Result *****')
    logger.info(f'LOSS: {loss}, ACC: {acc}')

In [41]:
# Start the cross-validation run when this cell/script is executed directly.
if __name__ == '__main__':
    main()

07/26/2020 18:52:46 - INFO - __main__ -   ***** Running Training *****
07/26/2020 18:52:46 - INFO - __main__ -   Now fold: 1 / 3
07/26/2020 18:52:48 - INFO - torchtext.vocab -   Loading vectors from .vector_cache/glove.6B.300d.txt.pt
07/26/2020 18:52:48 - INFO - __main__ -   Embedding size: torch.Size([25002, 300]).
1
Model is generated
GPU used for execution
Training and validation Iterators are created
Training Batch: 0
Training Batch: 1
Training Batch: 2
Training Batch: 3
Training Batch: 4
Training Batch: 5
Training Batch: 6
Training Batch: 7
Training Batch: 8
Training Batch: 9
Training Batch: 10
Training Batch: 11
Training Batch: 12
Training Batch: 13
Training Batch: 14
Training Batch: 15
Training Batch: 16
Training Batch: 17
Training Batch: 18
Training Batch: 19
Training Batch: 20
Training Batch: 21
Training Batch: 22
Training Batch: 23
Training Batch: 24
Training Batch: 25
Training Batch: 26
Training Batch: 27
Training Batch: 28
Training Batch: 29
Training Batch: 30
Training Batc