In [None]:
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import random
import pandas as pd
import numpy as np

In [None]:
"""
In this task I opt to create my own dataset from the ag_news csv files rather
than using the dataset built into torchtext. The intention is to have more
flexibility to change the data format as required for our model.

Download the ag_news dataset tar file from the link below:
https://drive.google.com/drive/u/0/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M

Run the code snippet below and upload the downloaded tar file when prompted.
"""
# Colab-only step: files.upload() asks for local files and saves them into the
# current working directory (the archive is expected as ag_news_csv.tar.gz,
# which is the name the extraction cell reads).
from google.colab import files
uploaded = files.upload()

Saving ag_news_csv.tar.gz to ag_news_csv.tar.gz


In [None]:
# Extract the uploaded archive to obtain the csv files (ag_news_csv/train.csv
# and ag_news_csv/test.csv are read by the next cell).
# tar flags: x = extract, v = list each extracted file, z = gunzip, f = archive name.
!tar xvzf  ag_news_csv.tar.gz

ag_news_csv/
ag_news_csv/train.csv
ag_news_csv/test.csv
ag_news_csv/classes.txt
ag_news_csv/readme.txt


In [None]:
def _relabel_ag_news_csv(src_csv, dst_csv):
    """Convert a raw ag_news csv into a two-column (label, text) csv.

    The raw files have no header row and three columns: label, title, text.
    By inspection the class labels are in the range {1,2,3,4}; to stay
    consistent with the inbuilt dataset they are shifted to {0,1,2,3}.
    The title column is dropped and the result is written with a header row
    and without the pandas index.

    Args:
        src_csv: path of the raw csv file to read.
        dst_csv: path of the csv file to write.
    """
    frame = pd.read_csv(src_csv, header=None, index_col=None)
    frame.columns = ['label', 'title', 'text']
    frame['label'] = frame['label'] - 1
    frame[['label', 'text']].to_csv(dst_csv, index=False)


# The same conversion is applied to the train and the test file. The frames
# only live inside the helper, so nothing has to be deleted afterwards.
_relabel_ag_news_csv('ag_news_csv/train.csv', 'ag_news_csv/train_labeled.csv')
_relabel_ag_news_csv('ag_news_csv/test.csv', 'ag_news_csv/test_labeled.csv')

In [None]:
def get_dataset_from_csv(path_to_csv, fields):
    """Create a torchtext tabular dataset from a csv file.

    Args:
        path_to_csv: path of the csv file to load; its first row is treated
            as a header and skipped.
        fields: field specification passed on unchanged to ``TabularDataset``.

    Returns:
        The ``data.TabularDataset`` built from the file.
    """
    return data.TabularDataset(
        path=path_to_csv,
        format='csv',
        skip_header=True,
        fields=fields,
    )

In [None]:
""" Using the Field and LabelField options of torchtext to define the
preprocessing to be performed on the data before converting to tensor.
sequential: set to True since our data is sequential
tokenize: spaCy is used because it performs better than the default str.split
include_lengths: True because we require pack_padded_sequence """
# Text field: tokenised with spaCy; include_lengths=True makes every batch
# carry a (tokens, lengths) pair, which the model needs for
# pack_padded_sequence.
TEXT = data.Field(sequential=True,
                  tokenize='spacy',
                  include_lengths=True)

# The labels are class indices that are fed to F.cross_entropy, which expects
# an integer (long) target. Declaring them as long here keeps the dtype
# correct at the source instead of relying on a .long() cast at every use
# (existing .long() calls simply become no-ops).
LABEL = data.LabelField(dtype=torch.long)

In [None]:
# Fields in csv column order: the label column first, then the text column.
fields = [('label', LABEL), ('text', TEXT)]

# Load the train and test datasets from their respective relabelled csv files.
train_dataset, test_dataset = [
    get_dataset_from_csv(csv_path, fields)
    for csv_path in ("ag_news_csv/train_labeled.csv",
                     "ag_news_csv/test_labeled.csv")
]

In [None]:
# Since there is no separate validation dataset we split the train dataset.
# The split ratio is 0.95 (same as in task 1).
#
# random.seed() returns None, so it cannot be passed as random_state itself.
# Seed the generator first and hand its state over explicitly: random_state
# is documented to take the return value of random.getstate().
random.seed(100)
train_data, valid_data = train_dataset.split(split_ratio=[0.95, 0.05],
    random_state=random.getstate())

In [None]:
# --- Reproducibility ---------------------------------------------------
torch.manual_seed(100)
torch.backends.cudnn.deterministic = True

# --- Runtime -------------------------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Data ------------------------------------------------------------------
VOCABULARY_SIZE = 5000   # passed as max_size when the text vocabulary is built
BATCH_SIZE = 128         # examples per batch for all three iterators

# --- Model -----------------------------------------------------------------
EMBEDDING_DIM = 128      # width of the embedding layer
HIDDEN_DIM = 256         # LSTM hidden size (per direction)
OUTPUT_DIM = 4           # one output per class; labels are in {0,1,2,3}

In [None]:
"""
I use pre-trained GloVe embeddings since semantically close words are close in
this embedding space. I have taken the embeddings of dimension 100 obtained by
training on 6 billion tokens.
"""
# Build the text vocabulary from the training split only, capped at
# VOCABULARY_SIZE tokens, and attach the 'glove.6B.100d' vectors to it.
# unk_init is torch.Tensor.normal_, i.e. the initialiser handed to torchtext
# for words that have no pre-trained vector.
# NOTE(review): these vectors are 100-dimensional and only have an effect once
# they are copied into the model's embedding layer, whose width must then be
# 100 as well - confirm against the model set-up.
TEXT.build_vocab(
    train_data,
    max_size=VOCABULARY_SIZE,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)

# The label vocabulary is built from the training split as well.
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                           
100%|█████████▉| 399379/400000 [00:16<00:00, 24737.75it/s]

In [None]:
"""
I obtain the iterators for the train, validation and test datasets.
BucketIterator returns batches containing samples of similar length.
Since I use pack_padded_sequence, it is necessary to sort the samples within a
batch according to length. Hence, sort_within_batch is set to True and sort_key
defines the sorting criterion as the length of the text.
"""
# One iterator per split, in the same order as the datasets tuple. Examples
# are sorted inside every batch using the number of tokens in the text.
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_dataset),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda example: len(example.text),
    device=DEVICE,
)

In [None]:
import torch.nn as nn

"""
In the model I have tried to implement a variation of the RNN model.
I have used LSTM as it considers wider context. I have used Bidirectional
LSTM because it considers future as well as past context.
I have used 2 layers of LSTM. I have also used dropout to minimise overfitting.
By trial and error I have chosen a dropout probability of 0.5
"""
class Classifier(nn.Module):
    """Bidirectional multi-layer LSTM text classifier.

    Pipeline: embedding -> dropout -> packed bidirectional LSTM -> dropout ->
    fc1 -> fc2. The output holds one unnormalised score (logit) per class.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output classes.
        pad_idx: vocabulary index of the padding token.
        n_layers: number of stacked LSTM layers (default 2).
        dropout: dropout probability used between LSTM layers and by the
            dropout layer (default 0.5).
        fc_dim: width of the intermediate linear layer (default 32).
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, pad_idx,
                 n_layers=2, dropout=0.5, fc_dim=32):

        super().__init__()

        # padding_idx keeps the embedding of the padding token at zero and
        # excludes it from gradient updates. (Skipping the padded positions
        # inside the LSTM is done by the packing in forward(), not here.)
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)

        # Bidirectional LSTM. nn.LSTM only applies dropout between stacked
        # layers, so it is disabled for a single layer to avoid a warning.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            dropout=dropout if n_layers > 1 else 0.0,
                            bidirectional=True)

        # The final hidden state has a forward and a backward component,
        # hence the factor 2 on the input size of the first linear layer.
        self.fc1 = nn.Linear(hidden_dim * 2, fc_dim)

        # Final linear layer producing one score per class.
        self.fc2 = nn.Linear(fc_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_length):
        """Return the class scores for a batch of padded token sequences.

        Args:
            text: LongTensor of token indices, shape [seq_len, batch].
            text_length: true length of every sequence in the batch, sorted in
                decreasing order (tensor on any device, or a list).

        Returns:
            Tensor of shape [batch, output_dim] holding the class logits.
        """
        # Dropout is applied on the embedding of the text.
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence requires the lengths on the CPU; iterators that
        # place whole batches on the GPU deliver them on the GPU.
        lengths = torch.as_tensor(text_length, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        _, (hidden, _) = self.lstm(packed_embedded)

        # hidden[-2] and hidden[-1] are the final forward and backward states
        # of the last LSTM layer; they are concatenated into one vector per
        # example before dropout is applied.
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))

        # NOTE(review): there is no non-linearity between fc1 and fc2, so the
        # two layers together are still a single linear map - confirm that
        # this is intended.
        hidden = self.fc1(hidden)
        return self.fc2(hidden)

In [None]:
# Input dimension is the length of the vocabulary (number of embedding rows).
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Pre-trained vectors attached to the vocabulary by TEXT.build_vocab(...,
# vectors=...); None when the vocabulary was built without vectors.
pretrained_vectors = TEXT.vocab.vectors

# The embedding layer can only be initialised from the pre-trained vectors if
# it has the same width as they do, so the width is taken from the vectors
# and EMBEDDING_DIM is only used when there are none.
embedding_dim = (EMBEDDING_DIM if pretrained_vectors is None
                 else pretrained_vectors.size(1))

# Instantiating the model Classifier
model = Classifier(INPUT_DIM, embedding_dim, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)

# Building the vocabulary with vectors does not touch the model: the vectors
# have to be copied into the embedding layer, otherwise the layer keeps its
# random initialisation and the download is never used.
if pretrained_vectors is not None:
    with torch.no_grad():
        model.embedding.weight.copy_(pretrained_vectors)
        # Keep the padding embedding at zero, as nn.Embedding(padding_idx=...)
        # initialises it.
        model.embedding.weight[PAD_IDX].zero_()

# Moving the model to device
model = model.to(DEVICE)
# Instantiating the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def evaluate_result(model, data_loader, device):
    """Compute the classification accuracy of ``model`` on ``data_loader``.

    The model is switched to evaluation mode and stays in it; gradients are
    disabled while the batches are processed.

    Args:
        model: module called as ``model(text, text_lengths)`` that returns one
            row of class scores per example, shape [batch, num_classes].
        data_loader: iterable of batches exposing ``text`` (a pair of token
            tensor and lengths) and ``label``.
        device: not used by this function; kept so that existing calls keep
            working.

    Returns:
        0-dim float tensor with the accuracy in percent
        (correct_predictions / total_predictions * 100), or NaN when
        ``data_loader`` yields no examples.
    """
    # Putting the model in the evaluation mode (disables dropout)
    model.eval()
    valid_predictions = 0
    total_data_points = 0
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.text

            # Unnormalised class scores for the datapoints of the current batch
            logits = model(text, text_lengths)

            # The predicted label is the index of the greatest score
            response_labels = logits.argmax(dim=1)

            # Maintaining the count of total data points
            total_data_points += batch_data.label.size(0)

            # Maintaining the count of total correct predictions
            valid_predictions += (response_labels == batch_data.label.long()).sum()

    # Without any example the accuracy is undefined; report NaN instead of
    # failing on the integer start value or dividing by zero.
    if total_data_points == 0:
        return torch.tensor(float('nan'))

    return valid_predictions.float() / total_data_points * 100

In [None]:
epochs = 10
for epoch in range(epochs):
    # Train mode enables dropout for the optimisation pass over the data
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        text, text_lengths = batch_data.text

        # Start every step from zeroed gradients
        optimizer.zero_grad()

        # Forward pass; cross entropy between the class scores and the labels
        output = model(text, text_lengths).squeeze(1)
        cost = F.cross_entropy(output, batch_data.label.long())

        # Backward pass followed by the parameter update
        cost.backward()
        optimizer.step()

    # Accuracy after the epoch: on the training split first, then on the
    # validation split; no gradients are needed for this.
    with torch.no_grad():
        train_accuracy = evaluate_result(model, train_loader, DEVICE)
        valid_accuracy = evaluate_result(model, valid_loader, DEVICE)
    print(f'Epoch: '
          f'{epoch}'
          f'\ntraining accuracy: '
          f'{train_accuracy:.3f}%'
          f'\nvalidation accuracy: '
          f'{valid_accuracy:.3f}%')

# Final accuracy on the held-out test set
print(f'Test accuracy: {evaluate_result(model, test_loader, DEVICE):.2f}%')

Epoch: 0
training accuracy: 94.838%
validation accuracy: 91.000%
Epoch: 1
training accuracy: 95.365%
validation accuracy: 91.183%
Epoch: 2
training accuracy: 95.478%
validation accuracy: 91.317%
Epoch: 3
training accuracy: 95.636%
validation accuracy: 90.783%
Epoch: 4
training accuracy: 95.773%
validation accuracy: 91.183%
Epoch: 5
training accuracy: 96.156%
validation accuracy: 91.183%
Epoch: 6
training accuracy: 96.410%
validation accuracy: 91.200%
Epoch: 7
training accuracy: 96.479%
validation accuracy: 90.983%
Epoch: 8
training accuracy: 96.417%
validation accuracy: 91.150%
Epoch: 9
training accuracy: 96.709%
validation accuracy: 91.133%
Test accuracy: 91.07%
