In [2]:
import argparse
import logging
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torchtext
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe, FastText

### Constants

In [3]:
DATASET = "AG_NEWS"
DATA_DIR = ".data"
DEVICE = "cpu"
EMBED_DIM = 300
LR = 4.0
BATCH_SIZE = 16
NUM_EPOCHS = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE
PAD = '<pad>'
UNK = '<unk>'


### Get the tokenizer
- Different models tolenize in different ways. 
    - Word2Vec / GloVe does words (WordLevel).


In [4]:
# Get the basic english tokenizer using the get_tokenizer function.
basic_english_tokenizer = get_tokenizer("basic_english")

In [5]:
# Do not remove this.
assert(len(basic_english_tokenizer("This is some text ...")) == 7)

In [6]:
# Needed later.
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary.

In [7]:
# This function should loop over the (label, text) data pair and tokenize the text.
# It should yield a list of the tokens for each text.
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield TOKENIZER(text)

In [16]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
# Use build_vocab_from_iterator to build the vocabulary.
# This function should take yield_tokens.
# The special characters are PAD and UNK.
# Build the vocabulary with the special tokens PAD and UNK
vocab = build_vocab_from_iterator(yield_tokens(train_iter),specials = [PAD, UNK])

#Print the vocabulary -> too big so i only printed the length 
print(len(vocab.get_stoi()))
# Make the default index the same as that of the unk_token.
default_index = stoi[UNK]
vocab.set_default_index(default_index)

# word -> int mapping.
stoi = vocab.get_stoi()
# int -> word mapping.
itos = vocab.get_itos() 

95812


### Get GloVe vectors

Information about pretrained vectors: 
- https://pytorch.org/text/stable/_modules/torchtext/vocab/vectors.html#GloVe
- https://github.com/pytorch/text/blob/e3799a6eecef451f6e66c9c20b6432c5f078697f/torchtext/vocab/vectors.py#L263

In [9]:
# Set GLOVE to the name='840B' GloVe vectors of dimension 300. 
GLOVE = GloVe(name ='840B')

If the embeddings are not in the token space, a zero vector will be returned.

In [10]:
# Get the vectors for all the tokens in s = "Hello, How are you?"
# Look up "get_vecs_by_tokens" for GloVe vectors.
# Add an assertion checking that the dimensions of wat you get is dimension (???, 300).
s = "Hello, How are you?"
s_tokens = TOKENIZER(s)
ret = GLOVE.get_vecs_by_tokens(s_tokens, lower_case_backup=True)
assert(ret.shape==(len(s_tokens),300))

In [11]:
# Let s = ""<pad> <unk> the man Man ahsdhashdahsdhash".
# What are the vectors of each token. Print this below.
s = "<pad> <unk> the man Man ahsdhashdahsdhash"
s_tokens = TOKENIZER(s)
ret = GLOVE.get_vecs_by_tokens(s_tokens, lower_case_backup=True)

### Helper functions

These functions tokenize the string input and then map each token to the integer representation in the vocabulary.

In [40]:
# Return for a sentence the int tokens for that sentence.
# I.e., you pass in "a b c d" and get out [1, 2, 3, 4].
def text_pipeline(text):
    return vocab(TOKENIZER(text))

# Return the label starting at 0. I.e. map each label to fo from 0, not 2 or whatever it starts from.
def label_pipeline(label):
    mapping = {1:0,2:1,3:2,4:3}
    return mapping[label]
            

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [159]:
# As we loop through batches, this function gets applied to each raw batch.
def collate_batch(batch):
    label_list, text_list = [], []
    for (label,text) in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        # Append the label to the label_list.
        label_list.append(label_pipeline(label))
                
        # Return a list of ints.
        # Get a torch tensor of the sentence, this sould be a tensor of torch.int64.
        processed_text = torch.tensor(text_pipeline(text),dtype=torch.int64)
        text_list.append(processed_text.clone().detach())
    
    # Transform the label_list into a tensor. 
    label_list = torch.tensor(label_list)
    
    # Pad the list of text_list tensors so they all have the same length.
    # Use batch_first = True.
    # Use padding_valid = PADDING_VALUE
    text_list = pad_sequence(text_list, batch_first=True, padding_value=PADDING_VALUE)
            
    return label_list.to(DEVICE), text_list.to(DEVICE)

### Get the data

In [160]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
# Get the number of classes.
num_class = 4
# What are the classes? -> [1,2,3,4] corresponding to [World,Sports,Business,Sci/Tech]
print(f"The number of classes is {num_class} ")

The number of classes is 4 


### Set up the model

In [209]:
# A more complicated model. We'll explore this after we learn word embeddings.
class TextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_class,
        initialize_with_glove = True,
        fine_tune_embeddings = True
    ):
        super(TextClassificationModel, self).__init__()
        # Set to an embedding of (vocab_size, embed_dim) size.
        # Use padding_idx = PADDING_IDX.
        # This is so we don't get gradients for padding tokens and use 0 as the vector for these.
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=PADDING_IDX
        )
        
        if initialize_with_glove:
            # Turn off the gradient for the embedding weight as we are going to modify it. 
            self.embedding.weight.requires_grad_(False)
            for i in range(vocab_size):
                # Get the token index in VOCAB.
                token = itos[i]
                
                # Modify the embedding matrix to be the GloVe vector for this token.
                self.embedding.weight[i, :] = GLOVE.get_vecs_by_tokens(token, lower_case_backup=True)
            # Turn on the gradient after we modify it.
            # You could do this in another way by wrapping this in @torch.no_grad decorator.
            self.embedding.weight.requires_grad_(True)
        
        # No fine tuning means once you intialize, these are constant.
        if not fine_tune_embeddings:
            # Turn off the gradient for the embedding weight matric if you don't fine tune them.
            self.embedding.weight.requires_grad_(False)
        
        
        # Set fc to be a linear layer of dimension (embed_dim, num_class).
        self.fc = nn.Linear(embed_dim,num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        #self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        # Get the embeddings for all tokens in the batch of text.
        embedded = self.embedding(text)
        # Across dimension 1, get the mean vector. This gets the mean vector per sentence in the batch.
        # Make sure you squeeze any dimension that's 1. This should be (N, d), where N is the batch dimension and d is the word vector dimension.
        embedded_sum = torch.sum(embedded, dim=1)
        # Get the number of non-padding tokens in each sentence
        lengths = torch.sum(text != PADDING_IDX, dim=1).unsqueeze(1)
        
        # Divide the sum of embeddings by the number of non-padding tokens to get the mean vector
        embedded_mean = embedded_sum / lengths
        # Run through a linear layer self.fc and also apply ReLU.
        
        logits = self.fc(F.relu(embedded_sum))
        return logits

### Set up the model

In [220]:
# Set to be the CrossEntropyLoss.
criterion = nn.CrossEntropyLoss()
# Set model to the TextClassification model.
# Turn on intialize_with_glove ad fine_tune_embeddings.
model = TextClassificationModel(vocab_size=len(vocab),
                                embed_dim=EMBED_DIM,
                                num_class=num_class,
                                initialize_with_glove=True,
                                fine_tune_embeddings=True)

# Set the optimizer for SGD with learning rate LR. The parameters are model.parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Schedule the learning rate decay to go down each epoch by 1/10.
scheduler =  torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.1)

### Set up the data

In [221]:
train_iter, test_iter = DATASETS[DATASET]()
# This puts things in a nice format.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Set num_train to 95% length of train_dataset.
# This should be an integer.
num_train = int(len(train_dataset)*0.95)
num_val = len(train_dataset)-num_train
# The array below should have 2 ints in it, num_train, and the 5% left over for validation.
split_train_, split_valid_ = random_split(train_dataset, [num_train,num_val])

# Set to a DataLoader on the training data with batch_size BATCH_SIZE and specify collate_batch.
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)


### Train the model

In [222]:
def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 200

    for idx, (label, text) in enumerate(dataloader):
        # Zero the gradients.
        optimizer.zero_grad()
        
        logits = model(text)
                
        # Get the loss.
        # need to check what label looks like 
        loss = criterion(logits,label)
        
        # Do back propagation.
        loss.backward()
        
        # Clip the gradients at 0.1
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        
        # Do an optimization step.
        optimizer.step()
        
        # Get the accuracy for this batch.
        predicted_labels = torch.argmax(logits, dim=1)
        total_acc += (predicted_labels == label).sum().item()
        # Get the number of rows in this batch. Use labels.
        total_count += len(label)
        
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(epoch, idx, len(dataloader), total_acc / total_count)
            )
            total_acc, total_count = 0, 0

In [223]:
def evaluate(dataloader, model):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            logits = model(text)
            predicted_labels = torch.argmax(logits, dim=1)
            total_acc += (predicted_labels == label).sum().item()
            total_count += len(label)
    return total_acc / total_count

### Results

How do you do if you do not initialize with GloVe and use fine tuning so that you do optimize the embeddings?
- the learning is long and the model perform quite badly compared to the others -> test acc =  0.688

How do you do if you initialize with GloVe but turn off fine tuning so the embedding layer is static, it does not get optimized but you put in some good embeddings to start.
- the learning is super fast but the model do a little bit worse than the one with fineturning -> test acc = 0.865

How do you do if you initialize with GloVe and you turn on fine tuning?
- the learning is long but the results are pretty good -> test acc = 0.890

In [227]:
model = TextClassificationModel(vocab_size=len(vocab),
                                embed_dim=EMBED_DIM,
                                num_class=num_class,
                                initialize_with_glove=True,
                                fine_tune_embeddings=True)

# Set the optimizer for SGD with learning rate LR. The parameters are model.parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Schedule the learning rate decay to go down each epoch by 1/10.
scheduler =  torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

| epoch   1 |   200/ 7125 batches | accuracy    0.565
| epoch   1 |   400/ 7125 batches | accuracy    0.705
| epoch   1 |   600/ 7125 batches | accuracy    0.759
| epoch   1 |   800/ 7125 batches | accuracy    0.769
| epoch   1 |  1000/ 7125 batches | accuracy    0.776
| epoch   1 |  1200/ 7125 batches | accuracy    0.799
| epoch   1 |  1400/ 7125 batches | accuracy    0.798
| epoch   1 |  1600/ 7125 batches | accuracy    0.802
| epoch   1 |  1800/ 7125 batches | accuracy    0.802
| epoch   1 |  2000/ 7125 batches | accuracy    0.801
| epoch   1 |  2200/ 7125 batches | accuracy    0.833
| epoch   1 |  2400/ 7125 batches | accuracy    0.812
| epoch   1 |  2600/ 7125 batches | accuracy    0.827
| epoch   1 |  2800/ 7125 batches | accuracy    0.838
| epoch   1 |  3000/ 7125 batches | accuracy    0.821
| epoch   1 |  3200/ 7125 batches | accuracy    0.824
| epoch   1 |  3400/ 7125 batches | accuracy    0.824
| epoch   1 |  3600/ 7125 batches | accuracy    0.833
| epoch   1 |  3800/ 7125 ba

In [226]:
model = TextClassificationModel(vocab_size=len(vocab),
                                embed_dim=EMBED_DIM,
                                num_class=num_class,
                                initialize_with_glove=True,
                                fine_tune_embeddings=False)

# Set the optimizer for SGD with learning rate LR. The parameters are model.parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Schedule the learning rate decay to go down each epoch by 1/10.
scheduler =  torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

| epoch   1 |   200/ 7125 batches | accuracy    0.547
| epoch   1 |   400/ 7125 batches | accuracy    0.696
| epoch   1 |   600/ 7125 batches | accuracy    0.733
| epoch   1 |   800/ 7125 batches | accuracy    0.756
| epoch   1 |  1000/ 7125 batches | accuracy    0.762
| epoch   1 |  1200/ 7125 batches | accuracy    0.757
| epoch   1 |  1400/ 7125 batches | accuracy    0.773
| epoch   1 |  1600/ 7125 batches | accuracy    0.773
| epoch   1 |  1800/ 7125 batches | accuracy    0.781
| epoch   1 |  2000/ 7125 batches | accuracy    0.774
| epoch   1 |  2200/ 7125 batches | accuracy    0.775
| epoch   1 |  2400/ 7125 batches | accuracy    0.778
| epoch   1 |  2600/ 7125 batches | accuracy    0.773
| epoch   1 |  2800/ 7125 batches | accuracy    0.800
| epoch   1 |  3000/ 7125 batches | accuracy    0.793
| epoch   1 |  3200/ 7125 batches | accuracy    0.794
| epoch   1 |  3400/ 7125 batches | accuracy    0.796
| epoch   1 |  3600/ 7125 batches | accuracy    0.773
| epoch   1 |  3800/ 7125 ba

In [228]:
model = TextClassificationModel(vocab_size=len(vocab),
                                embed_dim=EMBED_DIM,
                                num_class=num_class,
                                initialize_with_glove=False,
                                fine_tune_embeddings=True)

# Set the optimizer for SGD with learning rate LR. The parameters are model.parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Schedule the learning rate decay to go down each epoch by 1/10.
scheduler =  torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

| epoch   1 |   200/ 7125 batches | accuracy    0.377
| epoch   1 |   400/ 7125 batches | accuracy    0.478
| epoch   1 |   600/ 7125 batches | accuracy    0.488
| epoch   1 |   800/ 7125 batches | accuracy    0.492
| epoch   1 |  1000/ 7125 batches | accuracy    0.509
| epoch   1 |  1200/ 7125 batches | accuracy    0.500
| epoch   1 |  1400/ 7125 batches | accuracy    0.502
| epoch   1 |  1600/ 7125 batches | accuracy    0.484
| epoch   1 |  1800/ 7125 batches | accuracy    0.507
| epoch   1 |  2000/ 7125 batches | accuracy    0.501
| epoch   1 |  2200/ 7125 batches | accuracy    0.529
| epoch   1 |  2400/ 7125 batches | accuracy    0.507
| epoch   1 |  2600/ 7125 batches | accuracy    0.522
| epoch   1 |  2800/ 7125 batches | accuracy    0.533
| epoch   1 |  3000/ 7125 batches | accuracy    0.502
| epoch   1 |  3200/ 7125 batches | accuracy    0.523
| epoch   1 |  3400/ 7125 batches | accuracy    0.532
| epoch   1 |  3600/ 7125 batches | accuracy    0.533
| epoch   1 |  3800/ 7125 ba