### Looking at the data 

* Note the use of the %%bash command to issue a shell command in linux

In [1]:
%%bash
head -20 f30kE-captions-bio.txt

A B
group I
of I
5 I
scuba I
divers I
talk O
on O
the B
surface I
next O
to O
a B
barrier I
island I
. O

Man B
with O
long B


### 1. Pre-processing

### Creating a list of lists of tokens (one list per sentence) and the corresponding lists of labels



In [2]:
 # YOUR CODE HERE
texts = []
labels = []

with open('f30kE-captions-bio.txt') as fp:
    text = []
    label = []
    for line in fp:
        tokens = line.strip().split()
        if len(tokens) == 0:
            texts.append(text)
            labels.append(label)
            text = []
            label = []
        else:
            text.append(tokens[0])
            label.append(tokens[1])
print(len(texts))

print(texts[1])
print(labels[1])

5500
['Man', 'with', 'long', 'blond-hair', 'and', 'a', 'plate', 'of', 'food', 'in', 'his', 'hands', '.']
['B', 'O', 'B', 'I', 'O', 'B', 'I', 'I', 'I', 'O', 'B', 'I', 'O']


### Mapping labels to integers and sequence of labels to sequence of integers

In [3]:
# YOUR CODE HERE
import collections 
label2int = collections.defaultdict(lambda: len(label2int))
label2int['<eos>'] = 0
# convert the label sequences to sequences of integers
int_labels = []
for label in labels:
    int_labels.append([label2int[token] for token in label])
    
print("there are "+str(len(label2int))+ " distinct labels in the corpus")
print("there are "+str(len(int_labels))+" sentences in the corpus")

print(str(len(int_labels[0])))
print(labels[0])
print(int_labels[0])


there are 4 distinct labels in the corpus
there are 5500 sentences in the corpus
16
['B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'B', 'I', 'O', 'O', 'B', 'I', 'I', 'O']
[1, 2, 2, 2, 2, 2, 3, 3, 1, 2, 3, 3, 1, 2, 2, 3]


### Convert the tokens to integers

In [4]:
# YOUR CODE HERE
import collections 
token2int = collections.defaultdict(lambda: len(token2int))
token2int['<eos>'] = 0
#convert the texts to sequences of integers
int_texts = []
for text in texts:
    int_texts.append([token2int[token.lower()] for token in text])
    
print("there are "+str(len(token2int))+ " distinct labels in the corpus")
print("there are "+str(len(int_texts))+" sentences in the corpus")

print(texts[1])
print(int_texts[1])
print(labels[1])
print(int_labels[1])

there are 4596 distinct labels in the corpus
there are 5500 sentences in the corpus
['Man', 'with', 'long', 'blond-hair', 'and', 'a', 'plate', 'of', 'food', 'in', 'his', 'hands', '.']
[16, 17, 18, 19, 20, 1, 21, 3, 22, 23, 24, 25, 15]
['B', 'O', 'B', 'I', 'O', 'B', 'I', 'I', 'I', 'O', 'B', 'I', 'O']
[1, 3, 1, 2, 3, 1, 2, 2, 2, 3, 1, 2, 3]


### Create the reverse dictionaries (int2label, int2token) to map integer labels and integer tokens back to labels and tokens 

In [5]:
# YOUR CODE HERE
int2label = {y: x for x, y in label2int.items()}
int2token = {y: x for x, y in token2int.items()}

## 2. Creating the training and validation data

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

max_len = 16
batch_size = 64
embed_size = 300
hidden_size = 128

#### Creating tensors

In [7]:
# YOUR CODE HERE
X = torch.zeros(len(int_texts), max_len).long()
Y = torch.zeros(len(int_labels), max_len).long()

for i, (text,label) in enumerate(zip(int_texts, int_labels)):
    length = min(max_len, len(text))
    X[i,:length] = torch.LongTensor(text[:length])
    Y[i,:length] = torch.LongTensor(label[:length])
    
print(X[12])
print(Y[12])

tensor([ 1, 16, 85, 86,  1, 87, 88, 36, 89,  1, 90, 15,  0,  0,  0,  0])
tensor([1, 2, 3, 3, 1, 2, 2, 3, 3, 1, 2, 3, 0, 0, 0, 0])


#### Create train and validation data



In [8]:
# YOUR CODE HERE
X_train = X[:5000]
Y_train = Y[:5000]
X_valid = X[:5000]
Y_valid = Y[:5000]

#### Use torch DataLoader to split training and validation data into batches



In [9]:
# YOUR CODE HERE
from torch.utils.data import TensorDataset, DataLoader
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size)

## Using pre-trained Fasttext embeddings


In [47]:
# YOUR CODE HERE
pretrained_weights = torch.zeros(len(token2int), 300)
with open('wiki.en.filtered.vec') as fp:
    for line in fp:
        tokens = line.strip().split()
        if tokens[0] in token2int:
            # map index in token2int to glove embedding of corresponding token
            pretrained_weights[token2int[tokens[0]]] = torch.FloatTensor([float(x) for x in tokens[1:]])

pretrained_weights[12][:10]

tensor([-0.0159,  0.2218, -0.0831,  0.3253, -0.0533,  0.1563,  0.3345,  0.0572,
        -0.1407, -0.0389])

## Create, train and evaluate your neural network

In [49]:
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(len(token2int), embed_size, padding_idx=token2int ['<eos>'])
        self.embed.weight = nn.Parameter(pretrained_weights, requires_grad=False)
        self.rnn = nn.GRU(embed_size, hidden_size, bias=False, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(hidden_size * 1 * 1, len(label2int))
        
    def forward(self, x):
        embed = self.embed(x)
        output, hidden = self.rnn(embed)
        return self.decision(self.dropout(output))

rnn_model = RNN()
rnn_model

RNN(
  (embed): Embedding(4596, 300, padding_idx=0)
  (rnn): GRU(300, 128, bias=False, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (decision): Linear(in_features=128, out_features=4, bias=True)
)

In [50]:
# Checking the size (batch_size, sequence_length, num_labels) of the output

with torch.no_grad():
  print(rnn_model(X[:2]).size())

torch.Size([2, 16, 4])


In [51]:
def perf(model, loader):
    criterion = nn.CrossEntropyLoss()
    model.eval()
    total_loss = correct = num_loss = num_perf = 0
    for x, y in loader:
      with torch.no_grad():
        y_scores = model(x)
        loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
        y_pred = torch.max(y_scores, 2)[1]
        mask = (y != 0)
        correct += torch.sum((y_pred.data == y) * mask)
        total_loss += loss.item()
        num_loss += len(y)
        num_perf += torch.sum(mask).item()
    return total_loss / num_loss, correct.item() / num_perf

perf(rnn_model, valid_loader)

(0.02180137758255005, 0.3628871889499991)

#### Define the training function 

In [52]:
def fit(model, epochs):
    criterion = nn.CrossEntropyLoss()
    # add gradients to all parameters (required by pytorch for training)
    optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()))
    for epoch in range(epochs):
        model.train()
        total_loss = num = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_scores = model(x)
            loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num += len(y)
        print(epoch, total_loss / num, *perf(model, valid_loader))

rnn_model = RNN()
fit(rnn_model, 10)

0 0.008177814197540283 0.002274954153597355 0.931044984682635
1 0.0015918059520423412 0.0011125849187374115 0.9697056557802899
2 0.0010381606325507164 0.0008231632005423307 0.9770329099410595
3 0.0008425289627164602 0.0007026651084423066 0.9807234095917161
4 0.0007496314894407987 0.0006334372773766517 0.9826761496981314
5 0.0006813538726419211 0.0005893108481541276 0.9839302029774808
6 0.0006317454492673278 0.0005267989452928304 0.9853275766316129
7 0.0005821802375838161 0.0005016943197697401 0.9863845643956359
8 0.0005378525465726852 0.00045420381259173156 0.9875311273938981
9 0.0004940528498962521 0.0003972992794588208 0.9897525932030312
