In [1]:
from collections import Counter
import codecs
import itertools
from functools import reduce
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init
from torch.nn.utils.rnn import pack_padded_sequence
import torch.utils.data as data_utils
import time
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
#from torchsummary import summary

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
def read_words_tags(file, word_ind, tag_ind, prob_ind, caseless=True):
    
    with codecs.open(file, 'r', 'utf-8') as f:
        lines = f.readlines()
    #print(lines)
    words = []
    tags = []
    probs = []
    temp_w = []
    temp_t = []
    temp_p = []
    
    for line in lines:
        if not (line.isspace()):
            feats = line.strip().split()
            temp_w.append(feats[word_ind].lower() if caseless else feats[word_ind])
            temp_t.append(feats[tag_ind])
            temp_p.append((float)(feats[prob_ind]))
        elif len(temp_w) > 0:
            assert len(temp_w) == len(temp_t)
            words.append(temp_w)
            tags.append(temp_t)
            probs.append(temp_p)
            temp_w = []
            temp_t = []
            temp_p = []
            
    if len(temp_w) > 0:
        assert len(temp_w) == len(temp_t)
        words.append(temp_w)
        tags.append(temp_t)
        probs.append(temp_p)
            
    assert len(words) == len(tags) == len(probs)
    #print(probs)
    
    return words, tags, probs


In [4]:
train_file = "datasets/train.txt"
dev_file = "datasets/dev.txt"

word_index = 1
tag_index = 5
prob_index = 4

caseless=True

t_words , t_tags , t_probs = read_words_tags(train_file,word_index,tag_index,prob_index,caseless)
d_words , d_tags , d_probs = read_words_tags(dev_file,word_index,tag_index,prob_index,caseless)

In [5]:
def create_maps(words, tags, min_word_freq=5, min_char_freq=1):
    
    word_freq = Counter()
    char_freq = Counter()
    tag_map = set()
    for w, t in zip(words, tags):
        word_freq.update(w)
        char_freq.update(list(reduce(lambda x, y: list(x) + [' '] + list(y), w)))
        tag_map.update(t)
    #print(word_freq)
    
    word_map = {k: v + 1 for v, k in enumerate([w for w in word_freq.keys() if word_freq[w] > min_word_freq])}
    char_map = {k: v + 1 for v, k in enumerate([c for c in char_freq.keys() if char_freq[c] > min_char_freq])}
    tag_map = {k: v + 1 for v, k in enumerate(tag_map)}

    word_map['<pad>'] = 0
    word_map['<end>'] = len(word_map)
    word_map['<unk>'] = len(word_map)
    char_map['<pad>'] = 0
    char_map['<end>'] = len(char_map)
    char_map['<unk>'] = len(char_map)
    tag_map['<pad>'] = 0
    tag_map['<start>'] = len(tag_map)
    tag_map['<end>'] = len(tag_map)
    #print(word_map)
    
    return word_map, char_map, tag_map


In [6]:
min_word_freq=1
min_char_freq=1

word_map, char_map, tag_map = create_maps(t_words+d_words,t_tags+d_tags,min_word_freq, min_char_freq)

In [7]:
print(len(word_map))

2009


In [8]:
def create_input_tensors(words, tags, probs, word_map, char_map, tag_map):
   
    # Encode sentences into word maps with <end> at the end
    # [['dunston', 'checks', 'in', '<end>']] -> [[4670, 4670, 185, 4669]]
    wmaps = list(map(lambda s: list(map(lambda w: word_map.get(w, word_map['<unk>']), s)) + [word_map['<end>']], words))

    # Forward and backward character streams
    # [['d', 'u', 'n', 's', 't', 'o', 'n', ' ', 'c', 'h', 'e', 'c', 'k', 's', ' ', 'i', 'n', ' ']]
    chars_f = list(map(lambda s: list(reduce(lambda x, y: list(x) + [' '] + list(y), s)) + [' '], words))
    # [['n', 'i', ' ', 's', 'k', 'c', 'e', 'h', 'c', ' ', 'n', 'o', 't', 's', 'n', 'u', 'd', ' ']]
    chars_b = list(
        map(lambda s: list(reversed([' '] + list(reduce(lambda x, y: list(x) + [' '] + list(y), s)))), words))

    # Encode streams into forward and backward character maps with <end> at the end
    # [[29, 2, 12, 8, 7, 14, 12, 3, 6, 18, 1, 6, 21, 8, 3, 17, 12, 3, 60]]
    cmaps_f = list(
        map(lambda s: list(map(lambda c: char_map.get(c, char_map['<unk>']), s)) + [char_map['<end>']], chars_f))
    # [[12, 17, 3, 8, 21, 6, 1, 18, 6, 3, 12, 14, 7, 8, 12, 2, 29, 3, 60]]
    cmaps_b = list(
        map(lambda s: list(map(lambda c: char_map.get(c, char_map['<unk>']), s)) + [char_map['<end>']], chars_b))

    # Positions of spaces and <end> character
    # Words are predicted or encoded at these places in the language and tagging models respectively
    # [[7, 14, 17, 18]] are points after '...dunston', '...checks', '...in', '...<end>' respectively
    cmarkers_f = list(map(lambda s: [ind for ind in range(len(s)) if s[ind] == char_map[' ']] + [len(s) - 1], cmaps_f))
    # Reverse the markers for the backward stream before adding <end>, so the words of the f and b markers coincide
    # i.e., [[17, 9, 2, 18]] are points after '...notsnud', '...skcehc', '...ni', '...<end>' respectively
    cmarkers_b = list(
        map(lambda s: list(reversed([ind for ind in range(len(s)) if s[ind] == char_map[' ']])) + [len(s) - 1],
            cmaps_b))

    # Encode tags into tag maps with <end> at the end
    tmaps = list(map(lambda s: list(map(lambda t: tag_map[t], s)) + [tag_map['<end>']], tags))
    
    # Since we're using CRF scores of size (prev_tags, cur_tags), find indices of target sequence in the unrolled scores
    # This will be row_index (i.e. prev_tag) * n_columns (i.e. tagset_size) + column_index (i.e. cur_tag)
    #tmaps = list(map(lambda s: [tag_map['<start>'] * len(tag_map) + s[0]] + [s[i - 1] * len(tag_map) + s[i] for i in range(1, len(s))], tmaps))
    # Note - the actual tag indices can be recovered with tmaps % len(tag_map)

    # Pad, because need fixed length to be passed around by DataLoaders and other layers
    word_pad_len = max(list(map(lambda s: len(s), wmaps)))
    char_pad_len = max(list(map(lambda s: len(s), cmaps_f)))

    # Sanity check
    assert word_pad_len == max(list(map(lambda s: len(s), tmaps)))

    padded_wmaps = []
    padded_cmaps_f = []
    padded_cmaps_b = []
    padded_cmarkers_f = []
    padded_cmarkers_b = []
    padded_tmaps = []
    wmap_lengths = []
    cmap_lengths = []
    padded_probs = []

    for w, cf, cb, cmf, cmb, t,p in zip(wmaps, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, tmaps,probs):
        # Sanity  checks
        assert len(w) == len(cmf) == len(cmb) == len(t)
        assert len(cmaps_f) == len(cmaps_b)

        # Pad
        # A note -  it doesn't really matter what we pad with, as long as it's a valid index
        # i.e., we'll extract output at those pad points (to extract equal lengths), but never use them

        padded_wmaps.append(w + [word_map['<pad>']] * (word_pad_len - len(w)))
        padded_cmaps_f.append(cf + [char_map['<pad>']] * (char_pad_len - len(cf)))
        padded_cmaps_b.append(cb + [char_map['<pad>']] * (char_pad_len - len(cb)))

        # 0 is always a valid index to pad markers with (-1 is too but torch.gather has some issues with it)
        padded_cmarkers_f.append(cmf + [0] * (word_pad_len - len(w)))
        padded_cmarkers_b.append(cmb + [0] * (word_pad_len - len(w)))

        padded_tmaps.append(t + [tag_map['<pad>']] * (word_pad_len - len(t)))
        padded_probs.append(p + [0] * (word_pad_len - len(p)))

        wmap_lengths.append(len(w))
        cmap_lengths.append(len(cf))

        # Sanity check
        assert len(padded_wmaps[-1]) == len(padded_tmaps[-1]) == len(padded_cmarkers_f[-1]) == len(
            padded_cmarkers_b[-1]) == word_pad_len == len(padded_probs[-1])
        assert len(padded_cmaps_f[-1]) == len(padded_cmaps_b[-1]) == char_pad_len

    padded_wmaps = torch.LongTensor(padded_wmaps)
    padded_cmaps_f = torch.LongTensor(padded_cmaps_f)
    padded_cmaps_b = torch.LongTensor(padded_cmaps_b)
    padded_cmarkers_f = torch.LongTensor(padded_cmarkers_f)
    padded_cmarkers_b = torch.LongTensor(padded_cmarkers_b)
    padded_tmaps = torch.LongTensor(padded_tmaps)
    wmap_lengths = torch.LongTensor(wmap_lengths)
    cmap_lengths = torch.LongTensor(cmap_lengths)
    padded_probs = torch.FloatTensor(padded_probs)
    
    return padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, padded_tmaps, wmap_lengths, cmap_lengths , padded_probs

In [9]:
batch_size = 10
workers = 1

padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, padded_tmaps, wmap_lengths, cmap_lengths , padded_probs = create_input_tensors(t_words, t_tags,t_probs, word_map, char_map, tag_map)
#print(wmap_lengths)
t_inputs = data_utils.TensorDataset(padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, padded_tmaps, wmap_lengths, cmap_lengths , padded_probs)
train_loader = torch.utils.data.DataLoader(t_inputs, batch_size = batch_size, shuffle=True, num_workers=workers, pin_memory=False)

padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, padded_tmaps, wmap_lengths, cmap_lengths , padded_probs = create_input_tensors(d_words, d_tags,d_probs, word_map, char_map, tag_map)
d_inputs = data_utils.TensorDataset(padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, padded_tmaps, wmap_lengths, cmap_lengths , padded_probs)
val_loader = torch.utils.data.DataLoader(d_inputs, batch_size = batch_size, shuffle=True, num_workers=workers, pin_memory=False)

In [10]:
def init_embedding(input_embedding):
    """
    Initialize embedding tensor with values from the uniform distribution.
    :param input_embedding: embedding tensor
    :return:
    """
    bias = np.sqrt(3.0 / input_embedding.size(1))
    nn.init.uniform_(input_embedding, -bias, bias)

def load_embeddings(emb_file, word_map, expand_vocab=True):
    """
    Load pre-trained embeddings for words in the word map.
    :param emb_file: file with pre-trained embeddings (in the GloVe format)
    :param word_map: word map
    :param expand_vocab: expand vocabulary of word map to vocabulary of pre-trained embeddings?
    :return: embeddings for words in word map, (possibly expanded) word map,
            number of words in word map that are in-corpus (subject to word frequency threshold)
    """
    with open(emb_file, 'r') as f:
        emb_len = len(f.readline().split(' ')) - 1

    print("Embedding length is %d." % emb_len)

    # Create tensor to hold embeddings for words that are in-corpus
    ic_embs = torch.FloatTensor(len(word_map), emb_len)
    init_embedding(ic_embs)

    if expand_vocab:
        print("You have elected to include embeddings that are out-of-corpus.")
        ooc_words = []
        ooc_embs = []
    else:
        print("You have elected NOT to include embeddings that are out-of-corpus.")

    # Read embedding file
    print("\nLoading embeddings...")
    for line in open(emb_file, 'r',encoding="utf8"):
        line = line.split(' ')
        emb_word = line[0]
        embedding = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:])))

        if not expand_vocab and emb_word not in word_map:
            continue

        # If word is in train_vocab, store at the correct index (as in the word_map)
        if emb_word in word_map:
            ic_embs[word_map[emb_word]] = torch.FloatTensor(embedding)

        # If word is in dev or test vocab, store it and its embedding into lists
        elif expand_vocab:
            ooc_words.append(emb_word)
            ooc_embs.append(embedding)

    lm_vocab_size = len(word_map)  # keep track of lang. model's output vocab size (no out-of-corpus words)

    if expand_vocab:
        print("'word_map' is being updated accordingly.")
        for word in ooc_words:
            word_map[word] = len(word_map)
        ooc_embs = torch.FloatTensor(np.asarray(ooc_embs))
        embeddings = torch.cat([ic_embs, ooc_embs], 0)

    else:
        embeddings = ic_embs

    # Sanity check
    assert embeddings.size(0) == len(word_map)

    print("\nDone.\n Embedding vocabulary: %d\n Language Model vocabulary: %d.\n" % (len(word_map), lm_vocab_size))

    return embeddings, word_map, lm_vocab_size

In [None]:
emb_file = "glove.6B/glove.6B.50d.txt"
expand_vocab = False
word_emb_dim = 50

embeddings, word_map, lm_vocab_size = load_embeddings(emb_file, word_map,expand_vocab)


Embedding length is 50.
You have elected NOT to include embeddings that are out-of-corpus.

Loading embeddings...


In [32]:
print(embeddings.size())
print(lm_vocab_size)

torch.Size([2009, 50])
2009


In [60]:
class Attention(nn.Module):
    """Attention mechanism written by Gustavo Aguilar https://github.com/gaguilar"""
    def __init__(self,  hidden_size):
        super(Attention, self).__init__()
        self.da = hidden_size
        self.dh = hidden_size

        self.W = nn.Linear(self.dh, self.da)        # (feat_dim, attn_dim)
        self.v = nn.Linear(self.da, 1)              # (attn_dim, 1)

    def forward(self, inputs, mask):
        # Raw scores
        u = self.v(torch.tanh(self.W(inputs)))      # (batch, seq, hidden) -> (batch, seq, attn) -> (batch, seq, 1)

        # Masked softmax
        u = u.exp()                                 # exp to calculate softmax
        u = mask.unsqueeze(2).float() * u           # (batch, seq, 1) * (batch, seq, 1) to zerout out-of-mask numbers
        sums = torch.sum(u, dim=1, keepdim=True)    # now we are sure only in-mask values are in sum
        a = u / sums                                # the probability distribution only goes to in-mask values now

        # Weighted vectors
        z = inputs * a

        return  z,  a.view(inputs.size(0), inputs.size(1))


class LM_LSTM_CRF(nn.Module):

    def __init__(self, target_size, hidden_size, vocab_size, word_emb_dim, word_rnn_dim, dropout):
        
        super(LM_LSTM_CRF, self).__init__()

        self.target_size = target_size  # this is the size of the output vocab of the tagging model
        self.hidden_size = hidden_size

        self.wordset_size = lm_vocab_size  # this is the size of the input vocab (embedding layer) of the tagging model
        self.word_emb_dim = word_emb_dim
        self.word_rnn_dim = word_rnn_dim
        self.word_rnn_layers = 2

        self.dropout = nn.Dropout(p=dropout)

        self.word_embeds = nn.Embedding(self.wordset_size, self.word_emb_dim)  # word embedding layer
        self.word_blstm = nn.LSTM(self.word_emb_dim, self.word_rnn_dim, num_layers=self.word_rnn_layers, bidirectional=True, dropout=dropout)  # word BLSTM
        
        self.attention = Attention(self.word_rnn_dim*2)
        
        self.fc1 = nn.Linear(self.word_rnn_dim*2, self.hidden_size)  
        self.fc2 = nn.Linear(self.hidden_size, self.target_size)
        
    def init_word_embeddings(self, embeddings):
        """
        Initialize embeddings with pre-trained embeddings.
        :param embeddings: pre-trained embeddings
        """
        self.word_embeds.weight = nn.Parameter(embeddings)

    def fine_tune_word_embeddings(self, fine_tune=False):
        """
        Fine-tune embedding layer? (Not fine-tuning only makes sense if using pre-trained embeddings).
        :param fine_tune: Fine-tune?
        """
        for p in self.word_embeds.parameters():
            p.requires_grad = fine_tune

    def forward(self, wmaps, probs, wmap_lengths):
        
        self.batch_size = wmaps.size(0)
        self.word_pad_len = wmaps.size(1)

        # Sort by decreasing true word sequence length
        wmap_lengths, word_sort_ind = wmap_lengths.sort(dim=0, descending=True)
        wmaps = wmaps[word_sort_ind]
        probs = probs[word_sort_ind]
        
        # Embedding look-up for words
        w = self.word_embeds(wmaps)  # (batch_size, word_pad_len, word_emb_dim)
        w = self.dropout(w)

        # Pack padded sequence
        w = pack_padded_sequence(w, list(wmap_lengths), batch_first=True)  # packed sequence of word_emb_dim + 2 * char_rnn_dim, with real sequence lengths
        
        # LSTM
        w, _ = self.word_blstm(w)  # packed sequence of word_rnn_dim, with real sequence lengths

        # Unpack packed sequence
        w, _ = pad_packed_sequence(w, batch_first=True)  # (batch_size, max_word_len_in_batch, word_rnn_dim)
        w = self.dropout(w)
        
        mask = []
        for i in range(len(wmap_lengths)):
            mask_row = (np.concatenate([np.ones(wmap_lengths[i]),np.zeros(len(wmaps[i])-wmap_lengths[i])])).tolist()
            mask.append(mask_row)
        #Attention
        att_output, att_weights = self.attention(w, torch.from_numpy(np.asarray(mask)).float())
        
        # fc layers
        w = torch.relu(self.fc1(att_output)) 
        w = self.dropout(w)
        
        # final score
        scores = torch.sigmoid(self.fc2(w)) 
                
        return scores, wmaps, probs, wmap_lengths, word_sort_ind

In [61]:
word_rnn_dim = 50 
dropout = 0.3
fine_tune_word_embeddings = False

model = LM_LSTM_CRF(1, 20, lm_vocab_size, word_emb_dim, word_rnn_dim, dropout)
print(model)
        
model.init_word_embeddings(embeddings.to(device))  # initialize embedding layer with pre-trained embeddings
model.fine_tune_word_embeddings(fine_tune_word_embeddings)  # fine-tune
optimizer = optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=0.0001)

loss_fn = nn.BCELoss().to(device)

LM_LSTM_CRF(
  (dropout): Dropout(p=0.3, inplace=False)
  (word_embeds): Embedding(2009, 50)
  (word_blstm): LSTM(50, 50, num_layers=2, dropout=0.3, bidirectional=True)
  (attention): Attention(
    (W): Linear(in_features=100, out_features=100, bias=True)
    (v): Linear(in_features=100, out_features=1, bias=True)
  )
  (fc1): Linear(in_features=100, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=1, bias=True)
)


In [62]:
class AverageMeter(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [63]:
def train(train_loader, model, loss_fn, optimizer, epoch, print_freq = 25):
    
    model.train()  # training mode enables dropout

    batch_time = AverageMeter()  # forward prop. + back prop. time per batch
    data_time = AverageMeter()  # data loading time per batch
    losses = AverageMeter()  # cross entropy loss
    f1s = AverageMeter()  # f1 score

    start = time.time()

    # Batches
    for i, (wmaps, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, tmaps, wmap_lengths, cmap_lengths, probs) in enumerate(train_loader):
        
        data_time.update(time.time() - start)
        max_word_len = max(wmap_lengths.tolist())

        # Reduce batch's padded length to maximum in-batch sequence
        # This saves some compute on nn.Linear layers (RNNs are unaffected, since they don't compute over the pads)
        wmaps = wmaps[:, :max_word_len].to(device)
        probs = probs[:, :max_word_len].to(device)
        wmap_lengths = wmap_lengths.to(device)
        
        
        # Forward prop.
        scores, wmaps_sorted, probs_sorted, wmap_lengths_sorted, _ = model(wmaps, probs, wmap_lengths)
               
        # We don't predict the next word at the pads or <end> tokens
        # We will only predict at [dunston, checks, in] among [dunston, checks, in, <end>, <pad>, <pad>, ...]
        # So, prediction lengths are word sequence lengths - 1
        lm_lengths = wmap_lengths_sorted - 1
        lm_lengths = lm_lengths.tolist()
        
        # loss
        probs_sorted.resize_(scores.size())  
        
        scores = pack_padded_sequence(scores, lm_lengths, batch_first=True).data
        targets = pack_padded_sequence(probs_sorted, lm_lengths, batch_first=True).data
        loss = loss_fn(scores,targets)

        # Back prop.
        optimizer.zero_grad()
        loss.backward()

#         grad_clip = True
#         if grad_clip is not None:
#             clip_gradient(optimizer, grad_clip)

        optimizer.step()

        # Keep track of metrics
        losses.update(loss.item(), sum(lm_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print training status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(train_loader),
                                                          batch_time=batch_time,
                                                          data_time=data_time, loss=losses))



In [64]:
epochs = 20
for epoch in range(0, epochs):

        # One epoch's training
        train(train_loader, model, loss_fn, optimizer, epoch)
        print("\n")

Epoch: [0][0/275]	Batch Time 3.110 (3.110)	Data Load Time 2.634 (2.634)	Loss 0.6661 (0.6661)	
Epoch: [0][25/275]	Batch Time 0.085 (0.244)	Data Load Time 0.001 (0.103)	Loss 0.6755 (0.6672)	
Epoch: [0][50/275]	Batch Time 0.080 (0.160)	Data Load Time 0.001 (0.053)	Loss 0.6636 (0.6662)	
Epoch: [0][75/275]	Batch Time 0.047 (0.126)	Data Load Time 0.000 (0.036)	Loss 0.6593 (0.6658)	
Epoch: [0][100/275]	Batch Time 0.070 (0.110)	Data Load Time 0.001 (0.027)	Loss 0.6569 (0.6647)	
Epoch: [0][125/275]	Batch Time 0.057 (0.101)	Data Load Time 0.002 (0.022)	Loss 0.6469 (0.6633)	
Epoch: [0][150/275]	Batch Time 0.071 (0.095)	Data Load Time 0.001 (0.018)	Loss 0.6398 (0.6613)	
Epoch: [0][175/275]	Batch Time 0.065 (0.090)	Data Load Time 0.001 (0.016)	Loss 0.6354 (0.6591)	
Epoch: [0][200/275]	Batch Time 0.051 (0.086)	Data Load Time 0.002 (0.014)	Loss 0.6291 (0.6562)	
Epoch: [0][225/275]	Batch Time 0.061 (0.084)	Data Load Time 0.001 (0.013)	Loss 0.6149 (0.6530)	
Epoch: [0][250/275]	Batch Time 0.055 (0.082)	

Epoch: [7][225/275]	Batch Time 0.058 (0.080)	Data Load Time 0.001 (0.004)	Loss 0.5197 (0.5516)	
Epoch: [7][250/275]	Batch Time 0.073 (0.078)	Data Load Time 0.001 (0.004)	Loss 0.5537 (0.5507)	


Epoch: [8][0/275]	Batch Time 1.098 (1.098)	Data Load Time 0.895 (0.895)	Loss 0.5367 (0.5367)	
Epoch: [8][25/275]	Batch Time 0.058 (0.216)	Data Load Time 0.001 (0.036)	Loss 0.6219 (0.5398)	
Epoch: [8][50/275]	Batch Time 0.063 (0.143)	Data Load Time 0.002 (0.019)	Loss 0.5645 (0.5445)	
Epoch: [8][75/275]	Batch Time 0.059 (0.117)	Data Load Time 0.000 (0.013)	Loss 0.6175 (0.5475)	
Epoch: [8][100/275]	Batch Time 0.054 (0.104)	Data Load Time 0.000 (0.010)	Loss 0.5430 (0.5483)	
Epoch: [8][125/275]	Batch Time 0.055 (0.096)	Data Load Time 0.001 (0.008)	Loss 0.5658 (0.5496)	
Epoch: [8][150/275]	Batch Time 0.057 (0.091)	Data Load Time 0.001 (0.007)	Loss 0.5800 (0.5495)	
Epoch: [8][175/275]	Batch Time 0.070 (0.087)	Data Load Time 0.001 (0.006)	Loss 0.5207 (0.5493)	
Epoch: [8][200/275]	Batch Time 0.065 (0.084

Epoch: [15][150/275]	Batch Time 0.071 (0.078)	Data Load Time 0.001 (0.006)	Loss 0.5754 (0.5360)	
Epoch: [15][175/275]	Batch Time 0.069 (0.076)	Data Load Time 0.000 (0.005)	Loss 0.5061 (0.5358)	
Epoch: [15][200/275]	Batch Time 0.075 (0.076)	Data Load Time 0.001 (0.004)	Loss 0.5072 (0.5364)	
Epoch: [15][225/275]	Batch Time 0.061 (0.075)	Data Load Time 0.001 (0.004)	Loss 0.5522 (0.5373)	
Epoch: [15][250/275]	Batch Time 0.078 (0.075)	Data Load Time 0.001 (0.004)	Loss 0.5318 (0.5369)	


Epoch: [16][0/275]	Batch Time 0.918 (0.918)	Data Load Time 0.838 (0.838)	Loss 0.4992 (0.4992)	
Epoch: [16][25/275]	Batch Time 0.068 (0.108)	Data Load Time 0.001 (0.033)	Loss 0.5417 (0.5397)	
Epoch: [16][50/275]	Batch Time 0.060 (0.087)	Data Load Time 0.001 (0.017)	Loss 0.5870 (0.5379)	
Epoch: [16][75/275]	Batch Time 0.074 (0.081)	Data Load Time 0.001 (0.012)	Loss 0.5071 (0.5336)	
Epoch: [16][100/275]	Batch Time 0.057 (0.077)	Data Load Time 0.001 (0.009)	Loss 0.5371 (0.5335)	
Epoch: [16][125/275]	Batch Time 0

In [65]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

def fix_padding(scores_numpy, label_probs,  mask_numpy):
    #if len(scores_numpy) != len(mask_numpy):
    #    print("Error: len(scores_numpy) != len(mask_numpy)")
    #assert len(scores_numpy) == len(mask_numpy)
    #if len(label_probs) != len(mask_numpy):
    #    print("len(label_probs) != len(mask_numpy)")
    #assert len(label_probs) == len(mask_numpy)

    all_scores_no_padd = []
    all_labels_no_pad = []
    for i in range(len(mask_numpy)):
        all_scores_no_padd.append(scores_numpy[i][:mask_numpy[i]])
        all_labels_no_pad.append(label_probs[i][:mask_numpy[i]])

    assert len(all_scores_no_padd) == len(all_labels_no_pad)
    return all_scores_no_padd, all_labels_no_pad

def match_M(batch_scores_no_padd, batch_labels_no_pad):

    top_m = [1, 2, 3, 4]
    batch_num_m=[]
    batch_score_m=[]
    for m in top_m:
        intersects_lst = []
        # exact_lst = []
        score_lst = []
        ############################################### computing scores:
        for s in batch_scores_no_padd:
            if len(s) <=m:
                continue
            h = m
            # if len(s) > h:
            #     while (s[np.argsort(s)[-h]] == s[np.argsort(s)[-(h + 1)]] and h < (len(s) - 1)):
            #         h += 1

            s = np.asarray(s)
            #ind_score = np.argsort(s)[-h:]
            ind_score = sorted(range(len(s)), key = lambda sub: s[sub])[-h:]
            score_lst.append(ind_score)

        ############################################### computing labels:
        label_lst = []
        for l in batch_labels_no_pad:
            if len(l) <=m:
                continue
            # if it contains several top values with the same amount
            h = m
            if len(l) > h:
                while (l[np.argsort(l)[-h]] == l[np.argsort(l)[-(h + 1)]] and h < (len(l) - 1)):
                    h += 1
            l = np.asarray(l)
            ind_label = np.argsort(l)[-h:]
            label_lst.append(ind_label)

        ############################################### :

        for i in range(len(score_lst)):
            intersect = intersection(score_lst[i], label_lst[i])
            intersects_lst.append((len(intersect))/(min(m, len(score_lst[i]))))
            # sorted_score_lst = sorted(score_lst[i])
            # sorted_label_lst =  sorted(label_lst[i])
            # if sorted_score_lst==sorted_label_lst:
            #     exact_lst.append(1)
            # else:
            #     exact_lst.append(0)
        batch_num_m.append(len(score_lst))
        batch_score_m.append(sum(intersects_lst))
    return batch_num_m, batch_score_m


#Validation
num_m = [0, 0, 0, 0]
score_m = [0, 0, 0, 0]
with torch.no_grad():
    for i, (wmaps, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, tmaps, wmap_lengths, cmap_lengths, probs) in enumerate(val_loader):


            max_word_len = max(wmap_lengths.tolist())

            # Reduce batch's padded length to maximum in-batch sequence
            # This saves some compute on nn.Linear layers (RNNs are unaffected, since they don't compute over the pads)
            wmaps = wmaps[:, :max_word_len].to(device)
            probs = probs[:, :max_word_len].to(device)
            wmap_lengths = wmap_lengths.to(device)


            # Forward prop.
            scores, wmaps_sorted, probs_sorted, wmap_lengths_sorted, _ = model(wmaps, probs, wmap_lengths)
            lm_lengths = wmap_lengths_sorted - 1
            lm_lengths = lm_lengths.tolist()
            batch_scores_no_padd, batch_labels_no_pad = fix_padding(scores, probs_sorted, lm_lengths)
            #print(type(batch_scores_no_padd))
            batch_num_m, batch_score_m = match_M(batch_scores_no_padd, batch_labels_no_pad)
            num_m = [sum(i) for i in zip(num_m, batch_num_m)]
            score_m = [sum(i) for i in zip(score_m, batch_score_m)]
            
    m_score = [i/j for i,j in zip(score_m, num_m)]
    print(m_score)
        

[0.42091836734693877, 0.603448275862069, 0.6988636363636362, 0.7318181818181818]
