<a href="https://colab.research.google.com/github/Sylar257/Sequence_Labeling_Project/blob/master/Sequence_labeling_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from utils import *
import torch.nn.functional as F
decive = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
class Highway(nn.Module):
    """
    Highway network
    """
    def __init__(self, size, num_layers=1, dropout=0.5):
        """
        size: size of Linear layer (should match input size)
        num_layers: number of transform and gate layers
        dropout: dropout rate
        """
        super(Highway, self).__init__()
        self.size = size
        self.num_layers = num_layers
        self.transform = nn.ModuleList() # A list of transform layers
        self.gate = nn.ModuleList()      # A list of gate layers
        self.dropout = torch.nn.Dropout(p=dropout)

        for i in range(num_layers):
            transform = nn.Linear(size, size)
            gate = nn.Linear(size, size)
            self.transform.append(transform)
            self.gate.append(gate)
    def forward(self, x):
        """
        Forward-prop.
        Returns a tensor with the same dimensions as input tensor
        """

        transformed = F.relu(self.transform[0](x))  # transform with the first transform layer
        g = F.sigmoid(self.gate[0](x))              # calculate how much of the transformed input to keep

        out = self.dropout(g*transformed + (1-g)*x)               # combine input and transformed input with ratio of g

        # If there are additional layers
        for i in range(self.num_layers):
            transformed = F.relu(self.transform[i](out))
            g = F.sigmoid(self.gate[i](out))
            out = self.dropout(g*transformed+(1-g)*out)

        return out

In [0]:
class CRF(nn.Module):
    """
    Confitional Random Field
    """

    def __init__(self, hidden_dim, tagset_size):
        """
        hidden_dim: the size of word/BLSTM's output (which is the input size for CRF)
        tagset_size: number of tags(depending on our dataset)
        """

        super(CRF, self).__init__()
        self.tagset_size = tagset_size
        self.emission = nn.Linear(hidden_dim, self.tagset_size)
        self.transition = nn.Parameter(torch.Tensor(self.tagset_size, self.tagset_size))
        self.transition.data.zero_() # initializa the transition matrix to be all zeros

    def forward(self, feats):
        """
        feats:   output of word/BLSTM, a tensor of dimensions-(batch_size, timesteps, hidden_dim)
        returns: CRF scores, a tensor of dimensions-(batch_size, timesteps, tagset_size, tagset_size)
        """
        self.batch_size = feats.size(0)
        self.timesteps  = feats.size(1)

        emission_scores  = self.emission(feats)  # (batch_size, timesteps, tagset_size)
        # here we broadcast emission_score in order to compute the total score later with transition score
        emission_scores  = emission_score.unsqueeze(2).expand(self.batch_size, self.timesteps, self.tagset_size,
                                                              self.tagset_size)  # (batch_size, timesteps, tagset_size, tagset_size)

        crf_scores = emission_scores + self.transition.unsqueeze(0).unsqueeze(0)  # (batch_size, timesteps, tagset_size, tagset_size)
        return crf_scores

In [0]:
class ViterbiLoss(nn.Module):
    """
    Viterbi Loss.
    """

    def __init__(self, tag_map):
        """
        :param tag_map: tag map
        """
        super(ViterbiLoss, self).__init__()
        self.tagset_size = len(tag_map)
        self.start_tag = tag_map['<start>']
        self.end_tag = tag_map['<end>']

    def forward(self, scores, targets, lengths):
        """
        Forward propagation.
        :param scores: CRF scores
        :param targets: true tags indices in unrolled CRF scores
        :param lengths: word sequence lengths
        :return: viterbi loss
        """

        batch_size = scores.size(0)
        word_pad_len = scores.size(1)

        # Gold score

        targets = targets.unsqueeze(2)
        scores_at_targets = torch.gather(scores.view(batch_size, word_pad_len, -1), 2, targets).squeeze(
            2)  # (batch_size, word_pad_len)

        # Everything is already sorted by lengths
        scores_at_targets, _ = pack_padded_sequence(scores_at_targets, lengths, batch_first=True)
        gold_score = scores_at_targets.sum()

        # All paths' scores

        # Create a tensor to hold accumulated sequence scores at each current tag
        scores_upto_t = torch.zeros(batch_size, self.tagset_size).to(device)

        for t in range(max(lengths)):
            batch_size_t = sum([l > t for l in lengths])  # effective batch size (sans pads) at this timestep
            if t == 0:
                scores_upto_t[:batch_size_t] = scores[:batch_size_t, t, self.start_tag, :]  # (batch_size, tagset_size)
            else:
                # We add scores at current timestep to scores accumulated up to previous timestep, and log-sum-exp
                # Remember, the cur_tag of the previous timestep is the prev_tag of this timestep
                # So, broadcast prev. timestep's cur_tag scores along cur. timestep's cur_tag dimension
                scores_upto_t[:batch_size_t] = log_sum_exp(
                    scores[:batch_size_t, t, :, :] + scores_upto_t[:batch_size_t].unsqueeze(2),
                    dim=1)  # (batch_size, tagset_size)

        # We only need the final accumulated scores at the <end> tag
        all_paths_scores = scores_upto_t[:, self.end_tag].sum()

        viterbi_loss = all_paths_scores - gold_score
        viterbi_loss = viterbi_loss / batch_size

        return viterbi_loss

In [0]:
class LM_LSTM_CRF(nn.Module):
    """
    The encompassing LM-LSTM-CRF
    """
    def __init__(self, tagset_size, charset_size, char_emb_dim, char_rnn_dim, char_rnn_layers, vocab_size,
                 lm_vocab_size, word_emb_dim, word_rnn_dim, word_rnn_layers, dropout, highway_layers=1):
        """
        tagset_size:   number of tags
        charset_size:  size of character vocabulary
        char_emb_dim:  size of character embeddings
        char_rnn_dim:  size of charactor RNNS/LSTMs
        char_rnn_layers: number of layers in character RNN/LSTMs
        vocab_size:    input vocabulary size
        lm_vocab_size: vocabulary size of language models (in-corpus words subject to word frequency threshold)
        word_emb_dim:  size of word embeddings
        word_rnn_dim:  size of word RNN/BLSTM
        word_rnn_layers: number of layers in word RNNs/LSTMs
        dropout:       dropout
        highway_layers: number of transform and gate layers
        """
        
        super(LM_LSTM_CRF, self).__init__()

        self.tagset_size  = tagset_size # this is the size of the outout vocab of the tagging model

        self.charset_size = charset_size
        self.char_emb_dim = char_emb_dim
        self.char_rnn_dim = char_rnn_dim
        self.char_rnn_layers = char_rnn_layers

        self.wordset_size  = vocab_size     # this is the size of the input vocab (embedding layer) of the tagging model
        self.lm_vocab_size = lm_vocab_size  # this is the size of the output vocab of the language model
        self.word_emb_dim  = word_emb_dim
        self.word_rnn_dim  = word_rnn_dim
        self.word_rnn_layers = word_rnn_layers
        
        self.highway_layers = highway_layers

        self.dropout = nn.Dropout(p=dropout)

        # charactor embedding layer
        self.char_embeds = nn.Embedding(num_embeddings=self.charset_size, embedding_dim=self.char_emb_dim)  

        # forward char LSTM
        self.forw_char_lstm = nn.LSTM(input_size=self.char_emb_dim, hidden_size=self.char_rnn_dim, 
                                      num_layers=self.char_rnn_layers, bidirectional=False, dropout = dropout)
        # backward char LSTM
        self.back_char_lstm = nn.LSTM(input_size=self.char_emb_dim, hidden_size=self.char_rnn_dim,
                                      num_layers=self.char_rnn_layers, bidirectional=False, dropout = dropout)
        
        # word embedding layer
        self.word_embeds = nn.Embedding(num_embeddings=self.wordset_size,embedding_dim=self.word_emb_dim)
        # Define word-level bidirection LSTM
        # Take note on the hidden_size
        self.word_blstm   = nn.LSTM(input_size=(self.word_emb_dim+self.char_emb_dim*2), 
                                    hidden_size=self.word_rnn_dim//2, 
                                    # This is because Bi-directional LSTM will concat forward and backward output
                                    # therefore we specify word_rnn_dim//2 but will get output size of word_rnn_dim
                                    num_layers=self.word_rnn_layers,
                                    bidirectional=True,
                                    dropout=dropout
                                    )
        
        # Conditinoal Random Field layer
        self.crf = CRF(hidden_dim=self.word_rnn_dim,tagset_size=self.tagset_size)

        # 3 places that we implemented highway connections
        self.forw_lm_hw = Highway(size=self.char_rnn_dim,
                                  num_layers=self.highway_layers,
                                  dropout=dropout)
        self.back_lm_hw = Highway(size=self.char_emb_dim,
                                  num_layers=self.highway_layers,
                                  dropout=dropout)
        self.subword_hw = Highway(2 * self.char_rnn_dim, 
                                  num_layers=self.highway_layers,
                                  dropout=dropout)
        
        # Linear layers for language models, They are used for "muti-task training" for language models (predicting next word)
        self.forw_lm_out = nn.Linear(self.char_rnn_dim, self.lm_vocab_size)
        self.back_lm_out = nn.Linear(self.char_rnn_dim, self.lm_vocab_size)

    def init_word_embedding(self, embedding):
        """
        Initialize embeddings with pre-trained embeddings.

        embedding: pre-trained embeddings to be loaded
        """
        self.word_embeds.weights = nn.Parameter(embeddings)

    def fine_tune_word_embeddings(self, fine_tune=False):
        """
        Fine-tune embedding layer? (if using pre-trained embedding layer, consider no fine-tuning)

        fine_tune: bool decides if fine_tune
        """
        for p in self.word_embeds.parameters():
            p.requires_grad = fine_tune
    
    def forward(self, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, wmaps, tmaps, wmap_lengths, cmap_lengths):
        """
        cmaps_f: padded encoded forward  character sequences. (batch_size, char_pad_len)
        cmaps_b: padded encoded backward character sequences. (batch_size, char_pad_len)
        cmarker_f: padded forward character markers.          (batch_size, word_pad_len)
        cmarker_b: padded backward character markers.         (batch_size, word_pad_len)
        wmaps: padded encoded word sequences.                 (batch_size, word_pad_len)
        tmaps: padded tag sequences.                          (batch_size, word_pad_len)
        wmap_lengths: word sequence lengths.                  (batch_size)
        cmap_lengths: character sequence lengths              (batch_size, word_pad_len)
        """

        self.batch_size   = cmaps_f.size(0)
        self.word_pad_len = cmarker_f.size(1)

        # Sort by decreasing true char. sequence length for grouping up for padding later
        cmap_lengths, char_sort_idx = cmap_lengths.sort(dim=0, descending=True)
        

In [7]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive/My Drive


In [8]:
!ls

 2018
 2019
'Attention in deep learning.md'
'Colab Notebooks'
 data
 Enoava
'LSTM in PyTorch.md'
'LSTM in PyTorch.pdf'
'Response to comments_MECHMT_2018_1107_-V4_17_Dec.docx'
 Snaps
 Stack-presentation-Dermotologist.png
'Starwars Project Video V2.mp4'
 YOLO.md
 YOLO.pdf


In [0]:
%cd /gdrive/My\ Drive/data/embeddings

/gdrive/My Drive/data/embeddings


In [0]:
!ls

glove.6B.100d.txt


In [0]:
!pwd

/gdrive/My Drive/data/embeddings


In [0]:
from torch.utils.data import Dataset
# Rewrite the __getitem__ and add __len__
class WCDataset(Dataset):
    """
    PyTorch Dataset for the LM-LSTM-CRF model. To be used by a PyTorch DataLoader to feed batches to the model.
    """

    def __init__(self, wmaps, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, tmaps, wmap_lengths, cmap_lengths):
        """
        :param wmaps: padded encoded word sequences
        :param cmaps_f: padded encoded forward character sequences
        :param cmaps_b: padded encoded backward character sequences
        :param cmarkers_f: padded forward character markers
        :param cmarkers_b: padded backward character markers
        :param tmaps: padded encoded tag sequences (indices in unrolled CRF scores)
        :param wmap_lengths: word sequence lengths
        :param cmap_lengths: character sequence lengths
        """
        self.wmaps = wmaps
        self.cmaps_f = cmaps_f
        self.cmaps_b = cmaps_b
        self.cmarkers_f = cmarkers_f
        self.cmarkers_b = cmarkers_b
        self.tmaps = tmaps
        self.wmap_lengths = wmap_lengths
        self.cmap_lengths = cmap_lengths

        self.data_size = self.wmaps.size(0)

    def __getitem__(self, i):
        return self.wmaps[i], self.cmaps_f[i], self.cmaps_b[i], self.cmarkers_f[i], self.cmarkers_b[i], self.tmaps[i], \
               self.wmap_lengths[i], self.cmap_lengths[i]

    def __len__(self):
        return self.data_size

In [0]:
class ViterbiDecoder():
    """
    Viterbi Decoder.
    """

    def __init__(self, tag_map):
        """
        :param tag_map: tag map
        """
        self.tagset_size = len(tag_map)
        self.start_tag = tag_map['<start>']
        self.end_tag = tag_map['<end>']

    def decode(self, scores, lengths):
        """
        :param scores: CRF scores
        :param lengths: word sequence lengths
        :return: decoded sequences
        """
        batch_size = scores.size(0)
        word_pad_len = scores.size(1)

        # Create a tensor to hold accumulated sequence scores at each current tag
        scores_upto_t = torch.zeros(batch_size, self.tagset_size)

        # Create a tensor to hold back-pointers
        # i.e., indices of the previous_tag that corresponds to maximum accumulated score at current tag
        # Let pads be the <end> tag index, since that was the last tag in the decoded sequence
        backpointers = torch.ones((batch_size, max(lengths), self.tagset_size), dtype=torch.long) * self.end_tag

        for t in range(max(lengths)):
            batch_size_t = sum([l > t for l in lengths])  # effective batch size (sans pads) at this timestep
            if t == 0:
                scores_upto_t[:batch_size_t] = scores[:batch_size_t, t, self.start_tag, :]  # (batch_size, tagset_size)
                backpointers[:batch_size_t, t, :] = torch.ones((batch_size_t, self.tagset_size),
                                                               dtype=torch.long) * self.start_tag
            else:
                # We add scores at current timestep to scores accumulated up to previous timestep, and
                # choose the previous timestep that corresponds to the max. accumulated score for each current timestep
                scores_upto_t[:batch_size_t], backpointers[:batch_size_t, t, :] = torch.max(
                    scores[:batch_size_t, t, :, :] + scores_upto_t[:batch_size_t].unsqueeze(2),
                    dim=1)  # (batch_size, tagset_size)

        # Decode/trace best path backwards
        decoded = torch.zeros((batch_size, backpointers.size(1)), dtype=torch.long)
        pointer = torch.ones((batch_size, 1),
                             dtype=torch.long) * self.end_tag  # the pointers at the ends are all <end> tags

        for t in list(reversed(range(backpointers.size(1)))):
            decoded[:, t] = torch.gather(backpointers[:, t, :], 1, pointer).squeeze(1)
            pointer = decoded[:, t].unsqueeze(1)  # (batch_size, 1)

        # Sanity check
        assert torch.equal(decoded[:, 0], torch.ones((batch_size), dtype=torch.long) * self.start_tag)

        # Remove the <starts> at the beginning, and append with <ends> (to compare to targets, if any)
        decoded = torch.cat([decoded[:, 1:], torch.ones((batch_size, 1), dtype=torch.long) * self.start_tag],
                            dim=1)

        return decoded

In [0]:
import time
import torch
import torch.optim as optim
import os
import sys
from utils import *
from torch.nn.utils.rnn import pack_padded_sequence
from sklearn.metrics import f1_score

In [0]:
task = 'ner'        # tagging task, choose between [ner, pos]
train_file = '/gdrive/My Drive/data/CoNLL-2003/eng.train'
val_file   = '/gdrive/My Drive/data/CoNLL-2003/eng.testa'
test_file  = '/gdrive/My Drive/data/CoNLL-2003/eng.testb'
emb_file   = '/gdrive/My Drive/data/embeddings/glove.6B.100d.txt'
min_word_freq = 5 # threshold for word frequency to be recognized not as xxunk
min_char_freq = 1 # same thing for char frequency
caseless   = True # lowercase everything?
expand_vocab = True # expand model's input vocabulary to the pre-trained embedding vocabulary?

# Model parameters
char_emb_dim = 30 # character embedding size
with open(emb_file, 'r') as f:
    word_emb_dim = len(f.readline().split(' ')) - 1  # word embdding size, "-1" is because in the txt file the first place is the word itself, followed by the actual embeddings
word_rnn_dim = 300  # word BLSTM hidden size
char_rnn_dim = 300  # character RNN size
char_rnn_layers = 1 # number of layers in character RNN
word_rnn_layers = 1 # number of layers in word BLSTM
highway_layers  = 1 # number of layers in highway network
dropout = 0.5       # universal dropout rate
fine_tune_word_embeddings = False

# Training parameters
start_epoch = 0   # start at this epoch
batch_size  = 10  # batch size
lr = 0.015  
lr_decay = 0.05
momentum = 0.9
workers  = 4
epochs   = 200    # number of epochs without triggering early stoping
grad_clip = 5.
print_freq = 100  # print every ___ batches
best_f1  = 0.
checkpoint = None # Model checkpoint to load. None if training from scratch

tag_ind = 1 if task == 'pos' else 3 # choose column in CoNLL 2003 dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
global best_f1, epochs_since_improvement, checkpoint, start_epoch, word_map, char_map, tag_map

# Read training and validation data
train_words, train_tags = read_words_tags(train_file, tag_ind, caseless)
val_words, val_tags = read_words_tags(val_file, tag_ind, caseless)

if checkpoint is not None:
    checkpoint = torch.load(checkpoint)
    model = checkpoint['model']
    optimizer = checkpoint['optimizer']
    word_map  = checkpoint['word_map']
    lm_vocab_size = checkpoint['lm_vocab_size']
    tag_map   = checkpoint['tag_map']
    char_map  = checkpoint['char_map']
    start_epoch = checkpoint['epoch'] +1
    best_f1   = checkpoint['f1']
else:
    # create word, char, tag maps
    # maps are essentially dictionaries that map a token to an integer
    word_map, char_map, tag_map = create_maps(train_words+val_words,train_tags+val_tags, min_word_freq, min_char_freq)

    # load pre-trained embeddings, if expand_vocab==True, word_map expand to embedding_word_map
    # lm_vocab_size is the word_map size before expand to "out-of-corpus vocab"
    embeddings, word_map, lm_vocab_size = load_embeddings(emb_file, word_map, expand_vocab)

    model = LM_LSTM_CRF(tagset_size=len(tag_map),
                        charset_size=len(char_map),
                        char_emb_dim=char_emb_dim,
                        char_rnn_dim=char_rnn_dim,
                        char_rnn_layers=char_rnn_layers,
                        vocab_size=len(word_map),       # This is the length after expand
                        lm_vocab_size=lm_vocab_size,    # len(word_map) before expand, not influenced by the embedding vocab
                        word_emb_dim=word_emb_dim,
                        word_rnn_dim=word_rnn_dim,
                        word_rnn_layers=word_rnn_layers,
                        dropout=dropout,
                        highway_layers=highway_layers).to(device)
    model.init_word_embedding(embeddings.to(device)) # initializa embedding layers with pre-trained embeddings.(Essentially we just make it nn.Parameter)
    model.fine_tune_word_embeddings(fine_tune_word_embeddings)    # decide if these nn.Parameters has requires_grad = True (trainable)
    optimizer = optim.SGD(params=filter(lambda p: p.requires_grad, model.parameters()), lr=lr, momentum=momentum)

Embedding length is 100.
You have elected to include embeddings that are out-of-corpus.

Loading embeddings...
'word_map' is being updated accordingly.

Done.
 Embedding vocabulary: 400054
 Language Model vocabulary: 4671.



  "num_layers={}".format(dropout, num_layers))


In [24]:
embeddings.shape

torch.Size([400054, 100])

In [0]:
# Loss funcitons
lm_criterion  = nn.CrossEntropyLoss().to(device)
crf_criterion = ViterbiLoss(tag_map).to(device)

# Since the language model's vocab is restricted on in-corpus indices, encode training/val with only these!
# word_map might have been expanded, and in-corpus words eliminated due to low frequency might still be added because
# they exist in the pre-trained embeddings
temp_word_map = {k: v for k, v in word_map.items() if v <= word_map['<unk>']}

# train_input = (padded_wmaps, padded_cmaps_f, padded_cmaps_b, padded_cmarkers_f, padded_cmarkers_b, 
                #padded_tmaps, wmap_lengths, cmap_lengths)
train_inputs = create_input_tensors(train_words, train_tags, temp_word_map, char_map,
                                        tag_map)
val_inputs = create_input_tensors(val_words, val_tags, temp_word_map, char_map, tag_map)

# DataLoaders
train_loader = torch.utils.data.DataLoader(WCDataset(*train_inputs), batch_size=batch_size, shuffle=True,
                                            num_workers=workers, pin_memory=False)
val_loader = torch.utils.data.DataLoader(WCDataset(*val_inputs), batch_size=batch_size, shuffle=True,
                                             num_workers=workers, pin_memory=False)

In [14]:
len(train_words),len(val_words)

(14041, 3250)

In [0]:
wmaps = list(map(lambda s: list(map(lambda w: temp_word_map.get(w, temp_word_map['<unk>']), s)) + [temp_word_map['<end>']], train_words))

In [0]:
max_num = []
for w_list in wmaps:
    max_num.append(len(w_list))
max(max_num)

114

In [0]:
len(wmaps)

14041

In [0]:
padded_wmaps = train_inputs[0]
padded_wmaps.size()

torch.Size([14041, 114])

In [0]:
len(train_inputs)

8

In [0]:
len(temp_word_map)

4671

In [0]:
len(word_map)

400054

In [0]:
embeddings.shape

torch.Size([400054, 100])

In [0]:
len(train_words), len(train_tags)

(14041, 14041)

In [0]:
train_words[10][0:3], train_tags[10][0:3]

(['spanish', 'farm', 'minister'], ['I-MISC', 'O', 'O'])

In [0]:
train_words[0][0:10], train_tags[0][0:10]

(['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'],
 ['NNP', 'VBZ', 'JJ', 'NN', 'TO', 'VB', 'JJ', 'NN', '.'])

In [0]:
len(tag_map)

11

In [0]:
word_map['<end>']

4669

In [0]:
word_map['eu']

1

In [0]:
word_map['<unk>']

4670

In [0]:
!cd My\ Drive

In [0]:
!ls

'My Drive'
