In [1]:
import time
import torch
from torch import nn
#from torch.optim import Adam
from torch.optim import SGD
from torchtext.data import Field, BucketIterator


In [2]:
from torchtext.datasets import SequenceTaggingDataset

In [3]:
import csv

In [4]:
import pandas as pd

import numpy as np 

data_file = pd.read_csv('train', quoting=csv.QUOTE_NONE,sep=" ", names= ['index', 'word', 'tag'])

In [5]:
data_file

Unnamed: 0,index,word,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
...,...,...,...
204562,1,Swansea,B-ORG
204563,2,1,O
204564,3,Lincoln,B-ORG
204565,4,2,O


In [6]:
# convert 
data_array = data_file.to_numpy()

In [7]:
def sentence_listing(new_list):
    # convert the list into sentence
    sentence_list = []

    for i in range(len(new_list)):
        if new_list[i][0] == 1:
            temp = []
            temp.append(new_list[i])
        else:
            temp.append(new_list[i])
        if ((i+1) < len(new_list)) and new_list[i+1][0] == 1:
            sentence_list.append(temp)
    return sentence_list

In [8]:
sentence_list = sentence_listing(data_array)

In [9]:
def tsv(file_name, data):
    
    with open(file_name,'w') as f:
        for i in range(len(data)):
            if i != 0:
                f.write('\n')
            for (index, w, t) in data[i]:
                #f.write(str(i))
                #f.write('\t')
                f.write(str(w))
                f.write('\t')
                f.write(str(t))
                f.write('\n')

In [10]:
dev_file = pd.read_csv('dev', quoting=csv.QUOTE_NONE, sep=" ", names= ['index', 'word', 'tag'])

In [12]:
#dev_file
# convert 
dev_array = dev_file.to_numpy()
dev_sentence_list = sentence_listing(dev_array)


In [13]:
tsv('train_out.tsv',sentence_list)
tsv('dev_out.tsv',dev_sentence_list)


In [14]:
class BiLSTMPOSTagger(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers = 1, 
                            bidirectional = True,
                            dropout = dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.elu = nn.ELU()
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        outputs, (hidden, cell) = self.lstm(embedded)
        outputs = self.elu(outputs)
        outputs = self.fc(self.dropout(outputs))
        return outputs

In [15]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [17]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        text = batch.word
        tags = batch.tag
        
        optimizer.zero_grad()        
        predictions = model(text)
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        loss = criterion(predictions, tags)
                
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [18]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch.word
            tags = batch.tag
            
            predictions = model(text)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [19]:
import torchtext.vocab as vocab
custom_embeddings = vocab.Vectors(name = 'glove.6B.100d')

In [20]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / y[non_pad_elements].shape[0]

In [21]:
#create field
word_field = Field()
tag_field = Field(unk_token=None)
train_dataset,val_dataset = SequenceTaggingDataset.splits(
path='.',
train="train_out.tsv",
validation="dev_out.tsv",
fields=(("word",word_field), ("tag",tag_field)))
word_field.build_vocab(train_dataset, min_freq=1)
tag_field.build_vocab(train_dataset)
# create iterator for batch input
train_iter, val_iter = BucketIterator.splits(
    datasets=(train_dataset, val_dataset),
    batch_size=16
)

In [22]:
word_field.build_vocab(train_dataset, vectors = custom_embeddings)
word_pad_idx = word_field.vocab.stoi[word_field.pad_token]
tag_pad_idx = tag_field.vocab.stoi[tag_field.pad_token]


In [23]:
INPUT_DIM = len(word_field.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
DROPOUT = 0.33
PAD_IDX = word_pad_idx
model = BiLSTMPOSTagger(INPUT_DIM, 
                        EMBEDDING_DIM, 
                        HIDDEN_DIM, 
                        OUTPUT_DIM, 
                        DROPOUT, 
                        PAD_IDX)

  "num_layers={}".format(dropout, num_layers))


In [24]:
model.apply(init_weights)

BiLSTMPOSTagger(
  (embedding): Embedding(23626, 100, padding_idx=1)
  (lstm): LSTM(100, 256, dropout=0.33, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=1.0)
)

In [25]:
model.embedding.weight.data.copy_(word_field.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.0823,  0.3968,  0.6859,  ...,  0.0866, -0.0148, -0.1637],
        [-0.3110, -0.3398,  1.0308,  ...,  0.5317,  0.2836, -0.0640],
        [-0.5832, -0.5807,  0.3504,  ...,  0.4043, -0.0192,  0.0945]])

In [26]:
optimizer = SGD(model.parameters(), lr=0.05)

In [27]:
TAG_PAD_IDX = tag_field.vocab.stoi[tag_field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [None]:
N_EPOCHS = 8

best_train_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion, TAG_PAD_IDX)
    
    end_time = time.time()
    
    if train_loss < best_valid_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    print(end_time-start_time,train_loss)

111.08664679527283 0.4995065869936439
113.46793603897095 0.43713759227776094
117.31758618354797 0.393848606479715


In [None]:
dev_sentences = []

dev_actual_tags = []

for i in range(len(val_dataset)):
    sentence_words = vars(val_dataset[i])['word']
    dev_sentences.append(sentence_words)
    actual_tag = vars(val_dataset[i])['tag']
    dev_actual_tags.append(actual_tag)

In [None]:
dev_pred_tags = []
for sentence in dev_sentences:
    tokens = sentence
    list_word_index = [word_field.vocab.stoi[t] for t in tokens]
    unk_index = word_field.vocab.stoi[word_field.unk_token]
    unks = [t for t, n in zip(tokens, list_word_index) if n == unk_index]
    list_word_index_tensor = torch.LongTensor(list_word_index)
    list_word_index_tensor = list_word_index_tensor.unsqueeze(-1)
    predictions = model(list_word_index_tensor)
    top_predictions = predictions.argmax(-1)
    predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]
    dev_pred_tags.append(predicted_tags)

In [None]:
model

In [None]:
dev_sentences
dev_actual_tags
dev_pred_tags

In [None]:
#### write the output file with predited tags 

import copy

dev_data_list = copy.deepcopy(dev_stream)

for i in range(len(dev_pred_tags)):

    for i1 in range(len(dev_pred_tags[i])):

        #dev_data_list[i][i1].pop() # pop the actual tag 

        dev_data_list[i][i1].append(dev_pred_tags[i][i1])



def write_out_eva(file_name, data):
    
    with open(file_name,'w') as f:
        for i in range(len(data)):
            if i != 0:
                f.write('\n')
            for (index, w, t,p) in data[i]:
                f.write(str(index))
                f.write(' ')
                f.write(str(w))
                f.write(' ')
                f.write(str(t))
                f.write(' ')
                f.write(str(p))
                f.write('\n')

In [None]:
write_out_eva('eva',dev_data_list)

In [None]:
dev_data_list