In [1]:
import time
import torch
from torch import nn
#from torch.optim import Adam
from torch.optim import SGD
from torchtext.data import Field, BucketIterator


In [2]:
from torchtext.datasets import SequenceTaggingDataset

In [98]:
import csv

In [99]:
import pandas as pd

import numpy as np 

# keep_default_na=False: a token that happens to spell one of pandas' default
# NA markers (e.g. "NA", "null", "nan") must stay a string instead of being
# parsed as a missing value (NaN).
data_file = pd.read_csv('train', quoting=csv.QUOTE_NONE, sep=" ",
                        names=['index', 'word', 'tag'], keep_default_na=False)

In [100]:
data_file

Unnamed: 0,index,word,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
...,...,...,...
204562,1,Swansea,B-ORG
204563,2,1,O
204564,3,Lincoln,B-ORG
204565,4,2,O


In [90]:
# Inspection helper: rows whose token is at least 20 characters long.
test_q = data_file.loc[data_file['word'].map(lambda token: len(str(token)) >= 20)]

In [92]:
#test_q

In [93]:
# Convert the DataFrame to a NumPy array so that its rows can be iterated
# as (index, word, tag) triples.
data_array = data_file.to_numpy()

In [94]:
data_array

array([[1, 'EU', 'B-ORG'],
       [2, 'rejects', 'O'],
       [3, 'German', 'B-MISC'],
       ...,
       [3, 'Lincoln', 'B-ORG'],
       [4, '2', 'O'],
       [1, '-DOCSTART-', 'O']], dtype=object)

In [95]:
### store the lists of sentences into a big list 
 
def data_to_stream(data):
    """Group a flat sequence of token rows into a list of sentences.

    Args:
        data: iterable of ``(index, word, tag)`` rows. A row whose ``index``
            equals 1 marks the first token of a new sentence.

    Returns:
        A list of sentences; each sentence is a list of ``[index, word, tag]``
        lists in their original order. Empty input yields an empty list.
    """
    data_stream = []
    for index, word, tag in data:
        # Open a new sentence at every index of 1. Also open one when nothing
        # has been opened yet, so input that does not start at index 1 is
        # still collected instead of failing on an unbound variable.
        if index == 1 or not data_stream:
            data_stream.append([])
        data_stream[-1].append([index, word, tag])
    return data_stream

In [96]:
# Group the training rows into sentences (a list of [index, word, tag] lists per sentence).
data_stream = data_to_stream(data_array)

In [98]:
def write_tsv(file_name, data):
    """Write sentences to ``file_name`` as tab-separated ``word<TAB>tag`` lines.

    Args:
        file_name: path of the file to create or overwrite.
        data: list of sentences, each a sequence of ``(index, word, tag)``
            entries. The index is not written.

    Each token goes on its own line and consecutive sentences are separated
    by one blank line.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (torchtext's SequenceTaggingDataset decodes as UTF-8 by default).
    with open(file_name, 'w', encoding='utf-8') as f:
        for sentence_no, sentence in enumerate(data):
            if sentence_no != 0:
                f.write('\n')
            for _index, word, tag in sentence:
                f.write(str(word) + '\t' + str(tag) + '\n')

In [100]:
# keep_default_na=False: a token that happens to spell one of pandas' default
# NA markers (e.g. "NA", "null", "nan") must stay a string instead of being
# parsed as a missing value (NaN).
dev_file = pd.read_csv('dev', quoting=csv.QUOTE_NONE, sep=" ",
                       names=['index', 'word', 'tag'], keep_default_na=False)

In [101]:
# Process the dev split like the training split:
# DataFrame -> NumPy array -> list of sentences.
dev_array = dev_file.to_numpy()
dev_stream = data_to_stream(dev_array)


In [102]:
# Write both splits as word<TAB>tag files, one token per line with a blank line between sentences.
write_tsv('train_out.tsv',data_stream)
write_tsv('dev_out.tsv',dev_stream)


In [20]:
# Unique words / tags in order of first appearance. dict.fromkeys (instead of
# set) keeps that order, so the indices assigned below are the same on every
# kernel restart; the iteration order of a set of strings is not.
words = list(dict.fromkeys(data_file["word"].values))
tags = list(dict.fromkeys(data_file["tag"].values))

# Lookup tables from word / tag to its integer index
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}


In [4]:
class Corpus(object):
    """Loads the tagged train/dev TSV files and prepares vocabularies and batch iterators.

    Attributes set by the constructor: ``word_field``, ``tag_field``,
    ``train_dataset``, ``val_dataset``, ``train_iter``, ``val_iter``,
    ``word_pad_idx`` and ``tag_pad_idx``.

    Args:
        input_folder: directory containing ``train_out.tsv`` and ``dev_out.tsv``.
        min_word_freq: passed as ``min_freq`` when building the word vocabulary.
        batch_size: batch size used for both iterators.
    """

    def __init__(self, input_folder, min_word_freq, batch_size):
        # Words are lower-cased; the tag field is created without an unknown token.
        self.word_field = Field(lower=True)
        self.tag_field = Field(unk_token=None)

        # Column layout of the TSV files: first column word, second column tag.
        columns = (("word", self.word_field), ("tag", self.tag_field))
        self.train_dataset, self.val_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train_out.tsv",
            validation="dev_out.tsv",
            fields=columns,
        )

        # Both vocabularies are built from the training split only.
        self.word_field.build_vocab(self.train_dataset, min_freq=min_word_freq)
        self.tag_field.build_vocab(self.train_dataset)

        # Batch iterators over the two splits.
        split_datasets = (self.train_dataset, self.val_dataset)
        self.train_iter, self.val_iter = BucketIterator.splits(
            datasets=split_datasets,
            batch_size=batch_size,
        )

        # Vocabulary indices of the padding tokens, needed to ignore padding
        # during model training/evaluation.
        word_vocab, tag_vocab = self.word_field.vocab, self.tag_field.vocab
        self.word_pad_idx = word_vocab.stoi[self.word_field.pad_token]
        self.tag_pad_idx = tag_vocab.stoi[self.tag_field.pad_token]

In [5]:
corpus = Corpus(
    input_folder=".",
    min_word_freq=2,  # forwarded as min_freq: words seen fewer than 2 times are left out of the vocab
    batch_size=32
)

In [6]:
# Number of sentences in each split.
print(f"Train set: {len(corpus.train_dataset)} sentences")
print(f"dev set: {len(corpus.val_dataset)} sentences")

Train set: 14987 sentences
dev set: 3466 sentences


In [83]:
print(vars(corpus.train_dataset[0]))

{'word': ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'], 'tag': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']}


In [89]:
class BiLSTMPOSTagger(nn.Module):
    """Sequence tagger: embedding -> (Bi)LSTM -> ELU -> dropout -> linear layer.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: size of the final linear layer's output.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability (embeddings, LSTM outputs and, when
            ``n_layers > 1``, between LSTM layers).
        pad_idx: embedding index used as ``padding_idx``.
    """

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout,
                 pad_idx):
        super().__init__()

        # Inter-layer LSTM dropout is only requested when layers are stacked.
        lstm_dropout = dropout if n_layers > 1 else 0
        # A bidirectional LSTM concatenates both directions in its output.
        n_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_dim * n_directions, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.elu = nn.ELU()

    def forward(self, text):
        """Map token indices ``[sent len, batch size]`` to scores ``[sent len, batch size, output dim]``."""
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # Only the per-time-step outputs are used; the final hidden/cell
        # states returned by the LSTM are discarded.
        lstm_out, _ = self.lstm(embedded)

        # Non-linearity and dropout before the final projection.
        activated = self.dropout(self.elu(lstm_out))
        return self.fc(activated)

In [90]:
# Model hyper-parameters.
INPUT_DIM = len(corpus.word_field.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# NOTE(review): OUTPUT_DIM is the width of the model's final linear layer, and
# those outputs are passed to the loss as per-class scores. 128 looks larger
# than the tag vocabulary -- consider len(corpus.tag_field.vocab). TODO confirm.
OUTPUT_DIM = 128
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.33
PAD_IDX = corpus.word_pad_idx

model = BiLSTMPOSTagger(INPUT_DIM, 
                        EMBEDDING_DIM, 
                        HIDDEN_DIM, 
                        OUTPUT_DIM, 
                        N_LAYERS, 
                        BIDIRECTIONAL, 
                        DROPOUT, 
                        PAD_IDX)

In [91]:
def init_weights(m):
    """Fill every parameter of module ``m`` in place with samples from N(0, 0.1**2)."""
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)

In [92]:
# Re-initialise the model's parameters with init_weights (applied to the model and each sub-module).
model.apply(init_weights)

BiLSTMPOSTagger(
  (embedding): Embedding(10952, 100, padding_idx=1)
  (lstm): LSTM(100, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=1.0)
)

In [93]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad == False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,894,048 trainable parameters


In [94]:
# init_weights overwrote every parameter, including the embedding row of the
# padding token; reset that row to zeros.
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [95]:
#print(model.embedding.weight.data)

In [96]:
### define optimizer function
optimizer = SGD(model.parameters(), lr=0.05)
# Alternative kept for reference: Adam with its default settings.
#from torch.optim import Adam
#optimizer = Adam(model.parameters())

In [48]:
### define loss function
# Index of the tag padding token (same lookup as corpus.tag_pad_idx);
# targets equal to it are excluded from the loss through ignore_index.
TAG_PAD_IDX = corpus.tag_field.vocab.stoi[corpus.tag_field.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [49]:
### train the model 
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Args:
        model: module called on ``batch.word``.
        iterator: sized iterable of batches exposing ``word`` and ``tag``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called with flattened scores and flattened tags.
        tag_pad_idx: tag index ignored when computing accuracy.

    Returns:
        ``(loss, accuracy)``, each the sum of the per-batch values divided by
        the number of batches.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # scores = [sent len, batch size, output dim]
        scores = model(batch.word)

        # Flatten so every token position becomes one row:
        # flat_scores = [sent len * batch size, output dim]
        # flat_tags   = [sent len * batch size]
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_tags = batch.tag.view(-1)

        loss = criterion(flat_scores, flat_tags)
        acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [50]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Compute loss and accuracy of ``model`` over ``iterator`` without updating it.

    The model is switched to eval mode and gradients are disabled.

    Returns:
        ``(loss, accuracy)``, each the sum of the per-batch values divided by
        the number of batches.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.word)

            # One row per token position, as expected by the loss.
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch.tag.view(-1)

            batch_loss = criterion(flat_scores, flat_tags)
            batch_acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [51]:
def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps (in seconds) into whole minutes and seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    # Seconds left after removing the whole minutes, truncated to an int.
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: 2-D tensor of scores, one row per position and one column per class.
        y: 1-D tensor of gold class indices, one per row of ``preds``.
        tag_pad_idx: positions where ``y`` equals this index are ignored.

    Returns:
        0-dim floating-point tensor: fraction of non-padding positions whose
        highest-scoring class equals the gold class.
    """
    non_pad = y != tag_pad_idx            # boolean mask of positions that count
    predicted = preds.argmax(dim=1)       # index of the highest score per row
    correct = predicted[non_pad].eq(y[non_pad])
    # Divide in floating point explicitly: `/` on integer tensors behaves
    # differently across PyTorch versions (floor division or an error in
    # older releases).
    return correct.sum().float() / non_pad.sum().float()

In [94]:
# Number of full passes over the training set.
N_EPOCHS = 3

# Lowest validation loss seen so far.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One training pass followed by one evaluation pass on the dev set.
    train_loss, train_acc = train(model, corpus.train_iter, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, corpus.val_iter, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # NOTE: checkpointing is disabled, so the best loss is tracked but the
        # corresponding weights are not saved.
        # torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 49s
	Train Loss: 0.673 | Train Acc: 83.41%
	 Val. Loss: 1.007 |  Val. Acc: 78.61%
Epoch: 02 | Epoch Time: 1m 37s
	Train Loss: 0.650 | Train Acc: 83.69%
	 Val. Loss: 0.962 |  Val. Acc: 78.75%
Epoch: 03 | Epoch Time: 1m 36s
	Train Loss: 0.628 | Train Acc: 84.07%
	 Val. Loss: 0.922 |  Val. Acc: 78.83%


In [95]:
# Collect the predicted and gold tag indices of every non-padding dev token.
global_predictions = []
global_tags = []

model.eval()  # make sure dropout is disabled while predicting
with torch.no_grad():  # inference only, no gradients needed
    for batch in corpus.val_iter:
        text = batch.word
        tags = batch.tag
        predictions = model(text)
        # Flatten to one row per token position.
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        max_preds = predictions.argmax(dim=1)
        # Use the module-level TAG_PAD_IDX: the lower-case `tag_pad_idx` only
        # exists as a function parameter and is undefined in this scope.
        non_pad_elements = tags != TAG_PAD_IDX
        pred_y = max_preds[non_pad_elements]
        real_y = tags[non_pad_elements]
        # Store plain Python ints rather than one tensor per token.
        global_predictions += pred_y.tolist()
        global_tags += real_y.tolist()



In [96]:
# Flatten the collected predictions into a 1-D NumPy array.
global_predictions = np.array(global_predictions).flatten()

In [97]:
# Flatten the collected gold tags into a 1-D NumPy array.
global_tags = np.array(global_tags).flatten()

In [98]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 on the dev set. The class labels in the report
# are the integer ids stored in global_tags, not tag names.
print(classification_report(global_tags, global_predictions))

              precision    recall  f1-score   support

           1       0.84      1.00      0.91     42975
           2       0.92      0.02      0.04      1837
           3       0.76      0.03      0.06      1842
           4       0.33      0.00      0.00      1341
           5       0.68      0.10      0.17      1307
           6       0.00      0.00      0.00       751
           7       0.00      0.00      0.00       922
           8       0.00      0.00      0.00       257
           9       0.00      0.00      0.00       346

    accuracy                           0.84     51578
   macro avg       0.39      0.13      0.13     51578
weighted avg       0.78      0.84      0.77     51578



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
