In [1]:
import time
import torch
from torch import nn
#from torch.optim import Adam
from torch.optim import SGD
from torchtext.data import Field, BucketIterator
from torchtext.datasets import SequenceTaggingDataset

In [1]:
! pip install ray 

Collecting ray
  Downloading ray-1.8.0-cp38-cp38-win_amd64.whl (18.3 MB)
Collecting grpcio>=1.28.1
  Downloading grpcio-1.41.1-cp38-cp38-win_amd64.whl (3.2 MB)
Collecting redis>=3.5.0
  Downloading redis-3.5.3-py2.py3-none-any.whl (72 kB)
Collecting protobuf>=3.15.3
  Downloading protobuf-3.19.1-cp38-cp38-win_amd64.whl (895 kB)
Installing collected packages: redis, protobuf, grpcio, ray
Successfully installed grpcio-1.41.1 protobuf-3.19.1 ray-1.8.0 redis-3.5.3


In [3]:
#pip install torchtext==0.6.0

In [4]:
import csv

In [5]:
import pandas as pd

import numpy as np 

data_file = pd.read_csv('./data/train', quoting=csv.QUOTE_NONE,sep=" ", names= ['index', 'word', 'tag'])

In [6]:
data_file

Unnamed: 0,index,word,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
...,...,...,...
204562,1,Swansea,B-ORG
204563,2,1,O
204564,3,Lincoln,B-ORG
204565,4,2,O


In [8]:
# convert 
data_array = data_file.to_numpy()

In [9]:
data_array

array([[1, 'EU', 'B-ORG'],
       [2, 'rejects', 'O'],
       [3, 'German', 'B-MISC'],
       ...,
       [3, 'Lincoln', 'B-ORG'],
       [4, '2', 'O'],
       [1, '-DOCSTART-', 'O']], dtype=object)

In [10]:
### store the lists of sentences into a big list 
 
def data_to_stream(data):
    data_stream = []
    i = 0 
    for index, word, tag in data:
        if index == 1:
            temp = []
            temp.append([index,word, tag])
            
        else:
            temp.append([index ,word,tag ])
            
        if ( (i+1 < len(data)) and data[i+1][0] == 1 ) or (i == len(data)-1):
            data_stream.append(temp)
        
        i += 1
            
    return data_stream 

In [11]:
data_stream = data_to_stream(data_array)

In [12]:
def write_tsv(file_name, data):
    
    with open(file_name,'w') as f:
        for i in range(len(data)):
            if i != 0:
                f.write('\n')
            for (index, w, t) in data[i]:
                #f.write(str(i))
                #f.write('\t')
                f.write(str(w))
                f.write('\t')
                f.write(str(t))
                f.write('\n')

In [13]:
dev_file = pd.read_csv('./data/dev', quoting=csv.QUOTE_NONE, sep=" ", names= ['index', 'word', 'tag'])

In [14]:
#dev_file
# convert 
dev_array = dev_file.to_numpy()
dev_stream = data_to_stream(dev_array)


In [15]:
write_tsv('train_out.tsv',data_stream)
write_tsv('dev_out.tsv',dev_stream)


In [16]:
words = list(set(data_file["word"].values))
tags = list(set(data_file["tag"].values))

# Converting words to numbers and numbers to word
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}


In [76]:
class Corpus(object):
    def __init__(self, input_folder, min_word_freq, batch_size):
        # list all the fields
        self.word_field = Field()
        self.tag_field = Field(unk_token=None)
        
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.val_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train_out.tsv",
            validation="dev_out.tsv",
            fields=(("word", self.word_field), ("tag", self.tag_field))
        )
        # convert fields to vocabulary list
        self.word_field.build_vocab(self.train_dataset, min_freq=min_word_freq)
        self.tag_field.build_vocab(self.train_dataset)
        # create iterator for batch input
        self.train_iter, self.val_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset),
            batch_size=batch_size
        )
        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]

In [77]:
corpus = Corpus(
    input_folder=".",
    min_word_freq=3,  # any words occurring less than 3 times will be ignored from vocab
    batch_size=32
)

In [78]:
print(f"Train set: {len(corpus.train_dataset)} sentences")
print(f"dev set: {len(corpus.val_dataset)} sentences")
#print(vars(pos_dataset.examples[0]))

Train set: 14987 sentences
dev set: 3466 sentences


In [79]:
class BiLSTM(nn.Module):
    def __init__(self, input_dim,embedding_dim, hidden_dim, output_dim,n_layers, bidirectional,dropout, pad_idx):
        
        super().__init__()   
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers = n_layers, 
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.elu = nn.ELU()
        
    def forward(self, text):
        #pass text through embedding layer
        embedded = self.dropout(self.embedding(text))
        
        #pass embeddings into LSTM
        outputs, (hidden, cell) = self.lstm(embedded)
        
        #we use our outputs to make a prediction of what the tag should be
        outputs = self.elu(outputs)
        outputs = self.fc(self.dropout(outputs))
        return outputs

In [80]:
INPUT_DIM = len(corpus.word_field.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.33
PAD_IDX = corpus.word_pad_idx

model = BiLSTM(INPUT_DIM, 
               EMBEDDING_DIM, 
               HIDDEN_DIM, 
               OUTPUT_DIM, 
               N_LAYERS, 
               BIDIRECTIONAL, 
               DROPOUT, 
               PAD_IDX)

In [81]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)

In [82]:
model.apply(init_weights)

BiLSTM(
  (embedding): Embedding(8129, 100, padding_idx=1)
  (lstm): LSTM(100, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=1.0)
)

In [83]:
device = torch.device('cuda:0')
model.to(device)

BiLSTM(
  (embedding): Embedding(8129, 100, padding_idx=1)
  (lstm): LSTM(100, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=1.0)
)

In [84]:
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [85]:
### define optimizer fuction 
optimizer = SGD(model.parameters(), lr=0.05)

In [86]:
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.001, max_lr=0.1,step_size_up=5,mode="exp_range",gamma=0.85)
#lrs = []

In [87]:
### define loss function 
TAG_PAD_IDX = corpus.tag_field.vocab.stoi[corpus.tag_field.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

criterion = criterion

In [88]:
### train the model 
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        text = batch.word.cuda()
        tags = batch.tag.cuda()
        
        optimizer.zero_grad()
        
        predictions = model(text)
        
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)

        loss = criterion(predictions, tags)
                
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [89]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch.word
            text= text.cuda()
            tags = batch.tag
            tags = tags.cuda()
            predictions = model(text)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [90]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
def categorical_accuracy(preds, y, tag_pad_idx):
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / y[non_pad_elements].shape[0]

In [None]:
config = {
    "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
    "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16])
}

In [91]:
N_EPOCHS = 50

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, corpus.train_iter, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, corpus.val_iter, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    ### save the model 
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'blstm1.pt.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 1.066 | Train Acc: 82.70%
	 Val. Loss: 1.479 |  Val. Acc: 78.46%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 0.850 | Train Acc: 83.21%
	 Val. Loss: 1.378 |  Val. Acc: 78.46%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 0.810 | Train Acc: 83.21%
	 Val. Loss: 1.337 |  Val. Acc: 78.46%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 0.794 | Train Acc: 83.21%
	 Val. Loss: 1.312 |  Val. Acc: 78.46%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 0.783 | Train Acc: 83.21%
	 Val. Loss: 1.295 |  Val. Acc: 78.46%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 0.776 | Train Acc: 83.20%
	 Val. Loss: 1.279 |  Val. Acc: 78.46%
Epoch: 07 | Epoch Time: 0m 7s
	Train Loss: 0.770 | Train Acc: 83.22%
	 Val. Loss: 1.268 |  Val. Acc: 78.46%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 0.766 | Train Acc: 83.19%
	 Val. Loss: 1.255 |  Val. Acc: 78.46%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 0.761 | Train Acc: 83.23%
	 Val. Loss: 1.246 |  Val. Acc: 78.46%
Epoch: 10 | Epoch Time: 0m 7

In [92]:
#model.load_state_dict(torch.load('blstm1.pt'))
# create the dev file sentences and actual tags list 
dev_sentences = []
dev_actual_tags = []
for i in range(len(corpus.val_dataset)):
    
    sentence_words = vars(corpus.val_dataset[i])['word']
    dev_sentences.append(sentence_words)
    actual_tag = vars(corpus.val_dataset[i])['tag']
    dev_actual_tags.append(actual_tag)


In [43]:
def tag_sentence(model, sentence, text_field, tag_field):
    
    model.eval()
  
    tokens = sentence
    # convert the word into index 
    numericalized_tokens = [text_field.vocab.stoi[t] for t in tokens]
    # get the index of unkonwn word if have 
    unk_idx = text_field.vocab.stoi[text_field.unk_token]
    # get the unknown word  
    unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
    ## convert it into tensor 
    token_tensor = torch.LongTensor(numericalized_tokens)
    
    token_tensor = token_tensor.unsqueeze(-1)
     # use the model to predict     
    predictions = model(token_tensor)
    
    top_predictions = predictions.argmax(-1)
    
    predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]
    
    return predicted_tags

In [45]:
## test on CPU 
device = torch.device("cpu")
model.to(device)

BiLSTM(
  (embedding): Embedding(7519, 100, padding_idx=1)
  (lstm): LSTM(100, 256, bidirectional=True)
  (fc): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
  (elu): ELU(alpha=1.0)
)

In [46]:
##create the nested list which contain the predicted tagas using our model 
whole_pred_tags = []

for s in dev_sentences:
    
    pred_tags = tag_sentence(model, s, corpus.word_field, corpus.tag_field)
    
    whole_pred_tags.append(pred_tags)
     

In [47]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [48]:
# flattenn the nested list 
whole_pred_tags_f = flatten(whole_pred_tags)
dev_actual_tags_f = flatten(dev_actual_tags)

In [49]:
from sklearn.metrics import classification_report
print(classification_report(dev_actual_tags_f, whole_pred_tags_f))

              precision    recall  f1-score   support

       B-LOC       0.79      0.75      0.77      1837
      B-MISC       0.74      0.58      0.65       922
       B-ORG       0.77      0.39      0.52      1341
       B-PER       0.87      0.63      0.73      1842
       I-LOC       0.82      0.52      0.64       257
      I-MISC       0.78      0.27      0.40       346
       I-ORG       0.58      0.44      0.50       751
       I-PER       0.71      0.75      0.73      1307
           O       0.95      0.99      0.97     42975

    accuracy                           0.92     51578
   macro avg       0.78      0.59      0.66     51578
weighted avg       0.92      0.92      0.92     51578



In [55]:
### to write the output file 
import pandas as pd

import numpy as np 
import csv
dev_file = pd.read_csv('./data/dev', quoting=csv.QUOTE_NONE, sep=" ", names= ['index', 'word', 'tag'])

# convert 
dev_array = dev_file.to_numpy()
dev_stream = data_to_stream(dev_array)

#### write the output file with predited tags 
import copy
dev_data_list = copy.deepcopy(dev_stream)
for i in range(len(whole_pred_tags)):
    for i1 in range(len(whole_pred_tags[i])):
        #dev_data_list[i][i1].pop() # pop the actual tag 
        dev_data_list[i][i1].append(whole_pred_tags[i][i1])

In [59]:
def write_out_evl(file_name, data):
    
    with open(file_name,'w') as f:
        for i in range(len(data)):
            for (index, w, t1,t2) in data[i]:
                f.write(str(index))
                f.write(' ')
                f.write(str(w))
                f.write(' ')
                f.write(str(t1))
                f.write(' ')
                f.write(str(t2))
                f.write("\n")
                

In [60]:
write_out_evl('dev_output_test',dev_data_list)

In [62]:
#dev_data_list