# In [2]:

import copy
import csv
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim import SGD

# Training split: space-separated rows of (index, word, tag).  QUOTE_NONE
# stops quote characters inside the text from being treated as CSV quoting.
# NOTE(review): with pandas' default NA handling, tokens such as "NA" or
# "null" are presumably read as NaN and later stringify to "nan" - confirm
# whether that is intended before changing it (the vocabulary depends on it).
data_file = pd.read_csv(
    'train',
    sep=" ",
    names=['index', 'word', 'tag'],
    quoting=csv.QUOTE_NONE,
)
data_array = data_file.to_numpy()

# Development split, read with the same layout.
dev_file = pd.read_csv(
    'dev',
    sep=" ",
    names=['index', 'word', 'tag'],
    quoting=csv.QUOTE_NONE,
)
dev_array = dev_file.to_numpy()


### convert each word to lower case and build the per-sentence word and tag lists
def converter(data):
    """Group token rows into sentences of lower-cased words and their tags.

    A row whose index equals 1 opens a new sentence; every other row is
    appended to the sentence currently being built.

    Args:
        data: Iterable of ``(index, word, tag)`` rows.

    Returns:
        A ``(words, tags)`` pair of parallel nested lists: ``words[k]`` holds
        the lower-cased words of sentence ``k`` and ``tags[k]`` the matching
        tags.  Empty input yields ``([], [])``.
    """
    train_word = []
    train_tag = []
    for index, word, tag in data:
        # Also open a sentence when none exists yet, so input that does not
        # begin with index 1 is grouped instead of raising UnboundLocalError.
        if index == 1 or not train_word:
            train_word.append([])
            train_tag.append([])
        train_word[-1].append(str(word).lower())  # words are lower-cased
        train_tag[-1].append(tag)
    return train_word, train_tag
    

# Sentence-level word/tag lists for the training split.
train_word, train_tag = converter(data_array)
# Pair every training sentence with its tags now, before dev is merged in.
all_data_list = [(words, tags) for words, tags in zip(train_word, train_tag)]

# Same grouping for the development split.
dev_word, dev_tag = converter(dev_array)

### dev sentences are appended so the vocabulary built from train_word also
### covers dev words (the original note gives the GloVe embedding as the reason)
train_word += dev_word


## word vocabulary: the two special tokens first, then every distinct word, sorted
w_vocab = ['<unk>', '<pad>', *sorted({str(word) for seq in train_word for word in seq})]
w_to_ix = dict(zip(w_vocab, range(len(w_vocab))))
ix_to_w = dict(zip(w_to_ix.values(), w_to_ix.keys()))

## tag vocabulary: the padding tag first, then every distinct tag, sorted
t_vocab = ['<pad>', *sorted({str(word) for seq in train_tag for word in seq})]
t_to_ix = dict(zip(t_vocab, range(len(t_vocab))))
ix_to_t = dict(zip(t_to_ix.values(), t_to_ix.keys()))



class BiLSTM(nn.Module):
    """Embedding -> LSTM -> ELU -> linear tagger that scores every token.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of scores produced for each token.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings and to the
            LSTM outputs; also used between LSTM layers when ``n_layers > 1``.
        pad_idx: Embedding index reserved for padding.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0)
        # The LSTM emits hidden_dim features per direction, so the classifier
        # input is only doubled when the network really is bidirectional.
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(dropout)
        self.elu = nn.ELU()

    def forward(self, x, batched=True):
        """Return per-token scores.

        Args:
            x: When ``batched`` is true, a sequence whose element 0 is the
                padded, batch-first word-index tensor and whose element 2
                holds the unpadded length of every row (elements 1 and 3,
                the padded tags and their lengths, are accepted but unused).
                When ``batched`` is false, a word-index tensor fed straight
                to the embedding and the LSTM; the LSTM is not batch-first,
                so the layout is ``(seq_len, batch)``.
            batched: Selects between the packed and the plain code path.

        Returns:
            Score tensor with ``output_dim`` entries per token: batch-first
            for the packed path, ``(seq_len, batch, output_dim)`` otherwise.
        """
        # Follow the device the weights live on instead of assuming CUDA, so
        # the same module works for CPU-only evaluation.
        device = self.embedding.weight.device

        if batched:
            word_padded = x[0].to(device)
            x_lens = x[2]
            if torch.is_tensor(x_lens):
                # pack_padded_sequence requires the lengths on the CPU.
                x_lens = x_lens.cpu()
            embedded = self.dropout(self.embedding(word_padded))
            # Packing lets the LSTM skip the padded positions.
            x_packed = pack_padded_sequence(embedded, x_lens, batch_first=True, enforce_sorted=False)
            outputs, _ = self.lstm(x_packed)
            # Back to a padded, batch-first tensor.
            outputs, _ = pad_packed_sequence(outputs, batch_first=True)
        else:
            embedded = self.dropout(self.embedding(x.to(device)))
            outputs, _ = self.lstm(embedded)

        # Shared head: non-linearity, dropout, then the per-token classifier.
        outputs = self.elu(outputs)
        return self.fc(self.dropout(outputs))


# Hyper-parameters used to rebuild the network.
# NOTE(review): a checkpoint is loaded into this model later in the file, so
# these sizes presumably have to match the ones used for training - confirm.
INPUT_DIM = len(w_vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 128
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.33
PAD_IDX = w_vocab.index('<pad>')

model = BiLSTM(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)



### use the model to predict the dev set
## helper that finds the vocabulary index of each word
def finder(x):
    """Map each word in ``x`` to its vocabulary index.

    Words are looked up in the module-level ``w_to_ix`` mapping; a word that
    is not a key of that mapping gets the index stored under ``"<unk>"``.

    Args:
        x: Iterable of words.

    Returns:
        List of indices, one per word, in input order.
    """
    return [w_to_ix[w] if w in w_to_ix else w_to_ix["<unk>"] for w in x]

### load the best performed model 
#model.load_state_dict(torch.load('blstm2.pt'))
def tag_sentence(model, sentence, w_to_ix, ix_to_t):
    """Predict one tag per word of a single sentence.

    Args:
        model: Network called as ``model(tokens, batched=False)`` with a
            ``(seq_len, 1)`` index tensor; it is switched to eval mode.
        sentence: Sequence of words, spelled the way the keys of ``w_to_ix``
            are (the call sites in this file pass lower-cased words).
        w_to_ix: Mapping from word to index.  Words missing from it use the
            index stored under ``"<unk>"``.
        ix_to_t: Mapping from predicted index to tag string.

    Returns:
        List of tag strings, one per input word (empty for an empty sentence).
    """
    model.eval()
    if len(sentence) == 0:
        # Nothing to tag; skip the forward pass on a zero-length sequence.
        return []

    # Use the vocabulary that was passed in rather than a module-level one.
    numericalized_tokens = [
        w_to_ix[w] if w in w_to_ix else w_to_ix["<unk>"] for w in sentence
    ]
    # Shape (seq_len, 1): a batch holding this single sentence.
    token_tensor = torch.LongTensor(numericalized_tokens).unsqueeze(-1)

    # Run on whatever device the model's weights are on.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        token_tensor = token_tensor.to(first_param.device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        predictions = model(token_tensor, batched=False)

    top_predictions = predictions.argmax(-1)
    return [ix_to_t[t.item()] for t in top_predictions]



### load the trained weights onto the CPU for testing
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(
    torch.load(
        'blstm2_revised.pt',
        map_location=torch.device('cpu'),
    )
)


## read the dev file again and regroup it into sentences
dev_file = pd.read_csv(
    'dev',
    sep=" ",
    names=['index', 'word', 'tag'],
    quoting=csv.QUOTE_NONE,
)
dev_array = dev_file.to_numpy()
dev_word, dev_tag = converter(dev_array)



## nested list of predicted tags: one inner list per dev sentence,
## produced by running the model on each sentence in turn
whole_pred_tags = [tag_sentence(model, s, w_to_ix, ix_to_t) for s in dev_word]


def data_to_stream(data):
    """Group ``(index, word, tag)`` rows into sentences, keeping every field.

    A row whose index equals 1 opens a new sentence; every other row is
    appended to the sentence currently being built.

    Args:
        data: Iterable of ``(index, word, tag)`` rows.

    Returns:
        List of sentences, each a list of mutable ``[index, word, tag]``
        rows.  Empty input yields ``[]``.
    """
    data_stream = []
    for index, word, tag in data:
        # Also open a sentence when none exists yet, so input that does not
        # begin with index 1 is grouped instead of raising UnboundLocalError.
        if index == 1 or not data_stream:
            data_stream.append([])
        data_stream[-1].append([index, word, tag])
    return data_stream




### to write the output file 
import pandas as pd

import numpy as np 
import csv
dev_file = pd.read_csv(
    'dev',
    sep=" ",
    names=['index', 'word', 'tag'],
    quoting=csv.QUOTE_NONE,
)

# regroup the flat rows into sentences that keep index, word and tag
dev_array = dev_file.to_numpy()
dev_stream = data_to_stream(dev_array)


def write_out_evl(file_name, data):
    """Write sentences to ``file_name``, one ``index word tag`` row per line.

    Consecutive sentences are separated by a single blank line.

    Args:
        file_name: Path of the file to create or overwrite.
        data: Iterable of sentences, each an iterable of
            ``(index, word, tag)`` triples.
    """
    # Write UTF-8 explicitly instead of the platform default encoding, so
    # non-ASCII words cannot fail or be re-encoded depending on the locale.
    with open(file_name, 'w', encoding='utf-8') as f:
        for position, sentence in enumerate(data):
            if position:
                # blank line between sentences, none before the first
                f.write('\n')
            f.writelines(
                ' '.join((str(index), str(w), str(t))) + '\n'
                for (index, w, t) in sentence
            )
                
#### write the output file with predicted tags
import copy
dev_data_list = copy.deepcopy(dev_stream)
for sent_no, sent_tags in enumerate(whole_pred_tags):
    for tok_no, predicted in enumerate(sent_tags):
        # the last slot of each row holds the actual tag; overwrite it
        # with the predicted one
        dev_data_list[sent_no][tok_no][-1] = predicted

write_out_evl('dev2.out', dev_data_list)


### predict the test tag 
### first convert the test word into lower case 
### store the lists of test sentences into a big list 
def testdata_to_stream(data):
    data_stream = []
    i = 0
    for index, word in data:
        if index == 1:
            temp = []
            temp.append(str(word).lower()) ### convert the word into lower case 
        else:
            temp.append(str(word).lower())
        if ( (i+1 < len(data)) and data[i+1][0] == 1 ) or (i == len(data)-1):
            data_stream.append(temp)
        i += 1
    return data_stream


# test file: rows carry only an index and a word
test_file = pd.read_csv(
    'test',
    sep=" ",
    names=['index', 'word'],
    quoting=csv.QUOTE_NONE,
)
test_data = testdata_to_stream(test_file.to_numpy())

# one list of predicted tags per test sentence
predicted_test = [tag_sentence(model, s, w_to_ix, ix_to_t) for s in test_data]



#test_data to test_file
def data_t_output(data):
    """Group untagged ``(index, word)`` rows into sentences, keeping both fields.

    A row whose index equals 1 opens a new sentence; every other row is
    appended to the sentence currently being built.

    Args:
        data: Iterable of ``(index, word)`` rows.

    Returns:
        List of sentences, each a list of mutable ``[index, word]`` rows.
        Empty input yields ``[]``.
    """
    data_stream = []
    for index, word in data:
        # Also open a sentence when none exists yet, so input that does not
        # begin with index 1 is grouped instead of raising UnboundLocalError.
        if index == 1 or not data_stream:
            data_stream.append([])
        data_stream[-1].append([index, word])
    return data_stream


test_output = data_t_output(test_file.to_numpy())

#### attach the predicted tag to every [index, word] row of the output
import copy
test_data_list = copy.deepcopy(test_output)
for sent_no, sent_tags in enumerate(predicted_test):
    for tok_no, predicted in enumerate(sent_tags):
        test_data_list[sent_no][tok_no].append(predicted)


# write test2.out through the shared sentence writer: one "index word tag"
# row per line, with a blank line between sentences
write_out_evl('test2.out', test_data_list)


## build the rows for the prediction file evaluated with perl: the actual
## tag is kept and the predicted tag is added after it
import copy
dev_data_list = copy.deepcopy(dev_stream)
for sent_no, sent_tags in enumerate(whole_pred_tags):
    for tok_no, predicted in enumerate(sent_tags):
        dev_data_list[sent_no][tok_no].append(predicted)
        
def write_prediction(file_name, data):
    """Write sentences to ``file_name``, one four-field row per line.

    Each row is written as its four fields separated by single spaces (the
    caller in this file passes index, word, actual tag, predicted tag);
    consecutive sentences are separated by a single blank line.

    Args:
        file_name: Path of the file to create or overwrite.
        data: Iterable of sentences, each an iterable of four-element rows.
    """
    # Write UTF-8 explicitly instead of the platform default encoding, so
    # non-ASCII words cannot fail or be re-encoded depending on the locale.
    with open(file_name, 'w', encoding='utf-8') as f:
        for position, sentence in enumerate(data):
            if position:
                # blank line between sentences, none before the first
                f.write('\n')
            f.writelines(
                ' '.join((str(index), str(w), str(t), str(t2))) + '\n'
                for (index, w, t, t2) in sentence
            )
                
# Write the dev rows (index, word, actual tag, predicted tag) to the file
# used for the perl evaluation.
write_prediction("prediction_task2", dev_data_list)


