In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

In [24]:
def build_vocab(file_path):
    
    """
    args:
    file_path: path to the dataset file
    
    returns:
    input_vocab: dictionary containing character to index mapping for input language
    output_vocab: dictionary containing character to index mapping for output language
    input_vocab_inv: dictionary containing index to character mapping for input language
    output_vocab_inv: dictionary containing index to character mapping for output language
    
    """
    
    input_vocab = defaultdict(lambda:len(input_vocab))
    output_vocab = defaultdict(lambda:len(output_vocab))
    
    input_vocab['<PAD>'] = 0
    input_vocab['<UNK>'] = 1
    output_vocab['<PAD>'] = 0
    output_vocab['<UNK>'] = 1
    
    output_vocab['<SOS>'] = 2
    output_vocab['<EOS>'] = 3
    
    
    with open(file_path, 'r') as file:
        for line in file:
            input_sent, output_sent = line.strip().split(',')
            input_sent = input_sent.strip().strip("'")
            output_sent = output_sent.strip().strip("'")
            
            for char in input_sent:
                input_vocab[char]
                
            for char in output_sent:
                output_vocab[char]
                
                
    input_vocab = dict(input_vocab)
    output_vocab = dict(output_vocab)
    
    input_vocab_inv = {v:k for k,v in input_vocab.items()}
    output_vocab_inv = {v:k for k,v in output_vocab.items()}
    
    return input_vocab, output_vocab, input_vocab_inv, output_vocab_inv


# Use BPE tokenization to build the vocabulary

def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    import re
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

In [25]:
class DateTranslationDataset(Dataset):
    def __init__(self, file_path,input_vocab, output_vocab,max_input_len, max_output_len=10):
        
        """
        Args:
        file_path: path to the data file
        input_vocab: input vocabulary
        output_vocab: output vocabulary
        max_input_len: maximum length of input sequence
        max_output_len: maximum length of output sequence is always 10 because ouput format is YYYY-MM-DD
        """
        
        self.input_vocab = input_vocab  
        self.output_vocab = output_vocab
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        self.data = self.load_data(file_path)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])
    
    def load_data(self, file_path):
        data = []
        with open(file_path, 'r') as file:
            for line in file:
                input_sent, output_sent = line.strip().split(',')
                input_sent = input_sent.strip().strip("'")
                output_sent = output_sent.strip().strip("'")
                
                input_ids = [self.input_vocab.get(char, self.input_vocab['<UNK>']) for char in input_sent][:self.max_input_len]
                output_ids = [self.output_vocab.get(char, self.output_vocab['<UNK>']) for char in output_sent][:self.max_output_len]
                
                # padding on left side
                input_ids = [self.input_vocab['<PAD>']]*(self.max_input_len - len(input_ids)) + input_ids
                output_ids = [self.output_vocab['<PAD>']]*(self.max_output_len - len(output_ids)) + output_ids
                
                output_ids = [self.output_vocab['<SOS>']] + output_ids + [self.output_vocab['<EOS>']]
                
                data.append((input_ids, output_ids))
                
        return data
                
               
                
def get_dataloader(file_path, input_vocab, output_vocab, max_input_len, max_output_len, batch_size):
    dataset = DateTranslationDataset(file_path, input_vocab, output_vocab, max_input_len, max_output_len)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    return dataloader

# # Lets test the dataloader
# input_vocab, output_vocab, _, _ = build_vocab('Data/train.txt')
# dataloader = get_dataloader('Data/train.txt', input_vocab, output_vocab, 20, 10, 2)

# for input_batch, output_batch in dataloader:
    
#     print('Input Batch Shape:', input_batch.shape)
#     print('Output Batch Shape:', output_batch.shape)
    
#     # Lets print the first batch
#     print('Input Batch:', input_batch)
    
#     print('Output Batch:', output_batch)
    
#     break

In [26]:
class Encoder(nn.Module):
    def __init__(self, input_vocab_size, emb_dim, enc_hid_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True, batch_first = True)
        
        
    def forward(self,x):
        
        embedded = self.embedding(x)
        outputs, hidden = self.rnn(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1) # concatenate the hidden states of the forward and backward RNNs
        return outputs, hidden
    
class BahdanauAttention(nn.Module):
    """
    Use Bahdanau Attention formula is  
    e_ij = v^T * tanh(W_a * s_{i-1} + U_a * h_j) 
    where s_{i-1} is the previous hidden state of the decoder and h_j is the hidden state of the encoder
    
    alpha_ij = softmax(e_ij)

    """
    
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.W_a = nn.Linear(dec_hid_dim, dec_hid_dim)
        self.U_a = nn.Linear(enc_hid_dim*2, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
            
            # hidden = [batch_size, dec_hid_dim]
            # encoder_outputs = [batch_size, seq_len, enc_hid_dim*2]
                        
            projected_hidden = self.W_a(hidden.unsqueeze(1)) # [batch_size, 1, dec_hid_dim]
        
            energy = (torch.tanh(projected_hidden + self.U_a(encoder_outputs))) # [batch_size, seq_len, dec_hid_dim]
            
            
            attention = self.v(energy).squeeze(2) # [batch_size, seq_len]
            
            attention_weights = torch.softmax(attention, dim = 1) # [batch_size, seq_len]
            
            return attention_weights

class ConcatAttention(nn.Module):
    """
    Use Concatenative Attention formula is
    e_ij = v^T * tanh(W_a *[s_{i-1}; h_j])
    where s_{i-1} is the previous hidden state of the decoder and h_j is the hidden state of the encoder
    """
    
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.W_a = nn.Linear((enc_hid_dim*2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
            
            # hidden = [batch_size, dec_hid_dim]
            # encoder_outputs = [batch_size, seq_len, enc_hid_dim*2]
            
            hidden = hidden.unsqueeze(1).repeat(1, encoder_outputs.shape[1], 1) # [batch_size, seq_len, dec_hid_dim]
            
            energy = torch.tanh(self.W_a(torch.cat((hidden, encoder_outputs), dim = 2))) # [batch_size, seq_len, dec_hid_dim]
            
            attention = self.v(energy).squeeze(2) # [batch_size, seq_len]
            
            attention_weights = torch.softmax(attention, dim = 1) # [batch_size, seq_len]
            
            return attention_weights

class Decoder(nn.Module):
    
    """
    Decoder with attention mechanism
    """
    def __init__(self, output_vocab_size, emb_dim, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.output_vocab_size = output_vocab_size
        self.attention = BahdanauAttention(enc_hid_dim, dec_hid_dim)
        # self.concat_attention = ConcatAttention(enc_hid_dim, dec_hid_dim)
        self.embedding = nn.Embedding(output_vocab_size, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim*2) + emb_dim, dec_hid_dim, batch_first = True) # we are passing the context vector and the embedded token as input by concatenating them
        self.fc = nn.Linear(dec_hid_dim, output_vocab_size) 
        
        
    def forward(self, x, hidden, encoder_outputs):
            
            # x = [batch_size]
            # hidden = [batch_size, dec_hid_dim]
            # encoder_outputs = [batch_size, seq_len, enc_hid_dim*2]
            
            x = x.unsqueeze(1) # [batch_size, 1]
            embedded = self.embedding(x) # [batch_size, 1, emb_dim]
            
            attention_weights = self.attention(hidden, encoder_outputs) # [batch_size, seq_len]
            # attention_weights = self.concat_attention(hidden, encoder_outputs) # [batch_size, seq_len]
            attention_weights = attention_weights.unsqueeze(1) # [batch_size, 1, seq_len]
            
            context_vector = torch.bmm(attention_weights, encoder_outputs) # [batch_size, 1, enc_hid_dim*2]
            
            rnn_input = torch.cat((embedded, context_vector), dim = 2) # [batch_size, 1, (enc_hid_dim*2) + emb_dim]
            
            output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0)) # output = [batch_size, 1, dec_hid_dim], hidden = [1, batch_size, dec_hid_dim]
            
            prediction = self.fc(output.squeeze(1)) # [batch_size, output_vocab_size]
            
            return prediction, hidden.squeeze(0), attention_weights.squeeze(1)
            
        
    
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, input, target, teacher_forcing_ratio = 0.5):
        
        # input = [batch_size, seq_len]
        # target = [batch_size, seq_len]
        
        batch_size = input.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.decoder.output_vocab_size
        
        outputs = torch.zeros(batch_size, target_len-1, target_vocab_size).to(self.device)
        
        attention_scores = torch.zeros(batch_size, target_len-1, input.shape[1]).to(self.device)
        
        
        encoder_outputs, hidden = self.encoder(input) 
        
        x = target[:,0] # <SOS> token
        
        for t in range(1, target_len):
            
            output, hidden, attention_weights = self.decoder(x, hidden, encoder_outputs)
            
            attention_scores[:,t-1] = attention_weights
            
            outputs[:,t-1] = output 
            
            teacher_force = torch.rand(1) < teacher_forcing_ratio
            
            top1 = output.argmax(1)
            
            # x = target[:,t] if teacher_force else top1 # if teacher_force is True, we use the actual target token, else we use the predicted token
            x = top1 # we are not using teacher forcing
        return outputs, attention_scores
    

In [27]:
def train(model,trainloader,epochs,optimizer,criterion,device):
    
    model.train()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        
        print('*'*20 + f'Epoch {epoch+1}' + '*'*20)
        for src,tgt in trainloader:
            src = src.to(device)
            tgt = tgt.to(device)
            optimizer.zero_grad()
            output,_ = model(src,tgt)
            
            tgt = tgt[:,1:]
            
            output_dim = output.shape[-1]
            output = output.reshape(-1,output_dim)
            tgt = tgt.reshape(-1)
    
            loss = criterion(output,tgt)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch: {epoch+1:02}')
        
        
        print(f'Train Loss: {epoch_loss/len(trainloader):.3f}')
        print(f'Validation Loss: {evaluate(model,validloader,criterion,device):.3f}')
        
    # Save the model in Models folder
    torch.save(model.state_dict(), '../Assignment2/Models/model4.pth')


In [28]:
def evaluate(model,validloader,criterion,device):
    model.eval()
    epoch_loss = 0
    accuracy = 0 
    total = 0
    with torch.no_grad():
        for src,tgt in validloader:
            src = src.to(device)
            tgt = tgt.to(device)
            output,_ = model(src,tgt,0) #turn off teacher forcing  
            output_dim = output.shape[-1]
            output = output.reshape(-1,output_dim)
            tgt = tgt[:,1:]
            tgt = tgt.reshape(-1)
            loss = criterion(output,tgt)
            
           # check if all the characters are correct , if yes then increment the accuracy
            # print(torch.argmax(output,dim=1).shape,tgt.shape)
            accuracy += torch.sum(torch.argmax(output,dim=1) == tgt).item()
            total += tgt.shape[0]
            epoch_loss += loss.item()
            
    print(f'Validation Accuracy: {accuracy/total:.3f}')
        
    return epoch_loss/len(validloader)

In [29]:
def predict(model,src,src_vocab,tgt_vocab,tgt_inv_vocab,max_len,device):
    
 
    src = torch.tensor([src_vocab.get(char,src_vocab['<UNK>']) for char in src]).unsqueeze(0).to(device)
    
    tgt = [tgt_vocab['<SOS>']]+[tgt_vocab['<PAD>']]*max_len+[tgt_vocab['<EOS>']]
    tgt = torch.tensor(tgt).unsqueeze(0).to(device)
    
    outputs,attention_scores = model(src,tgt,0)
    
    outputs = outputs.squeeze(0)
    
    print(outputs.shape)
    decoder_outputs = []
    for output in outputs:
            output = output.argmax(0).item()
            
            if output == tgt_vocab['<EOS>']:
                break
            decoder_outputs.append(tgt_inv_vocab[output])
            # decoder_outputs.append(output)
    # return "".join(decoder_outputs)
    return decoder_outputs,attention_scores

In [30]:
input_vocab, output_vocab, input_vocab_inv,output_vocab_inv = build_vocab('../Assignment2/Data/Assignment2_train.txt')
# /data2/home/kpnaveen/DLNLP/Assignment2/Data/Assignment2_train.txt
input_vocab_size = len(input_vocab)
output_vocab_size = len(output_vocab)
max_input_len = 16
max_output_len = 10
batch_size = 32
embedding_size = 128 # 128 used for model with accuracy 0.88
enc_hidden_size = 128  #use 128 for model with accuracy 0.88
dec_hidden_size = 2*128 #use 2*128 for model with accuracy 0.88
learning_rate = 0.0015
num_epochs = 3 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trainloader = get_dataloader('../Assignment2/Data/Assignment2_train.txt', input_vocab, output_vocab, max_input_len, max_output_len, batch_size)
validloader = get_dataloader('../Assignment2/Data/Assignment2_validation.txt', input_vocab, output_vocab, max_input_len, max_output_len, batch_size)

In [31]:
encoder = Encoder(input_vocab_size, embedding_size, enc_hidden_size)
decoder = Decoder(output_vocab_size, embedding_size, enc_hidden_size, dec_hidden_size)
model = Seq2Seq(encoder, decoder, device).to(device)

In [32]:
criterion = nn.CrossEntropyLoss(ignore_index = output_vocab['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [33]:
train(model,trainloader,num_epochs,optimizer,criterion,device)

********************Epoch 1********************
Epoch: 01
Train Loss: 0.241
Validation Accuracy: 0.947
Validation Loss: 0.135
********************Epoch 2********************
Epoch: 02
Train Loss: 0.137
Validation Accuracy: 0.948
Validation Loss: 0.133
********************Epoch 3********************
Epoch: 03
Train Loss: 0.137
Validation Accuracy: 0.948
Validation Loss: 0.134


In [34]:
model.load_state_dict(torch.load('../Assignment2/Models/model4.pth'))

  model.load_state_dict(torch.load('../Assignment2/Models/model4.pth'))


<All keys matched successfully>

In [35]:
def attention_visualization(model,src,input_vocab,output_vocab,output_vocab_inv,max_output_len,device):
    outputs,attention_scores = predict(model,src,input_vocab,output_vocab,output_vocab_inv,max_output_len,device)
    src_tokens = [char for char in src]
    tgt_tokens = outputs
    
    #convert attention scores to numpy
    
    attention_scores = attention_scores.squeeze(0).cpu().detach().numpy() # [tgt_len, src_len]
    
   
    
    print('Source:', src)
    print('Predicted:', "".join(outputs))
    
    
    fig, ax = plt.subplots(figsize=(12,12))
    cax=ax.matshow(attention_scores, cmap='bone')
    
    ax.set_xticks(np.arange(len(src_tokens)))
    ax.set_yticks(np.arange(len(tgt_tokens)))
  
    
    ax.set_xticklabels(src_tokens, rotation=90,)
    ax.set_yticklabels(tgt_tokens)
    
    ax.set_xlabel('Input Sequence')
    ax.set_ylabel('Output Sequence')
    
    fig.colorbar(cax)
    
    plt.show()
    
    #save the plot
    
    plt.savefig('../Assignment2/plots/attention4.png')
    
    

# attention_visualization(model,'29 March 2022',input_vocab,output_vocab,output_vocab_inv,max_output_len,device)

In [36]:
# input_vocab, output_vocab, input_vocab_inv,output_vocab_inv = build_vocab('../Assignment2/Data/Assignment2_train.txt')
# input_vocab_size = len(input_vocab)
# output_vocab_size = len(output_vocab)
# max_input_len = 20
# max_output_len = 10
# batch_size = 32
# embedding_size = 128
# enc_hidden_size = 128
# dec_hidden_size = 2*128
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# encoder = Encoder(input_vocab_size, embedding_size, enc_hidden_size)
# decoder = Decoder(output_vocab_size, embedding_size, enc_hidden_size, dec_hidden_size)
# model = Seq2Seq(encoder, decoder, device).to(device)



In [37]:
model.load_state_dict(torch.load('../Assignment2/Models/model4.pth'))

  model.load_state_dict(torch.load('../Assignment2/Models/model4.pth'))


<All keys matched successfully>

In [38]:
# Now load validation.txt  where each line is a pair of input and output date separated by a comma, so give input to the predict function and compare whether the output is correct or not
def predict(model,src,src_vocab,tgt_vocab,tgt_inv_vocab,max_len,device):
    
 
    src = torch.tensor([src_vocab.get(char,src_vocab['<UNK>']) for char in src]).unsqueeze(0).to(device)
    
    tgt = [tgt_vocab['<SOS>']]+[tgt_vocab['<PAD>']]*max_len+[tgt_vocab['<EOS>']]
    tgt = torch.tensor(tgt).unsqueeze(0).to(device)
    
    outputs,attention_scores = model(src,tgt,0)
    
    outputs = outputs.squeeze(0)
   
    decoder_outputs = []
    for output in outputs:
            output = output.argmax(0).item()
            
            if output == tgt_vocab['<EOS>']:
                break
            decoder_outputs.append(tgt_inv_vocab[output])
            # decoder_outputs.append(output)
    return "".join(decoder_outputs)
    

In [39]:
actual_outputs = []
predicted_outputs = []

with open('../Assignment2/Data/Assignment2_validation.txt', 'r') as file:
    for line in file:
        input_sent, output_sent = line.strip().split(',')
        input_sent = input_sent.strip().strip("'")
        output_sent = output_sent.strip().strip("'")
        
        output = predict(model, input_sent, input_vocab, output_vocab, output_vocab_inv, max_output_len, device)
        
        actual_outputs.append(output_sent)
        predicted_outputs.append(output)
        
        
        
print('All tests passed')

All tests passed


In [40]:
# save these actual and  predicted in  a txt file
with open('../Assignment2/Data/predictions.txt', 'w') as file:
    for actual, predicted in zip(actual_outputs, predicted_outputs):
        file.write(actual + ',' + predicted + '\n')

In [41]:
def calculate_all_errors(actual_outputs, predicted_outputs):
    
    exact_match_error = 0
    mismatch_error = 0
    position_errors = [0]*10
    
    less_than_10 = 0
    
    for actual, predicted in zip(actual_outputs, predicted_outputs):
        
        if len(actual) != 10 or len(predicted) != 10:
            less_than_10 += 1
            continue
        
        exact_match_error += 1 if actual == predicted else 0
        for i in range(10):
            mismatch_error += 1 if actual[i] != predicted[i] else 0
            position_errors[i] += 1 if actual[i] != predicted[i] else 0
            
    highest_error = position_errors.index(max(position_errors)) + 1
    lowest_error = position_errors.index(min(position_errors)) + 1
    
    print("Excat matches ", exact_match_error)
    print("Less than 10 ", less_than_10)
        
    exact_match_error = (exact_match_error/len(actual_outputs))*100
    mismatch_error = (mismatch_error/(len(actual_outputs)*10))*100
    
    return exact_match_error, mismatch_error, highest_error, lowest_error

exact_match_error, mismatch_error, highest_error, lowest_error = calculate_all_errors(actual_outputs, predicted_outputs)




Excat matches  3842
Less than 10  3


In [42]:
print('Exact Match Error:', exact_match_error)
print('Mismatch Error:', mismatch_error)
print('Highest Error:', highest_error)
print('Lowest Error:', lowest_error)

Exact Match Error: 96.05
Mismatch Error: 0.49750000000000005
Highest Error: 2
Lowest Error: 5


In [62]:
class TestDateTranslationDataset(Dataset):
    def __init__(self, file_path,input_vocab, output_vocab,max_input_len, max_output_len=10):
        
        """
        Args:
        file_path: path to the data file
        input_vocab: input vocabulary
        output_vocab: output vocabulary
        max_input_len: maximum length of input sequence
        max_output_len: maximum length of output sequence is always 10 because ouput format is YYYY-MM-DD
        """
        
        self.input_vocab = input_vocab  
        self.output_vocab = output_vocab
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        self.data = self.load_data(file_path)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])
    
    def load_data(self, file_path):
        data = []
        with open(file_path, 'r') as file:
            for line in file:
                # input_sent, output_sent = line.strip().split(',')
                input_sent = line.strip()
                input_sent = input_sent.strip().strip("'")
                # output_sent = output_sent.strip().strip("'")
                
                input_ids = [self.input_vocab.get(char, self.input_vocab['<UNK>']) for char in input_sent][:self.max_input_len]
                # output_ids = [self.output_vocab.get(char, self.output_vocab['<UNK>']) for char in output_sent][:self.max_output_len]
                
                # padding on left side
                input_ids = [self.input_vocab['<PAD>']]*(self.max_input_len - len(input_ids)) + input_ids
                # output_ids = [self.output_vocab['<PAD>']]*(self.max_output_len - len(output_ids)) + output_ids
                
                # output_ids = [self.output_vocab['<SOS>']] + output_ids + [self.output_vocab['<EOS>']]
                
                # data.append((input_ids, output_ids))
                data.append((input_ids))
                
        return data
                
               
                
def get_test_dataloader(file_path, input_vocab, output_vocab, max_input_len, max_output_len, batch_size):
    dataset = TestDateTranslationDataset(file_path, input_vocab, output_vocab, max_input_len, max_output_len)
    dataloader = DataLoader(dataset, batch_size=batch_size,shuffle=False)
    return dataloader

In [63]:
testloader = get_test_dataloader('../Assignment2/Data/Assignment2_Test.txt', input_vocab, output_vocab, max_input_len, max_output_len, batch_size)

In [64]:
test_actual_outputs = []
test_predicted_outputs = []

with open('../Assignment2/Data/Assignment2_Test.txt', 'r') as file:
    for line in file:
        # input_sent, output_sent = line.strip().split(',')
        input_sent = line.strip()
        input_sent = input_sent.strip().strip("'")
        # output_sent = output_sent.strip().strip("'")
        
        output = predict(model, input_sent, input_vocab, output_vocab, output_vocab_inv, max_output_len, device)
        
        test_actual_outputs.append(output_sent)
        test_predicted_outputs.append(output)
        
        
        
print('All tests passed')

All tests passed


In [65]:
# save these actual and  predicted in  a txt file
with open('../Assignment2/Data/test_predictions.txt', 'w') as file:
    for predicted in test_predicted_outputs:
        file.write(predicted+'\n')

In [66]:
#read the file and make list of the test actual outputs
test_actual_outputs = []
with open('../Assignment2/Data/Assignment2_LabeledTestSet.txt', 'r') as file:
    for line in file:
        # test_actual_outputs.append(line.strip())
        input_sent, output_sent = line.strip().split(',')
        input_sent = input_sent.strip().strip("'")
        output_sent = output_sent.strip().strip("'")

        #make list of input sentences and output sentences
        test_actual_outputs.append(output_sent)
        
        # with open('../Assignment2/Data/Assignment2_validation.txt', 'r') as file:
    

In [67]:
test_actual_outputs

['1733-08-14',
 '1625-11-24',
 '1723-01-24',
 '1551-11-18',
 '1591-08-10',
 '2064-11-03',
 '1554-06-21',
 '1661-12-29',
 '1919-12-15',
 '1727-05-08',
 '2064-02-08',
 '1705-08-27',
 '1572-05-28',
 '1860-11-07',
 '1761-10-19',
 '1816-03-30',
 '1795-08-28',
 '1793-12-21',
 '1980-03-29',
 '1706-08-17',
 '1775-12-11',
 '1899-02-20',
 '2035-09-12',
 '1893-08-24',
 '1897-07-31',
 '1624-04-28',
 '1562-12-30',
 '1580-05-02',
 '1665-01-01',
 '1861-09-22',
 '1872-01-01',
 '1992-04-12',
 '1885-10-20',
 '1649-05-20',
 '1600-08-11',
 '1644-09-22',
 '1572-05-21',
 '1820-06-25',
 '2055-05-18',
 '2057-11-18',
 '1946-11-17',
 '1658-05-25',
 '1740-09-03',
 '1986-11-22',
 '1950-03-24',
 '1864-03-29',
 '1658-01-26',
 '1939-03-07',
 '1793-05-27',
 '1845-12-09',
 '1806-09-05',
 '1648-04-13',
 '1702-04-08',
 '2011-08-28',
 '1956-07-12',
 '1521-11-24',
 '1880-09-07',
 '1568-08-11',
 '1925-10-24',
 '1601-07-07',
 '1952-01-07',
 '2057-04-08',
 '1825-04-20',
 '1674-09-05',
 '1875-07-14',
 '1993-02-13',
 '1764-06-

In [68]:
# now the actual predicted outputs are saved in the file, now we can calculate the errors
test_exact_match_error, test_mismatch_error, test_highest_error, test_lowest_error = calculate_all_errors(test_actual_outputs, test_predicted_outputs)

Excat matches  9589
Less than 10  6


In [69]:
print('Exact Match Error:', test_exact_match_error)
print('Mismatch Error:', test_mismatch_error)
print('Highest Error:', test_highest_error)
print('Lowest Error:', test_lowest_error)

Exact Match Error: 95.89
Mismatch Error: 0.511
Highest Error: 2
Lowest Error: 5
