# neuralmt: default program

In [2]:
from neuralml import *
import os, sys

## Run the default solution on dev

In [None]:
# Instantiate the model shell and restore the saved checkpoint into it.
model = Seq2Seq(build=False)
model.load(os.path.join('../data', 'seq2seq_E049.pt'))

# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

# loading test dataset (sys.maxsize is passed as the line limit)
test_iter = loadTestData(
    os.path.join('../data', 'input', 'dev.txt'),
    model.fields['src'],
    device=device,
    linesToLoad=sys.maxsize,
)

# Warning: will take >5mins depending on your machine
results = translate(model, test_iter)
print(*results, sep="\n")

980it [00:56, 17.35it/s]

## Evaluate the default output

In [15]:
from bleu_check import bleu
# Read the reference translations, one sentence per line. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform default.
with open(os.path.join('../data','reference','dev.out'), encoding='utf-8') as r:
    ref_t = r.read().strip().splitlines()
# `results` is the list of hypotheses produced by the translation cell.
print(bleu(ref_t, results))

BLEU = 2.49 28.3/5.4/1.3/0.4 (BP = 0.854 ratio = 0.863 hyp_len = 21502 ref_len = 24902)


In [82]:
import os
import re
import sys
import optparse
from tqdm import tqdm

import torch
from torch import nn

import pandas as pd
from torchtext import data

# hyperparameters
class hp:
    """Plain namespace holding the notebook's hyperparameters as class attributes."""

    # vocab (reserved indices; presumably padding / start-of-sentence -- confirm
    # against the vocabulary built by the data loader)
    pad_idx, sos_idx = 1, 2

    # architecture
    hidden_dim = embed_dim = 256
    n_layers = 2
    dropout = 0.2
    batch_size = 32
    num_epochs = 10
    lexicon_cap = 25000

    # training
    max_lr = 0.0001
    cycle_length = 3000

    # generation
    max_len = 50

    # system: CUDA device when available, CPU otherwise
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---YOUR ASSIGNMENT---
# -- Step 1: Baseline ---
# The attention module is completely broken now. Fix it using the definition
# given in the HW description.
class AttentionModule(nn.Module):
    """Additive attention over the encoder outputs.

    Each source position is scored with
    ``V_att(tanh(W_enc(encoder_out) + W_dec(decoder_hidden)))``; the scores are
    softmax-normalised over the source positions and used to build a weighted
    sum of the encoder outputs (the context vector).
    """

    def __init__(self, attention_dim):
        """
        You shouldn't delete/change any of the following defs, they are
        essential for successfully loading the saved model.
        """
        super(AttentionModule, self).__init__()
        self.W_enc = nn.Linear(attention_dim, attention_dim, bias=False)
        self.W_dec = nn.Linear(attention_dim, attention_dim, bias=False)
        self.V_att = nn.Linear(attention_dim, 1, bias=False)

    def calcAlpha(self, decoder_hidden, encoder_out):
        """Compute the attention weights over the source positions.

        param encoder_out: (seq, batch, dim)
        param decoder_hidden: must broadcast against encoder_out when added;
            presumably (1, batch, dim) -- confirm against the decoder call site
        return: alpha of shape (1, batch, seq); sums to 1 over the last axis
        """
        enc_proj = self.W_enc(encoder_out)
        dec_proj = self.W_dec(decoder_hidden)
        # (seq, batch, 1): one unnormalised score per source position
        scores = self.V_att(torch.tanh(enc_proj + dec_proj))
        # Move seq to the last axis -> (1, batch, seq), then normalise over it
        scores = scores.permute(2, 1, 0)
        return torch.nn.functional.softmax(scores, dim=-1)

    def forward(self, decoder_hidden, encoder_out):
        """Return the context vector and the attention weights.

        encoder_out: (seq, batch, dim)
        decoder_hidden: see calcAlpha
        return: (context, alpha) where context is (batch, 1, dim) and alpha is
            (seq, 1, batch)
        """
        alpha = self.calcAlpha(decoder_hidden, encoder_out)  # (1, batch, seq)
        # Pair each example's weights with its own encoder states. A plain
        # matmul of (1, batch, seq) with (batch, seq, dim) broadcasts to
        # (batch, batch, dim) and only works when batch == 1.
        weights = alpha.permute(1, 0, 2)       # (batch, 1, seq)
        values = encoder_out.permute(1, 0, 2)  # (batch, seq, dim)
        context = torch.bmm(weights, values)   # (batch, 1, dim)
        return context, alpha.permute(2, 0, 1)


# -- Step 2: Improvements ---
# Implement UNK replacement, BeamSearch, translation termination criteria here,
# you can change 'greedyDecoder' and 'translate'.
def greedyDecoder(decoder, encoder_out, encoder_hidden, maxLen,
                  eos_index):
    """Greedily decode at most ``maxLen`` steps, feeding back the argmax token.

    param decoder: callable ``decoder(prev_tokens, encoder_out, hidden)``
        returning ``(output, hidden, alpha)``; must expose ``n_layers`` and
        ``target_vocab_size``
    param encoder_out: (seq, batch, dim)
    param encoder_hidden: encoder state; its last ``decoder.n_layers`` entries
        seed the decoder state
    param maxLen: maximum number of decoding steps
    param eos_index: index of the end-of-sentence token (also used as the
        start token)
    return: ``(outputs, alphas)`` with outputs of shape
        (maxLen, batch, target_vocab_size) and alphas of shape
        (batch, seq, maxLen). Steps after an early stop are all zeros.
    """
    seq1_len, batch_size, _ = encoder_out.size()
    target_vocab_size = decoder.target_vocab_size

    # Zero-initialised so that the steps skipped by an early stop are
    # well-defined instead of exposing uninitialised memory.
    outputs = torch.zeros(maxLen, batch_size, target_vocab_size,
                          dtype=encoder_out.dtype, device=encoder_out.device)
    # NOTE: allocated on the default device, as before.
    alphas = torch.zeros(maxLen, batch_size, seq1_len)
    # take what we need from encoder
    decoder_hidden = encoder_hidden[-decoder.n_layers:]
    # start token (ugly hack): decoding is seeded with the EOS index
    output = torch.zeros(1, batch_size, dtype=torch.long,
                         device=encoder_out.device).fill_(eos_index)
    finished = None  # per-sequence flag: has EOS been emitted yet?
    for t in range(maxLen):
        output, decoder_hidden, alpha = decoder(
            output, encoder_out, decoder_hidden)

        # Assumes alpha arrives as (seq, 1, batch), which is what
        # AttentionModule.forward returns -- TODO confirm the decoder passes it
        # through unchanged. Reorder to (batch, 1, seq) and drop the singleton
        # axis so it fits the (batch, seq) slot.
        alpha = alpha.detach().permute(2, 1, 0).squeeze(1)

        outputs[t] = output
        alphas[t] = alpha
        # Greedy choice: most probable token per sequence, shape (1, batch)
        output = output.detach().max(dim=2)[1]
        hit_eos = output.reshape(-1) == eos_index
        finished = hit_eos if finished is None else finished | hit_eos
        # Stop once every sequence in the batch has produced EOS.
        if bool(finished.all()):
            break
    return outputs, alphas.permute(1, 2, 0)


def translate(model, test_iter):
    """Translate every batch yielded by ``test_iter`` and return the strings.

    param model: callable returning ``(output, attention)`` for ``batch.src``
        and exposing ``tgt2txt`` to turn token indices into text
    param test_iter: iterable of batches with a ``src`` attribute
    return: list with one translated string per batch, cut at the first
        '<EOS>' marker
    """
    results = []
    # Inference only: disable autograd so no graph is built for each batch.
    with torch.no_grad():
        for _, batch in tqdm(enumerate(test_iter)):
            output, _ = model(batch.src)
            # Index of the highest-scoring token at every step
            best = output.topk(1)[1]
            # Only the first sequence of the batch is converted to text.
            text = model.tgt2txt(best[:, 0].data).strip().split('<EOS>')[0]
            results.append(text)
    return results

In [87]:
# Instantiate the model shell and restore the saved checkpoint into it.
model = Seq2Seq(build=False)
model.load(os.path.join('../data', 'seq2seq_E049.pt'))

# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

# loading test dataset (sys.maxsize is passed as the line limit)
test_iter = loadTestData(
    os.path.join('../data', 'input', 'dev.txt'),
    model.fields['src'],
    device=device,
    linesToLoad=sys.maxsize,
)

# Translate with the AttentionModule / greedyDecoder / translate defined in this notebook.
results = translate(model, test_iter)



0it [00:00, ?it/s][A[A

3it [00:00, 20.92it/s][A[A

6it [00:00, 20.93it/s][A[A

9it [00:00, 21.22it/s][A[A

12it [00:00, 21.63it/s][A[A

15it [00:00, 21.99it/s][A[A

18it [00:00, 22.16it/s][A[A

21it [00:00, 22.28it/s][A[A

24it [00:01, 22.35it/s][A[A

27it [00:01, 22.41it/s][A[A

30it [00:01, 22.34it/s][A[A

33it [00:01, 22.41it/s][A[A

36it [00:01, 22.51it/s][A[A

39it [00:01, 22.49it/s][A[A

42it [00:01, 22.50it/s][A[A

45it [00:02, 22.55it/s][A[A

48it [00:02, 22.58it/s][A[A

51it [00:02, 22.62it/s][A[A

54it [00:02, 22.43it/s][A[A

57it [00:02, 22.12it/s][A[A

60it [00:02, 22.21it/s][A[A

63it [00:02, 22.34it/s][A[A

66it [00:02, 22.39it/s][A[A

69it [00:03, 22.47it/s][A[A

72it [00:03, 22.52it/s][A[A

75it [00:03, 22.53it/s][A[A

78it [00:03, 22.55it/s][A[A

81it [00:03, 22.52it/s][A[A

84it [00:03, 22.59it/s][A[A

87it [00:03, 22.55it/s][A[A

90it [00:04, 22.60it/s][A[A

93it [00:04, 22.56it/s][A[A

96it [00:04, 2

In [86]:
# Read the reference translations, one sentence per line. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform default.
with open(os.path.join('../data','reference','dev.out'), encoding='utf-8') as r:
    ref_t = r.read().strip().splitlines()
# `bleu` comes from the earlier `from bleu_check import bleu` cell; `results`
# from the translation cell -- run both before this one.
print(bleu(ref_t, results))

BLEU = 7.64 38.9/12.1/4.3/1.7 (BP = 1.000 ratio = 1.036 hyp_len = 25794 ref_len = 24902)


## Documentation

Write some beautiful documentation of your program here.

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?