# Project 2

##### import needed packages

In [1]:
from tqdm import tqdm_notebook, tqdm
from collections import defaultdict,Counter
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from random import shuffle

# Classes and Functions

## Data Manipulation

### Cleaning and preprocessing

In [2]:
def preprocess(train_data, val_data, test_data, vocab_size=10000):
    """Print the shell commands that tokenize, lowercase and BPE-encode the data.

    Nothing is executed here: every command is only printed, so that the
    (slow) preprocessing can be run manually from the project root.

    For a raw file ``<stem>.<ext>`` of language ``<lang>`` the printed commands
    produce ``<stem>_tokenized.<lang>``, ``<stem>_tokenized_lowercased.<lang>``
    and ``<stem>_tokenized_lowercased_bpe.<lang>``.

    Args:
        train_data: dict mapping a language code to the path of the raw
            training file. The keys ``'en'`` and ``'fr'`` are required,
            because the BPE codes are learned jointly on both training files.
        val_data: dict mapping a language code to the raw validation file.
        test_data: dict mapping a language code to the raw test file.
        vocab_size: number of BPE symbols, passed to
            ``learn_joint_bpe_and_vocab.py`` through ``-s``.

    Returns:
        None.
    """
    import os  # local import: the notebook's import cell does not provide it

    def strip_extension(path):
        # splitext only inspects the last path component, so dots in directory
        # names (e.g. './data/...') are left alone and a path without any
        # extension is returned unchanged.
        return os.path.splitext(path)[0]

    datasets = [train_data, val_data, test_data]
    moses = 'perl tools/mosesdecoder/scripts/tokenizer/'

    # Tokenize, then lowercase, every source and target file
    for data in datasets:
        for lang, raw_path in data.items():
            stem = strip_extension(raw_path)
            tokenized_path = '{}_tokenized.{}'.format(stem, lang)
            lowercase_path = '{}_tokenized_lowercased.{}'.format(stem, lang)

            tokenize_command = (
                moses + 'tokenizer.perl -l {lang} < {file_path} > {output_path}'
            ).format(lang=lang, file_path=raw_path, output_path=tokenized_path)
            print('tokenize command:\t{}'.format(tokenize_command))

            lowercase_command = (
                moses + 'lowercase.perl < {file_path} > {output_path}'
            ).format(file_path=tokenized_path, output_path=lowercase_path)
            print('lowercase command:\t{}\n'.format(lowercase_command))

    # BPE: learn the codes and the vocabularies on the training data only.
    # The codes and vocab files live in the first component of the English
    # training path ('data' for 'data/train/train.en').
    train_en_path = train_data['en']
    data_root = train_en_path.split('/', 1)[0] if '/' in train_en_path else '.'
    vocab_file_name = data_root + '/vocab'
    codes_file_name = data_root + '/codes.bpe'

    learn_script = 'python tools/subword-nmt/subword_nmt/learn_joint_bpe_and_vocab.py'
    learn_args = ' --input {train_en} {train_fr} -s {num_symbols} -o {codes_file} --write-vocabulary {vocab_file}.en {vocab_file}.fr'
    learn_vocab_command = learn_script + learn_args.format(
        # Derived from the arguments so that the command follows train_data
        train_en=strip_extension(train_en_path) + '_tokenized_lowercased.en',
        train_fr=strip_extension(train_data['fr']) + '_tokenized_lowercased.fr',
        num_symbols=str(vocab_size),
        codes_file=codes_file_name,
        vocab_file=vocab_file_name,
    )
    print('learn vocab command:\t{}'.format(learn_vocab_command))

    # Apply the learned codes to all files the same way for consistency
    apply_script = 'python tools/subword-nmt/subword_nmt/apply_bpe.py'
    apply_args = ' -c {codes_file} --vocabulary {vocab_file}.{lang} --vocabulary-threshold 50 < {train_file}.{lang} > {train_file}_bpe.{lang}'
    for data in datasets:
        for lang, raw_path in data.items():
            bpe_command = apply_script + apply_args.format(
                codes_file=codes_file_name,
                vocab_file=vocab_file_name,
                lang=lang,
                train_file=strip_extension(raw_path) + '_tokenized_lowercased',
            )
            print('bpe command:\t{}'.format(bpe_command))

### Building dictionaries and vocabularies

In [3]:
# Vocabulary entry that stands in for every word not present in the vocabulary.
UNK = '<UNK>'
# Marker placed in front of the first token of every loaded sentence.
START = '<START>'
# Marker placed after the last token of every loaded sentence.
END = '<END>'

In [85]:
# NOTE: tokenization, lowercasing and BPE are NOT done here. They are too slow to
# rerun every time, so they are done offline (see `preprocess`).
# The vocab size given to the BPE step is a different quantity: it only limits the
# number of BPE symbols and has nothing to do with the vocab size of this class.
class _FallbackIndexDict(dict):
    """Word-to-index table that answers unknown words with a fixed index.

    Unlike a ``defaultdict``, looking up a missing word does not insert it, so
    the size of the table and ``word in table`` tests are not changed by
    lookups.
    """

    def __init__(self, fallback_index):
        super().__init__()
        self.fallback_index = fallback_index

    def __missing__(self, word):
        return self.fallback_index


class DataProcessor:
    """Loads a preprocessed text file and builds its vocabulary and lookup tables.

    Args:
        file_name: path of a UTF-8 text file with one sentence per line and
            whitespace-separated tokens.
        vocab_size: maximum number of distinct tokens to keep (most frequent
            first); ``None`` keeps all of them. The UNK token is added on top.
    """

    def __init__(self, file_name, vocab_size):
        # Length (in tokens, START/END included) of the longest sentence;
        # stays -1 when the file contains no lines.
        self.max_sentence_length = -1
        # Requested limit; replaced below by the real vocabulary size.
        self.vocab_size = vocab_size

        self.file_name = file_name
        # List of token lists, each wrapped in START ... END.
        self.sentences = self.load_data()
        self.vocab, self.vocab_size = self.build_vocab()
        self.w2i, self.i2w = self.build_dicts()

    def load_data(self):
        """Read the file and return one token list per line, wrapped in START/END."""
        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8, which would garble accented (e.g. French) characters.
        with open(self.file_name, 'r', encoding='utf-8') as f:
            return [[START, *raw_line.split(), END] for raw_line in f]

    def build_dicts(self):
        """
        creates lookup tables to find the index given the word
        and the other way around.
        Looking up a word that is not in the vocabulary in the word-to-index
        table returns the index of the UNK token.
        """
        w2i = _FallbackIndexDict(fallback_index=self.vocab.index(UNK))
        i2w = dict()
        for i, w in enumerate(self.vocab):
            i2w[i] = w
            w2i[w] = i
        return w2i, i2w

    def build_vocab(self):
        """
        builds a vocabulary with the most occurring words, in addition to
        the UNK token at index 0.
        START and END tokens are added to the vocabulary through the
        preprocessed sentences.
        with vocab size none, all existing words in the data are used.
        Also updates max_sentence_length as a side effect.

        Returns the vocabulary (list of words) and its length.
        """
        counts = Counter()
        for sentence in self.sentences:
            counts.update(sentence)
            self.max_sentence_length = max(self.max_sentence_length, len(sentence))

        # UNK is reserved for index 0; a literal occurrence in the data must
        # not produce a second vocabulary entry.
        counts.pop(UNK, None)

        vocab = [UNK] + [word for word, _ in counts.most_common(self.vocab_size)]
        return vocab, len(vocab)

### Getting data batches

In [100]:
# TODO: this could become a method of DataProcessor (e.g. by making the class iterable).
def batch_generator(data_processor, batch_size):
    """Endlessly yield shuffled, zero-padded batches of word and position indices.

    The sentences are reshuffled at the start of every pass over the data.
    Only full batches are produced: the ``len(sentences) % batch_size``
    sentences left over at the end of a pass are skipped for that pass.

    Args:
        data_processor: object providing ``sentences`` (list of token lists),
            ``w2i`` (token -> index lookup) and ``max_sentence_length``.
        batch_size: number of sentences per batch.

    Yields:
        A pair of ``torch.LongTensor`` of shape
        ``(batch_size, max_sentence_length)``: the word indices and the
        position (0, 1, 2, ...) of every token in its sentence. Both are
        padded with 0 behind the end of the sentence.

    Raises:
        ValueError: on the first ``next()`` call, if ``batch_size`` is not
            positive or is larger than the number of sentences. Without this
            check no full batch exists and the loop would spin forever
            without ever yielding.
    """
    n_sentences = len(data_processor.sentences)
    if batch_size < 1:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))
    if n_sentences < batch_size:
        raise ValueError(
            'batch_size ({}) is larger than the number of sentences ({})'.format(
                batch_size, n_sentences))

    max_len = data_processor.max_sentence_length
    idx = np.arange(n_sentences)
    # Index right behind the last full batch
    full_batches_end = n_sentences - n_sentences % batch_size

    while True:
        shuffle(idx)

        for start in range(0, full_batches_end, batch_size):
            # Integer arrays from the start, so no float round trip is needed
            b_words = np.zeros((batch_size, max_len), dtype=np.int64)
            b_positions = np.zeros((batch_size, max_len), dtype=np.int64)

            for row, sent_idx in enumerate(idx[start:start + batch_size]):
                sent = data_processor.sentences[sent_idx]
                b_words[row, :len(sent)] = [data_processor.w2i[w] for w in sent]
                b_positions[row, :len(sent)] = np.arange(len(sent))

            yield torch.from_numpy(b_words), torch.from_numpy(b_positions)

## Sequence-to-Sequence RNNs

### Encoders

In [8]:
class Encoder(nn.Module):
    """Encodes a batch of sentences as sums of word and position embeddings.

    Args:
        data_processor: object providing ``vocab_size`` (number of rows of the
            word embedding table) and ``max_sentence_length`` (number of rows
            of the position embedding table).
        embeddings_dim: size of both the word and the position embeddings.
    """

    def __init__(self, data_processor, embeddings_dim=200):
        super().__init__()
        self.data_processor = data_processor

        n_words = data_processor.vocab_size
        n_positions = data_processor.max_sentence_length
        self.word_embeddings = nn.Embedding(n_words, embeddings_dim)
        self.pos_embeddings = nn.Embedding(n_positions, embeddings_dim)

    def forward(self, word_batch, pos_batch):
        """Embed the word and position indices and combine them.

        Args:
            word_batch: tensor of word indices.
            pos_batch: tensor of position indices, same shape as ``word_batch``.

        Returns:
            A pair ``(full_emb, mean_emb)``: the element-wise sum of the word
            and position embeddings, and the mean of that sum over dimension 1
            (assumes inputs are (batch, sentence length), so this would be
            the mean over the sentence; confirm against the caller).
        """
        full_emb = self.word_embeddings(word_batch) + self.pos_embeddings(pos_batch)
        return full_emb, full_emb.mean(dim=1)

### Decoders

In [9]:
class Decoder(nn.Module):
    """Single-step GRU decoder over the target vocabulary.

    NOTE(review): attention over the encoder outputs is not implemented yet
    (the original draft only had it commented out), so ``encoded`` is
    currently ignored and the GRU is driven by the previous-word embeddings
    alone. Confirm the intended design before relying on this.

    Args:
        data_processor: object providing ``vocab_size`` and ``w2i`` for the
            target language.
        hidden_dim: size of the GRU hidden state.
        embeddings_dim: size of the target word embeddings (the GRU input).
    """

    def __init__(self, data_processor, hidden_dim=200, embeddings_dim=200):
        super().__init__()
        self.data_processor = data_processor
        vocab_size = self.data_processor.vocab_size
        self.target_embeddings = nn.Embedding(vocab_size, embeddings_dim)

        # TODO: attention layer over the encoder outputs
        # A GRU (unlike an LSTM) has no cell state: it returns (output, h_n)
        self.gru = nn.GRU(embeddings_dim, hidden_dim)
        # Projects the GRU output onto the target vocabulary
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, encoded, hidden, batch, return_log_probs=False):
        """Run one decoding step.

        Args:
            encoded: encoder outputs; unused until attention is implemented.
            hidden: previous hidden state, ``(batch, hidden_dim)`` or
                ``(1, batch, hidden_dim)``.
            batch: embeddings of the previous target words,
                ``(batch, embeddings_dim)`` or ``(1, batch, embeddings_dim)``.
            return_log_probs: when True, also return the log-probabilities
                over the target vocabulary (needed to compute a loss).

        Returns:
            ``(emb, hidden)``: the embeddings of the most likely next words,
            ``(batch, embeddings_dim)``, and the new hidden state,
            ``(1, batch, hidden_dim)``. With ``return_log_probs=True`` the
            ``(batch, vocab_size)`` log-probabilities are appended as a third
            element.
        """
        # nn.GRU (batch_first=False) expects a leading sequence / layer axis
        if batch.dim() == 2:
            batch = batch.unsqueeze(0)
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(0)

        output, hidden = self.gru(batch, hidden)
        log_probs = self.softmax(self.out(output[0]))
        # Embeddings are looked up by word index, not by probabilities
        emb = self.target_embeddings(log_probs.argmax(dim=1))
        if return_log_probs:
            return emb, hidden, log_probs
        return emb, hidden

    def initword(self):
        """Return the embedding (a 1-D tensor) of the START token."""
        # nn.Embedding needs an index tensor, not a plain Python int
        start_index = torch.tensor(
            self.data_processor.w2i[START],
            dtype=torch.long,
            device=self.target_embeddings.weight.device,
        )
        return self.target_embeddings(start_index)

# Running the Code

## Define Hyper Parameters

#### files

In [88]:
# Raw parallel corpora, one dict per split, mapping language code -> file path.
# These are the inputs of `preprocess`.
train_data = {'en': 'data/train/train.en', 'fr': 'data/train/train.fr'}
val_data = {'en': 'data/val/val.en', 'fr': 'data/val/val.fr'}
test_data = {'en': 'data/test/test_2017_flickr.en', 'fr': 'data/test/test_2017_flickr.fr'}
# Tokenized, lowercased and BPE-encoded versions of the files above.
# French is the source side and English the target side.
source_train_file = 'data/train/train_tokenized_lowercased_bpe.fr'
target_train_file = 'data/train/train_tokenized_lowercased_bpe.en'
source_val_file = 'data/val/val_tokenized_lowercased_bpe.fr'
target_val_file = 'data/val/val_tokenized_lowercased_bpe.en'
source_test_file = 'data/test/test_2017_flickr_tokenized_lowercased_bpe.fr'
target_test_file = 'data/test/test_2017_flickr_tokenized_lowercased_bpe.en'

#### Network Parameters

In [89]:
# Optimisation settings
learning_rate = 1e-3
epochs = 1
batch_size = 16

# Upper bounds on the number of distinct words kept per language. DataProcessor
# keeps at most this many of the most frequent words, adds the UNK token, and
# then overwrites its own vocab_size with the real vocabulary length.
source_vocab_size = 30000
target_vocab_size = 30000
hidden_dims = 256
source_embedding_dims = 256
target_embedding_dims = 256

## process data

perform the preprocessing

In [None]:
# Only prints the tokenize / lowercase / BPE shell commands; run them manually.
# NOTE(review): source_vocab_size is passed as the number of BPE symbols here;
# confirm that this is the intended value for the BPE step.
preprocess(train_data, val_data, test_data, source_vocab_size)

process data

In [91]:
# Load the BPE-encoded training files and build one vocabulary per language.
source_processor = DataProcessor(source_train_file,source_vocab_size)
target_processor = DataProcessor(target_train_file,target_vocab_size)

## setup the Network

In [13]:
# Encoder over the source language, decoder over the target language.
enc = Encoder(source_processor,embeddings_dim=source_embedding_dims)
dec = Decoder(target_processor,hidden_dim=hidden_dims,embeddings_dim=target_embedding_dims)
# A single Adam optimizer updates the parameters of both modules.
params = list(enc.parameters()) + list(dec.parameters())
opt = Adam(params, lr=learning_rate)

## training

In [101]:
# Endless generator of shuffled (word indices, position indices) batches
# drawn from the source-language training data.
gen = batch_generator(source_processor, batch_size)

# NOTE(review): unfinished training loop. Each epoch encodes one single batch,
# the encoder output is discarded, and there is no decoder pass, loss,
# backward() or opt.step() yet.
for epoch in range(epochs):
    opt.zero_grad()
    word_input_batch, pos_input_batch = next(gen)
    enc(word_input_batch, pos_input_batch)
    

(tensor([[   2,    1,   61,  ...,    0,    0,    0],
        [   2,   41,   57,  ...,    0,    0,    0],
        [   2,    1,   70,  ...,    0,    0,    0],
        ...,
        [   2,   32,   71,  ...,    0,    0,    0],
        [   2,   41,   57,  ...,    0,    0,    0],
        [   2,   41,   64,  ...,    0,    0,    0]]), tensor([[  0,   1,   2,  ...,   0,   0,   0],
        [  0,   1,   2,  ...,   0,   0,   0],
        [  0,   1,   2,  ...,   0,   0,   0],
        ...,
        [  0,   1,   2,  ...,   0,   0,   0],
        [  0,   1,   2,  ...,   0,   0,   0],
        [  0,   1,   2,  ...,   0,   0,   0]]))


## Evaluation

In [None]:
# Before evaluating, undo the BPE segmentation of the translated output, e.g. with:
# sed -r 's/(@@ )|(@@ ?$)//g' 