In [25]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils import data
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import json
from os import system
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

### Get data

In [30]:
def getData(mode):
    """Load one dataset split from ``./data/<mode>.json``.

    The JSON file must contain an iterable of records, each providing an
    ``'input'`` and a ``'target'`` key.

    Args:
        mode: Name of the split; must be ``'train'`` or ``'test'``.

    Returns:
        A tuple ``(inputs, labels)`` of two parallel lists, where
        ``inputs[i]`` is the ``'input'`` value and ``labels[i]`` the
        ``'target'`` value of the i-th record.
        NOTE(review): callers in this notebook iterate over each
        ``'input'`` value, so it is presumably a list of misspelled
        variants of the single ``'target'`` word -- confirm against the data.

    Raises:
        AssertionError: If ``mode`` is neither ``'train'`` nor ``'test'``.
        OSError: If the JSON file cannot be opened.
        KeyError: If a record lacks ``'input'`` or ``'target'``.
    """
    assert mode == 'train' or mode == 'test'
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open('./data/' + mode + '.json', 'r') as json_file:
        dataset = json.load(json_file)
    inputs = [record['input'] for record in dataset]
    labels = [record['target'] for record in dataset]
    return inputs, labels

### Build vocabulary

In [31]:
class Vocab():
    """Character-level vocabulary with four reserved special tokens.

    Indices 0-3 are reserved for ``'SOS'``, ``'EOS'``, ``'PAD'`` and
    ``'UNK'``; every distinct character seen while building the vocabulary
    gets the next free index. ``max_length`` tracks the longest word seen
    and is used as the padding width in ``word2indices``.
    """

    # Reserved indices; they mirror the initial word2index/index2word tables.
    SOS_INDEX = 0
    EOS_INDEX = 1
    PAD_INDEX = 2
    UNK_INDEX = 3

    def __init__(self, corpus=None):
        """Create the vocabulary.

        Args:
            corpus: Optional iterable of word groups (each group an iterable
                of words) to build the vocabulary from. When omitted, the
                first element returned by ``getData('train')`` is used,
                which is the original behaviour.
        """
        self.word2index = {'SOS': 0, 'EOS': 1, 'PAD': 2, 'UNK': 3}
        self.index2word = {0: 'SOS', 1: 'EOS', 2: 'PAD', 3: 'UNK'}
        self.n_words = 4
        self.max_length = 0
        if corpus is None:
            # Default: build from the training inputs only.
            # NOTE(review): target words are not scanned here, so a target
            # longer than every input would exceed max_length -- confirm
            # this cannot happen with the actual data.
            corpus = getData('train')[0]
        self.build_vocab(corpus)

    # input the training data and build vocabulary
    def build_vocab(self, corpus):
        """Register every character in ``corpus`` and update ``max_length``.

        Args:
            corpus: Iterable of word groups; each group is an iterable of
                words (strings).
        """
        for words in corpus:
            for word in words:
                if len(word) > self.max_length:
                    self.max_length = len(word)

                for char in word:
                    if char not in self.word2index:
                        self.word2index[char] = self.n_words
                        self.index2word[self.n_words] = char
                        self.n_words += 1

    # convert word to indices
    def word2indices(self, word, add_eos=False, add_sos=False):
        """Encode ``word`` as a padded array of character indices.

        Args:
            word: The string to encode. Unknown characters map to UNK.
            add_eos: Append the EOS index after the characters.
            add_sos: Prepend the SOS index before the characters.

        Returns:
            A 1-D integer ``numpy`` array. The characters (plus optional
            SOS/EOS) are followed by ``max_length - len(word)`` PAD indices;
            no padding is added when ``word`` is longer than ``max_length``.
        """
        indices = [self.word2index.get(char, self.UNK_INDEX) for char in word]

        if add_sos:
            indices.insert(0, self.SOS_INDEX)
        if add_eos:
            indices.append(self.EOS_INDEX)

        # Pad by the raw word length, so all words encoded with the same
        # flags (and no longer than max_length) end up the same length.
        indices.extend([self.PAD_INDEX] * (self.max_length - len(word)))
        # Explicit dtype keeps an empty result integer-typed.
        return np.array(indices, dtype=np.int64)

    # convert indices to word
    def indices2word(self, indices):
        """Decode a sequence of indices back into a string.

        SOS, EOS and PAD indices are dropped. An UNK index is kept and
        rendered as the literal text ``'UNK'``.

        Args:
            indices: Iterable of integer-like indices (Python ints, numpy
                scalars or 0-d tensors).

        Returns:
            The decoded string.

        Raises:
            KeyError: If an index above PAD is not in the vocabulary.
        """
        chars = []
        for idx in indices:
            # Normalise to a plain int so that 0-d tensors, which do not
            # hash like ints, can still be looked up in the dict.
            idx = int(idx)
            if idx > self.PAD_INDEX:
                chars.append(self.index2word[idx])
        return ''.join(chars)

In [32]:
# Round-trip sanity check: encode a word into padded character indices,
# then decode the indices back into text.
# Note: Vocab() loads the training split from disk while it is constructed.
v = Vocab()
t = "hello"
idx = v.word2indices(t)
print(idx)
# Decoding drops the PAD indices, so the original word should come back.
t = v.indices2word(idx)
print(t)

[20  6 12 12 17  2  2  2  2  2  2  2  2  2  2  2  2  2  2]
hello


### Data Loader

In [33]:
class SpellingLoader(data.Dataset):
    """Dataset of (input word, target word) pairs encoded as index tensors.

    Each record returned by ``getData(mode)`` is flattened so that every
    element of a record's input group is paired with that record's label.
    """

    def __init__(self, mode, vocab):
        """Load the split named ``mode`` and keep ``vocab`` for encoding.

        Args:
            mode: Split name forwarded to ``getData``.
            vocab: Object providing ``word2indices(word)``.
        """
        self.vocab = vocab
        self.mode = mode
        flat_inputs, flat_targets = self.convert_pair()
        self.inputs = flat_inputs
        self.targets = flat_targets

    def __len__(self):
        """Return the number of flattened (input, target) pairs."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the pair at ``index`` as two ``torch.LongTensor`` objects."""
        encode = self.vocab.word2indices
        source = torch.LongTensor(encode(self.inputs[index]))
        target = torch.LongTensor(encode(self.targets[index]))
        return source, target

    # convert (multi-input)+target into multi-(input+target) pair
    def convert_pair(self):
        """Flatten grouped inputs into parallel input/label lists.

        Returns:
            ``(inputs_list, labels_list)``: for every record, each element
            of its input group appears once in ``inputs_list`` with the
            record's label at the same position in ``labels_list``.
        """
        grouped_inputs, grouped_labels = getData(self.mode)
        flat_pairs = [
            (variant, label)
            for variants, label in zip(grouped_inputs, grouped_labels)
            for variant in variants
        ]
        inputs_list = [variant for variant, _ in flat_pairs]
        labels_list = [label for _, label in flat_pairs]
        return inputs_list, labels_list

In [34]:
# Build a single vocabulary and share it between both splits, so that
# train and test words are encoded with the same character indices.
vocab = Vocab()
trainset = SpellingLoader('train', vocab)
testset = SpellingLoader('test', vocab)

In [35]:
# Inspect one sample: a pair of (input, target) index tensors.
trainset[16]

(tensor([ 6, 14, 15, 21, 17, 17, 19,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          2]),
 tensor([ 6,  4, 14, 15, 21, 17, 17, 19,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          2]))

### Encoder

In [36]:
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder over embedded token indices.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Size of both the embeddings and the LSTM hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run the LSTM over one time step of token indices.

        Args:
            input: Integer tensor of token indices. After embedding it is
                reshaped to ``(1, -1, hidden_size)``, i.e. every index is
                treated as one batch entry of a single time step.
            hidden: ``(h, c)`` tuple as accepted by ``nn.LSTM``.

        Returns:
            ``(output, hidden)`` exactly as returned by ``nn.LSTM``.
        """
        embedded = self.embedding(input).view(1, -1, self.hidden_size)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self, batch_size=64):
        """Return an all-zero ``(h0, c0)`` state for ``batch_size`` sequences.

        The tensors are allocated with the same device and dtype as the
        module's own parameters, so the method does not rely on a
        notebook-level ``device`` global being defined.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple of two zero tensors of shape ``(1, batch_size, hidden_size)``.
        """
        reference = self.embedding.weight
        return (reference.new_zeros(1, batch_size, self.hidden_size),
                reference.new_zeros(1, batch_size, self.hidden_size))

### Decoder

In [37]:
class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits unnormalised token scores.

    Args:
        hidden_size: Size of both the embeddings and the LSTM hidden state.
        output_size: Number of output classes (also the embedding table size).
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # NOTE(review): kept for backward compatibility; forward() does not
        # apply it, so the returned scores are raw logits.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one time step.

        Args:
            input: Integer tensor of token indices. After embedding it is
                reshaped to ``(1, -1, hidden_size)``, i.e. every index is
                treated as one batch entry of a single time step.
            hidden: ``(h, c)`` tuple as accepted by ``nn.LSTM``.

        Returns:
            ``(output, hidden)`` where ``output`` is the linear projection
            of the single time step's LSTM output (one row of
            ``output_size`` scores per batch entry, no softmax applied) and
            ``hidden`` is the updated LSTM state.
        """
        embedded = self.embedding(input).view(1, -1, self.hidden_size)
        activated = F.relu(embedded)
        output, hidden = self.lstm(activated, hidden)
        # output[0] selects the only time step before projecting.
        output = self.out(output[0])
        return output, hidden

    def initHidden(self, batch_size=64):
        """Return an all-zero ``(h0, c0)`` state for ``batch_size`` sequences.

        The tensors are allocated with the same device and dtype as the
        module's own parameters, so the method does not rely on a
        notebook-level ``device`` global being defined.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple of two zero tensors of shape ``(1, batch_size, hidden_size)``.
        """
        reference = self.embedding.weight
        return (reference.new_zeros(1, batch_size, self.hidden_size),
                reference.new_zeros(1, batch_size, self.hidden_size))