In [1]:
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
import random
import numpy as np
from tqdm import tqdm
import unicodedata
import string
import re

In [2]:
inFile = open("deu-eng/deu.txt", "r")

In [3]:
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


In [4]:
#From http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


In [5]:
def processData(filename):
    data = []
    inFile = open(filename,"r")
    for line in tqdm(inFile):
        for element in line.split('\n'):
            for e in element.split('\t'):
                if e != '':
                    e = unicodeToAscii(e)
                    e = normalizeString(e)
                    data.append(e)
    return(data)

def createPairs(data: list, lang1 : str, lang2: str):
    pairs = []
    Xs = []
    Ys = []
    in_lang = Lang(lang1)
    out_lang = Lang(lang2)
    for pair in range(len(data)//2):
        in_lang.addSentence(data[2*pair])
        out_lang.addSentence(data[2*pair+1])
        Xs.append(data[2*pair])
        Ys.append(data[2*pair+1])
        pairs.append(np.array([data[2*pair], data[2*pair+1]]))
    
    return pairs, Xs, Ys, in_lang, out_lang

def preprocessing(filename, lang1="English", lang2="German"):
    data = processData(filename)
    return createPairs(data, lang1, lang2)

In [6]:
dataPairs, Xs, Ys, in_lang, out_lang = preprocessing("deu-eng/deu.txt")

159204it [00:13, 12073.79it/s]


In [7]:
max_len = len(max(Xs, key=len))
print(max_len)

286


## Helper Functions

In [8]:
#From http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


In [9]:
#OUTPUT SIZE IS A MAGIC NUM HERE. IF IT CHANGES, MUST CHANGE HERE

def indexesFromSentence(lang, sentence):
#     while len(sentence) < 31122:
#         if sentence[-1] == ' ':
#             sentence += '*'
#         elif sentence[-1] == '*':
#             sentence += ' '
    return [lang.word2index[word] for word in sentence.split(' ')]

def variableFromIndexes(lang, sentence):
    idxes = indexesFromSentence(lang, sentence)
    idxes.append(EOS_token)
    return Variable(torch.LongTensor(idxes).view(-1,1))

def variablesFromPairs(pair):
    var1 = variableFromIndexes(in_lang, pair[0])
    var2 = variableFromIndexes(out_lang, pair[1])
    return var1, var2

In [10]:
#From http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

# Neural Network Setup

In [11]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, inp, hidden):
        embed = self.embedding(inp).view(1,1,-1) #reshaping input
        output, next_hidden = self.gru(embed, hidden)
        return output, next_hidden
    
    def initHidden(self):
        return Variable(torch.zeros(1,1, self.hidden_size))

In [12]:
class SimpleDecoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(SimpleDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.relu = nn.RELU()
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax()
    
    def forward(self, inp, hidden):
        embed = self.relu(self.embedding(inp))
        output, next_hidden = self.gru(embed, hidden)
        output = self.softmax(self.out(output[0]))
        return output, next_hidden
    
    def initHidden(self):
        return Variable(torch.zeros(1,1, self.hidden_size))

In [13]:
class AttentionDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, max_len=max_len, drop_rate=0.1):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.drop_rate = drop_rate
        self.max_len = max_len
        
#         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.linear = nn.Linear(self.output_size, hidden_size)
        self.dropper = nn.Dropout(drop_rate)
        self.embed_attention = nn.Linear(hidden_size*2, max_len)
        self.combine_attention = nn.Linear(hidden_size*2, max_len)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
    
    def forward(self, inp, hidden, encoder_output):
        print(self.output_size)
        embed = self.dropper(self.linear(encoder_output).view(1,1,-1))
        attention_weights = F.softmax(self.embed_attention(torch.cat((embed[0], hidden[0]), 1)), dim=1)
        weighted = torch.bmm(attention_weights.unsqueeze(0), encoder_output.unsqueeze(0))
        
        embed_attention = torch.cat((embed[0], weighted[0]), 1)
        attention_combined = F.relu(self.combine_attention(embed_attention).unsqueeze(0))
        
        output, new_hidden = self.gru(attention_combined, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, new_hidden, attention_weights
    
    def initHidden(self):
        return Variable(torch.zeros(1,1, self.hidden_size))
    

## Training

In [14]:

def train(inp_var, out_var, encoder, decoder, enc_optim, dec_optim, criterion, max_len=max_len, teacher_forcing=True):
    encoder_hidden = encoder.initHidden()
    
    enc_optim.zero_grad();
    dec_optim.zero_grad();
    input_length = inp_var.size()[0]
    target_length = out_var.size()[0]

    encoder_outputs = Variable(torch.zeros(max_len, encoder.hidden_size))
    
    loss = 0
    
    for inpE in range(input_length):
        encoder_output, encoder_hidden = encoder(inp_var[inpE], encoder_hidden)
        encoder_outputs[inpE] = encoder_output[0][0]
    
    decoder_input = Variable(torch.LongTensor([SOS_token]))
    decoder_hidden = encoder_hidden
    
    if teacher_forcing:
        for inpD in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, out_var[inpD])
            decoder_input += out_var[inpD]
    else:
        for inpD in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            top_val, idx = decoder_output.data.topk(1)
            idx = idx[0][0]
            
            loss += criterion(decoder_output, out_var[inpD])
            decoder_input = Variable(torch.LongTensor([[idx]]))
            
            if idx == EOS_token:
                break
    
    loss.backward() #backprop
    enc_optim.step()#add gradients to weights
    dec_optim.step()
    
    return loss.data[0] / target_length
        
            
        


In [15]:
def trainer(encoder, decoder, num_iters, print_time=1000, learning_rate=0.01):
    timer = time.time()
    enc_optim = torch.optim.SGD(encoder.parameters(),lr=learning_rate)
    dec_optim = torch.optim.SGD(decoder.parameters(),lr=learning_rate)
    criterion = nn.NLLLoss()
    print_loss_total = 0
    
    pairs = [variablesFromPairs(random.choice(dataPairs)) for n in range(num_iters)]
    
    for trial in tqdm(range(num_iters)):
        pair = pairs[trial]
        print(pair[0].size()[0])
        inp = pair[0]
        out = pair[1]
        
        loss = train(inp, out, encoder, decoder, enc_optim, dec_optim, max_len)
        
        print_loss_total += loss
        
        if (trial + 1) % print_time == 0:
            loss_avg = print_loss_total / print_time
            print_loss_total = 0
            print(loss_avg)
            print('%s (%d %d%%) %.4f' % (timeSince(start, (trial + 1) / num_iters), trial + 1, (trial + 1) / n_iters * 100,loss_avg))
        
    
    

In [16]:
def evaluate(encoder, decoder, sentence, max_len=max_len):
    inp = variableFromSentence(in_lang, sentence)
    inSize = inp.size()[0]
    encoder_hidden = encoder.initHidden()
    
    enc_outputs = Variable(torch.zeros(max_len, encoder_hidden))
    
    for i in range(inSize):
        encoder_output, encoder_hidden = encoder(inp[i],
                                                 encoder_hidden)
        encoder_outputs[i] = encoder_outputs[i] + encoder_output[0][0]
    
    decoder_input = Variable(torch.LongTensor([SOS_token]))
    decoder_hidden = encoder_hidden
    decoded_words = []
    decoder_attention = torch.zeros(max_len, max_len)
    
    for di in range(max_len):
        decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
        top_val, idx = decoder_output.data.topk(1)
        decoder_attention[di] = decoder.attention.data
        idx = idx[0][0]
            
        loss += criterion(decoder_output, out_var[inpD])
        decoder_input = Variable(torch.LongTensor([[idx]]))

        if idx == EOS_token:
            decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(out_lang.index2word[idx])
        
        decoder_input = Variable(torch.LongTensor(idx))
        
    
    return decoded_words, decoder_attentions[:di + 1]

    
    

In [17]:
def evalRandomly(encoder, decoder, lang1_name="English", lang2_name="German", num_iters=10):
    for i in num_iters:
        pair = random.choice(pairs)
        decoded_words, attns = evaluate(encoder, decoder, variableFromSentence(pair[0]))
        print(lang1_name + ": " + pair[0])
        print(lang2_name + ": " + pair[1])
        out_sentence = "".join(decoded_words)
        print("Prediction: " + out_sentence)

In [None]:
hidden_size = 256
encoder1 = Encoder(in_lang.n_words, hidden_size)
attn_decoder1 = AttentionDecoder(hidden_size, out_lang.n_words, drop_rate=0.1)


# if use_cuda:
#     encoder1 = encoder1.cuda()
#     attn_decoder1 = attn_decoder1.cuda()

trainer(encoder1, attn_decoder1, 75000, print_time=5000)
