In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math

In [0]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [0]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zа-яА-ЯӨҮөүёЁ.!?]+", r" ", s)
    return s

In [0]:
import pandas as pd
import numpy as np

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")
    #lines = pd.read_excel('finaleng-mon.xlsx')
    lines = pd.read_excel('/content/drive/My Drive/UIC/IDS576/Project/finaleng-mon.xlsx')
    pairs = []
    for i, l in lines.iterrows():
        pairs.append([normalizeString(l[0]), normalizeString(l[1])])
    
    lang1 = "eng"
    lang2 = "mon"

    # Split every line into pairs and normalize
    #pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
MAX_LENGTH = 8

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH #and len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [0]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


In [11]:
input_lang, output_lang, pairs = prepareData('eng', 'mon', False)
print(random.choice(pairs))

Reading lines...
Read 1040 sentence pairs
Trimmed to 1038 sentence pairs
Counting words...
Counted words:
eng 1018
mon 1410
['you re extremely ingenious .', 'чи үнэхээр их мэргэн ухаантай юм .']


In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        return outputs, hidden
      

In [0]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))
    def forward(self, hidden, encoder_outputs):
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2))) 
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return F.softmax(attention, dim=1)

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        output = self.out(torch.cat((output, weighted, embedded), dim=1))
        
        return output, hidden.squeeze(0)

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
    
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        output = trg[0,:]
        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs

In [0]:
INPUT_DIM = input_lang.n_words
OUTPUT_DIM = output_lang.n_words
ENC_EMB_DIM = 128 #256
DEC_EMB_DIM = 128 #256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [0]:
optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.CrossEntropyLoss()

In [0]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [0]:
def train(model, n_iters, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    
    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        src = training_pair[0]
        trg = training_pair[1]

        optimizer.zero_grad()
        output = model(src, trg)
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / n_iters
  


In [0]:
def evaluate(model, n_iters, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
        
        for iter in range(1, n_iters + 1):
            training_pair = training_pairs[iter - 1]
            src = training_pair[0]
            trg = training_pair[1]
            
            output = model(src, trg, 0) #turn off teacher forcing
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / n_iters

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [53]:
import time
import math
N_EPOCHS = 10
CLIP = 1
SAVE_DIR = 'models'
  
MODEL_SAVE_PATH = '/content/drive/My Drive/UIC/IDS576/Project/engmon_model.pt'
#MODEL_SAVE_PATH = 'engmon_model.pt'

best_valid_loss = float('inf')
n_iters = min(5000, len(pairs))

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, n_iters, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, n_iters, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 22s
	Train Loss: 4.440 | Train PPL:  84.742
	 Val. Loss: 4.032 |  Val. PPL:  56.396
Epoch: 02 | Time: 0m 23s
	Train Loss: 3.186 | Train PPL:  24.195
	 Val. Loss: 3.335 |  Val. PPL:  28.088
Epoch: 03 | Time: 0m 22s
	Train Loss: 2.336 | Train PPL:  10.338
	 Val. Loss: 2.651 |  Val. PPL:  14.172
Epoch: 04 | Time: 0m 22s
	Train Loss: 1.778 | Train PPL:   5.917
	 Val. Loss: 2.061 |  Val. PPL:   7.855
Epoch: 05 | Time: 0m 23s
	Train Loss: 1.365 | Train PPL:   3.915
	 Val. Loss: 1.626 |  Val. PPL:   5.085
Epoch: 06 | Time: 0m 22s
	Train Loss: 1.111 | Train PPL:   3.036
	 Val. Loss: 1.185 |  Val. PPL:   3.269
Epoch: 07 | Time: 0m 22s
	Train Loss: 0.817 | Train PPL:   2.265
	 Val. Loss: 1.141 |  Val. PPL:   3.131
Epoch: 08 | Time: 0m 22s
	Train Loss: 0.707 | Train PPL:   2.028
	 Val. Loss: 1.017 |  Val. PPL:   2.765
Epoch: 09 | Time: 0m 23s
	Train Loss: 0.573 | Train PPL:   1.774
	 Val. Loss: 0.772 |  Val. PPL:   2.165
Epoch: 10 | Time: 0m 23s
	Train Loss: 0.512 | Train PPL

In [0]:
def translate(model, pair, max_length=MAX_LENGTH):
    
    sentence_pair = tensorsFromPair(pair)
    
    src = sentence_pair[0]
    trg = sentence_pair[1]
    #print(src)
    output = model(src, trg, 0)
    output = output[1:].view(-1, output.shape[-1])
    
    out_pred = output.data.topk(1)[1]

    translation = [output_lang.index2word[word.item()] for word in out_pred]
    
    
    return translation

In [0]:
def evaluateRandomly(model, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = translate(model, pair)
        
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [61]:
evaluateRandomly(model)

> we re just hanging out .
= бид зүгээр л уулзаж байна .
< зүгээр л уулзаж байна . EOS

> we re quieter than tom .
= бид томыг бодвол чимээгүй байна .
< томыг бодвол чимээгүй байна . EOS

> we re friends of tom s .
= бид томын найзууд .
< томын найзууд . EOS

> we re going to eat now .
= бид одоо идэх гэж байна .
< одоо идэх гэж байна . EOS

> he is rather hard to please .
= тэр үнэндээ цамаан .
< үнэндээ цамаан . EOS

> you re under my protection .
= чи миний хамгаалалт доор байна .
< миний хамгаалалт доор байна . EOS

> she s the most beautiful woman .
= тэр бол хамгийн үзэсгэлэнтэй эмэгтэй .
< бол хамгийн үзэсгэлэнтэй . EOS EOS

> they are not at all interested .
= тэд бүгдийг нь сонирхохгүй байна .
< бүгдийг нь сонирхохгүй . EOS EOS

> i m getting married next month .
= би дараа сард гэрлэнэ .
< дараа сард гэрлэнэ . EOS

> i m not making any promises .
= би ямар нэг амлалт өгөхгүй .
< ямар нэг амлалт өгөхгүй . EOS



In [62]:
translate(model, ['you re working hard .', 'чи шаргуу ажиллаж байна . '])

['одоо', 'шаргуу', 'ажиллаж', 'байна', '.', 'EOS']