## Task 4

Apply seq2seq model (you can modify the code from this tutorial: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) to compute grapheme to phoneme conversion for English. Train the model on dev_cmu_dict.txt and test it on test_cmu_dict.txt. Report accuracy of your solution using two metrics:
* exact match (how many words are perfectly converted to phonemes)
* exact match without stress (how many words are perfectly converted to phonemes when we remove the information about stress)


In [1]:
from __future__ import unicode_literals, print_function, division

import itertools
import random
from io import open

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader
from tqdm import tqdm

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
teacher_forcing_ratio = 0.9

In [279]:
class CmuDataset(torch.torch.utils.data.Dataset):
    def __init__(self, path, device):
        self.data = open(path).readlines()
        self.device = device
        self.graphems = []
        self.phonems = []

        for line in self.data:
            splitted = line.split()
            self.graphems.append(splitted[0])
            self.phonems.append(splitted[1:])

        self.unique_phonems = list(set(list(itertools.chain.from_iterable(self.phonems)))) + ["EOS", "SOS"]
        self.unique_graphems = list(set(''.join(self.graphems))) + ["EOS", "SOS"]

    def fit_encoders(self):
        graphems_le = preprocessing.LabelEncoder()
        phonems_le = preprocessing.LabelEncoder()

        graphems_le.fit(self.unique_graphems)
        phonems_le.fit(self.unique_phonems)

        return graphems_le, phonems_le

    def transform_data(self, graphems_le, phonems_le, max_len):
        # self.encoded_phonems = [torch.tensor(np.array(phonems_le.transform(p + (max_len - len(p)) * ["EOS"])), device = self.device) for p in self.phonems]
        # self.encoded_graphems = [torch.tensor(np.array(graphems_le.transform(list(g) + (max_len - len(g)) * ["EOS"])), device = self.device) for g in self.graphems]
        self.encoded_phonems = [torch.tensor(np.array(phonems_le.transform(p)), device = self.device) for p in self.phonems]
        self.encoded_graphems = [torch.tensor(np.array(graphems_le.transform(list(g))), device = self.device) for g in self.graphems]
    def __len__(self):
        return len(self.phonems)

    def __getitem__(self, index):
        return self.encoded_graphems[index], self.encoded_phonems[index],

In [282]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [283]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=35):

    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor(np.array([phonems_le.transform(["SOS"])]), device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, torch.tensor([target_tensor[di]], device=device))
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, torch.tensor([target_tensor[di]], device=device))
            if decoder_input.item() == phonems_le.transform(["EOS"])[0]:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [284]:
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):

    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()

    for iter in tqdm(range(n_iters)):
        for idx, (x, y) in enumerate(dataloader):
            loss = train(x[0], y[0], encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(print_loss_avg)

In [None]:
def evaluate(encoder, decoder, word, max_len):
    with torch.no_grad():
        input_tensor = torch.tensor(np.array(graphems_le.transform(list(word))), device=device)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_len, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor(np.array([phonems_le.transform(["SOS"])]), device=device)

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)

            topv, topi = decoder_output.data.topk(1)
            if topi.item() == phonems_le.transform(["EOS"])[0]:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(phonems_le.inverse_transform([topi.item()]))

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [None]:
dataset = CmuDataset('/home/maria/Documents/NLP/data/assignment_5/cmu_dict_dev.txt', device)
graphems_le, phonems_le = dataset.fit_encoders()
dataset.transform_data(graphems_le, phonems_le, 35)
# unusable with batch_size > 1
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [285]:
encoder = EncoderRNN(len(dataset.unique_graphems) + 2, 100).to(device)
decoder = DecoderRNN(100, len(dataset.unique_phonems) + 2).to(device)

In [293]:
trainIters(encoder, decoder, 10, print_every=1, learning_rate=0.01)

 10%|█         | 1/10 [20:47<3:07:08, 1247.64s/it]

55858.91606174967


 20%|██        | 2/10 [40:35<2:41:41, 1212.73s/it]

55171.92731910668


 30%|███       | 3/10 [1:00:11<2:19:28, 1195.54s/it]

54707.34301628322


 40%|████      | 4/10 [1:19:48<1:58:50, 1188.39s/it]

53871.26754682609


 50%|█████     | 5/10 [1:39:20<1:38:31, 1182.40s/it]

53558.53957759402


 60%|██████    | 6/10 [1:59:01<1:18:48, 1182.19s/it]

53375.99866441684


 70%|███████   | 7/10 [2:18:26<58:49, 1176.41s/it]  

53073.0955572138


 80%|████████  | 8/10 [2:37:52<39:05, 1172.99s/it]

53044.76444049645


 90%|█████████ | 9/10 [2:57:23<19:32, 1172.34s/it]

53014.17376639507


 90%|█████████ | 9/10 [3:01:35<20:10, 1210.58s/it]


KeyboardInterrupt: 

In [295]:
evaluate(encoder, decoder, "forwarded", 30)

[array(['F'], dtype='<U3'),
 array(['AO1'], dtype='<U3'),
 array(['R'], dtype='<U3'),
 array(['W'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['D'], dtype='<U3'),
 array(['AH0'], dtype='<U3'),
 array(['D'], dtype='<U3'),
 array(['AH0'], dtype='<U3'),
 array(['N'], dtype='<U3'),
 array(['S'], dtype='<U3'),
 array(['IH0'], dtype='<U3'),
 array(['T'], dtype='<U3'),
 array(['S'], dtype='<U3'),
 array(['IH0'], dtype='<U3'),
 array(['R'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['ER0'], dtype='<U3'),
 array(['AH0'], dtype='<U3'),
 array(['R'], dtype='<U3'),
 array(['AH0'], dtype='<U3'),
 array(['S'], dtype='<U3')]

In [296]:
torch.save(encoder.state_dict(), '/home/maria/Documents/NLP/data/assignment_5/encoder.model')
torch.save(decoder.state_dict(), '/home/maria/Documents/NLP/data/assignment_5/decoder.model')