In [1]:
from preprocess import Preprocess
import parameters as PRM
import utils
import pandas as pd
import random
import torch
from torch import nn
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [2]:
# Build the project preprocessing helper and populate it for this notebook.
p = Preprocess()
# NOTE(review): presumably loads 30 aligned sentence pairs into `p.sentences`
# -- confirm against preprocess.py. 30 keeps the corpus tiny for quick runs.
p.load_wiki(nb_pair_sentences=30)
# NOTE(review): presumably loads fastText embeddings -- confirm in preprocess.py.
p.load_fasttext()
# Show the PRM.TARGET column of the loaded sentences as a sanity check.
print(p.sentences[PRM.TARGET])

0     he was chairman of the board of governors at t...
1     his was honoured with the naming of the anthon...
2     director d  w  griffith was central to the dev...
3          some markets lacked sound equipped theaters 
4     one reason this was possible is that  with so ...
5     it was expected that a great deal of money was...
6                                                   ed 
7     each prefix has a unique symbol that is prepen...
8              the binary multiple  $     is close to  
9     their adoption in popular publications remains...
10    however  although some of these are repeated o...
11    this is a list of company names with their nam...
12      $ th century fox   film studio  formed in  $...
13     abn amro   in the  $ s  the nederlandsche han...
14    before  $  january  $   the company was called...
15     alza   from the name of the founder alex zaff...
16     amazon com   founder jeff bezos renamed the c...
17    the name is a contraction of alan michael 

In [3]:
class Lang:
    """Vocabulary of one language: word <-> index lookups and word counts.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers, so the
    first word that gets registered receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {}  # word -> integer index
        self.word2count = {}  # word -> number of times the word was added
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        # NOTE: split(' ') turns consecutive or trailing spaces into
        # empty-string tokens, which are registered like any other word.
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # First occurrence: assign the next free index.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [9]:
def prepareData():
    """Build both vocabularies and the list of sentence pairs.

    Reads every row of the notebook-level ``p.sentences``. The PRM.TARGET
    column feeds the 'en' vocabulary and the PRM.SOURCE column feeds the
    'de' vocabulary.

    Returns:
        (input_lang, output_lang, pairs) where ``pairs`` is a list of
        ``[english_sentence, german_sentence]`` lists in row order.
    """
    input_lang, output_lang = Lang('en'), Lang('de')
    sentence_table = p.sentences

    pairs = []
    for row_idx in range(len(sentence_table)):
        row = sentence_table.iloc[row_idx]
        english_sentence, german_sentence = row[PRM.TARGET], row[PRM.SOURCE]

        # Grow each vocabulary with its side of the pair.
        output_lang.addSentence(german_sentence)
        input_lang.addSentence(english_sentence)

        pairs.append([english_sentence, german_sentence])

    return input_lang, output_lang, pairs
# Build the vocabularies and pairs, then print a short summary of the corpus.
input_lang, output_lang, pairs = prepareData()

# Each summary section is a header, a value and a blank separator line.
print('Number of sentences pairs in the corpus:', len(p.sentences), '', sep='\n')
print('Random sentence pair in the corpus:', random.choice(pairs), '', sep='\n')

print('Number of words for each langage in the corpus:')
print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)

Number of sentences pairs in the corpus:
30

Random sentence pair in the corpus:
['the partners wanted to use the name big tex  but were unsuccessful in negotiating with the akron businessman who was already using the name ', 'ihre partner wollten den namen  big tex  verwenden  waren aber nicht erfolgreich in der verhandlung mit einem akroner geschäftsmann  der den namen bereits benutzte ']

Number of words for each langage in the corpus:
en 272
de 275


In [20]:
class EncoderRNN(nn.Module):
    """GRU encoder that is fed one token index per call.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: width of both the embeddings and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Embedding width equals the GRU width, so embeddings feed the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, run one GRU step and return ``(output, hidden)``."""
        # Reshape to (1, 1, hidden_size) before feeding the GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        encoded, next_hidden = self.gru(step_input, hidden)
        return encoded, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        # `device` is the notebook-level torch device.
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [21]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits log-probabilities over the output vocabulary.

    Args:
        hidden_size: width of both the embeddings and the GRU hidden state.
        output_size: number of rows in the embedding table and number of
            classes produced by the output projection.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: tensor holding one token index; its embedding is reshaped
                to (1, 1, hidden_size), which is the GRU's input width.
            hidden: GRU hidden state passed in from the previous step.

        Returns:
            (output, hidden): ``output`` has shape (1, output_size) and holds
            log-probabilities; ``hidden`` is the updated GRU hidden state.
        """
        output = self.embedding(input).view(1, 1, -1)
        # torch.relu replaces F.relu: `F` (torch.nn.functional) is never
        # imported in this notebook, so the old call raised NameError.
        output = torch.relu(output)
        output, hidden = self.gru(output, hidden)
        # output[0] drops the leading length-1 dimension before projecting.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        # `device` is the notebook-level torch device.
        return torch.zeros(1, 1, self.hidden_size, device=device)