In [1]:
import spacy
import torch
import torch.nn as nn
from torchtext.data import Field, BucketIterator, TabularDataset
import pandas as pd
import string
from torchtext.data.metrics import bleu_score
from torch.utils.tensorboard import SummaryWriter
import random
torch.__version__

'1.6.0'

In [2]:
embedding = nn.Embedding(1, 1)
for x in embedding.parameters():
    x.requires_grad = False
for x in embedding.parameters():
    print(x)

Parameter containing:
tensor([[-1.6169]])


In [3]:
def extract_data():
    Eng_train = open('11_ds_2/train.en',encoding='utf8').read().strip().split('\n')
    Tel_train = open('11_ds_2/train.te',encoding='utf8').read().strip().split('\n')
    Eng_validation = open('11_ds_2/dev.en',encoding='utf8').read().strip().split('\n')
    Tel_validation = open('11_ds_2/dev.te',encoding='utf8').read().strip().split('\n')
    #Its stupid but this is the only way Tabular dataset allows construction of data :/
    Training_df = pd.DataFrame({'English':[text for text in Eng_train],'Telugu':[text for text in Tel_train]},columns=['English','Telugu'])
    Validation_df = pd.DataFrame({'English':[text for text in Eng_validation],'Telugu':[text for text in Tel_validation]},columns=['English','Telugu'])
    Training_df.to_csv('train.csv',index=False)
    Validation_df.to_csv('validation.csv',index=False)

In [4]:
extract_data()

In [5]:
tok_eng = spacy.load("en")
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
def tokenize_eng(text):
    return [tok.text.lower() for tok in tok_eng.tokenizer(remove_punctuation(text))]
def tokenize_telugu(text):
    text = remove_punctuation(text)
    return text.strip().split(' ')

In [15]:
english = Field(sequential=True,use_vocab=True,tokenize=tokenize_eng,lower=True, init_token="<sos>", eos_token="<eos>")
telugu = Field(sequential=True,use_vocab=True,tokenize=tokenize_telugu, init_token="<sos>", eos_token="<eos>")
fields = {"English": ("eng", english), "Telugu": ("tel", telugu)}
train_data, test_data = TabularDataset.splits(
    path="", train="train.csv", test="validation.csv", format="csv", fields=fields
)
english.build_vocab(train_data, max_size=30000, min_freq=1,vectors="glove.6B.200d")
telugu.build_vocab(train_data, max_size=30000, min_freq=1,vectors="glove.6B.200d")
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), 
    batch_size=16, 
    device="cpu",
    sort_within_batch=True,
    sort_key=lambda x: len(x.eng),
)



In [16]:
print(telugu.vocab.stoi["<eos>"])
print(telugu.vocab.stoi["<sos>"])

3
2


In [17]:
class Encoder(nn.Module):
    def __init__(self,input_dim, embed_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.input_dim = input_dim
        
        self.embedding = nn.Embedding(input_dim,embed_dim)
        self.rnn = nn.LSTM(embed_dim,hidden_dim,1)
        for x in self.embedding.parameters():
            x.requires_grad = False
    
    def forward(self, x):
        x = self.embedding(x)
        outputs ,(hidden,cell) = self.rnn(x)
        return hidden, cell

In [18]:
class Decoder(nn.Module):
    def __init__(self,input_dim,embed_dim,hidden_dim,output_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.input_dim = input_dim
        self.output_size = output_size
        
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        for x in self.embedding.parameters():
            x.requires_grad = False
    
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        x = self.embedding(x)
        outputs,(hidden,cell) = self.rnn(x,(hidden,cell))
        predictions = self.fc(outputs)
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell

In [19]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        hidden, cell = self.encoder(source)
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [39]:
# Training hyperparameters
num_epochs = 8
learning_rate = 0.01
batch_size = 16

# Model hyperparameters
load_model = False
device = torch.device("cuda:0" if False else "cpu")
input_size_encoder = len(english.vocab)
input_size_decoder = len(telugu.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 200
decoder_embedding_size = 200
hidden_size = 512

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
model.encoder.embedding.weight.data.copy_(english.vocab.vectors)
model.decoder.embedding.weight.data.copy_(telugu.vocab.vectors)

#english.vocab.vectors
#telugu.vocab.vectors

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [21]:
writer = SummaryWriter(f"runs/loss_plot")
step = 0

In [22]:
def translate_sentence(model, sentence, english, telugu, device, max_length=30):
    spacy_en = spacy.load("en")
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_en(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
        
    tokens.insert(0, english.init_token)
    tokens.append(english.eos_token)
    text_to_indices = [english.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    start_index = telugu.vocab.stoi["<sos>"]
    end_index = telugu.vocab.stoi["<eos>"]
    
    
    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [start_index]

    for _ in range(max_length):
        prev_pred = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(prev_pred, hidden, cell)
            pred = output.argmax(1).item()

        outputs.append(pred)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == end_index:
            #print('booyah')
            break

    translated_sentence = [telugu.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, english, telugu, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["eng"]
        trg = vars(example)["tel"]

        prediction = translate_sentence(model, src, english, telugu,device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [38]:
#if load_model:
#    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)


sentence = "I do not want to die."

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    #checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    #save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence,english, telugu, device, max_length=30
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    epoch_loss = 0
    x = 0.0
    
    for batch_idx, batch in enumerate(train_iterator):
        
        if(batch_idx%10 == 0):
            print('#',end="")
        
        # Get input and targets and get to cuda
        inp_data = batch.eng.to(device)
        target = batch.tel.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output_new = output[1:]
        output_new = output_new.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output_new, target)
        
        epoch_loss += loss.item()
        
        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1
        x += 1.0
    print('')
    print('EPOCH LOSS ',epoch_loss/x)
    print('Calculating BLEU Score...')
    score = bleu(test_data, model, english, telugu, device)
    print(f"Bleu score {score*100:.2f}")

[Epoch 0 / 8]
Translated example sentence: 
 ['నమ్మలేని', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు', 'శిశువు']
#

IndexError: Target 11538 is out of bounds.

In [33]:
output.shape

torch.Size([9, 16, 8371])

In [31]:
target.size(0)

128

In [32]:
output_new.size(1)

8371