<a href="https://colab.research.google.com/github/Parinita-Jain/NLP/blob/main/RussianToEnglishTOrchText0_6_0_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
#!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m61.4/64.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [1]:
import re
import time
import math
import random

import numpy as np
import pandas as pd
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

from tqdm import notebook
pd.set_option('display.max_colwidth', 200)
# check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:

# read dataset from the Google drive
df = pd.read_csv("nmt_data.csv")
test_df = pd.read_csv("nmt_data_test.csv")

# shape of datasets
df.shape, test_df.shape

((187053, 2), (46668, 2))

In [3]:
from spacy.lang.ru import Russian

In [4]:

# dependency for spaCy Russian tokenizer
!pip install pymorphy2



In [5]:

# spacy object for Russian
nlp_ru = Russian()

# spacy object for English
nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])

In [6]:
## functions to perform tokenization

# tokenizes Russian text from a string into a list of tokens
def tokenize_ru(text):
  return [tok.text for tok in nlp_ru.tokenizer(text)]

# tokenizes English text from a string into a list of tokens
def tokenize_en(text):
  return [tok.text for tok in nlp_en.tokenizer(text)]

In [7]:
## Create Field objects

# Field object for Russian
SRC = data.Field(tokenize = tokenize_ru,
                 include_lengths = True,
                 lower = True)

# Field object for English
TRG = data.Field(tokenize = tokenize_en,
                 init_token = '', # "start" token
                 eos_token = '', # "" token
                 include_lengths = True,
                 lower = True)

fields = [('rus', SRC), ('eng', TRG)]

In [8]:
nmt_data = data.TabularDataset(path="nmt_data.csv", format='csv', fields=fields)

In [9]:
# build vocabulary for Russian sequences
SRC.build_vocab(nmt_data, max_size=4000)

# build vocabulary for English sequences
TRG.build_vocab(nmt_data, max_size=4000)

In [10]:
len(SRC.vocab), len(TRG.vocab)

(4002, 4003)

In [11]:

# special tokens in input sequences (Russian)
SRC.vocab.itos[0], SRC.vocab.itos[1]

('<unk>', '<pad>')

In [12]:
# Split our dialogue data into training, validation, and test sets
train_data, val_data = nmt_data.split(split_ratio=0.8)

In [13]:

# Create a set of iterators for each split
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = 64,
    sort_within_batch = True,
    sort_key = lambda x:len(x.rus),
    device = device)

In [14]:

class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(input_dim, emb_dim)

    self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout = dropout)

  def forward(self, src):

    #src: [src len, batch size]

    embedded = self.embedding(src)

    #embedded: [src len, batch size, emb dim]

    outputs, hidden = self.gru(embedded)

    #outputs: [src len, batch size, hid dim]
    #hidden: [n layers, batch size, hid dim]

    return hidden

In [15]:
## embedding layer:
##    input dimensions = output_dim (size of English vocabulary),
##    ouput dimensions = emb_dim

## GRU layer:
##    input dimensions = emb_dim
##    hidden units = hid_dim
##    layers = n_layers
##    output dim = hid_dim

## Fully Connected layer:
##    input dimensions = hid_dim,
##    ouput dimensions = output_dim (size of English vocabulary)

class Decoder(nn.Module):
  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.output_dim = output_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(output_dim, emb_dim)

    self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout = dropout)

    self.fc_out = nn.Linear(hid_dim, output_dim)

  def forward(self, input, hidden):

    input = input.unsqueeze(0)
        #input = [1, batch size]

    embedded = self.embedding(input)

    #embedded = [1, batch size, emb dim]

    output, hidden = self.gru(embedded, hidden)

    #output = [seq len, batch size, hid dim]
    #hidden = [n layers, batch size, hid dim]

    #seq len will always be 1 in the decoder, therefore, output = [1, batch size, hid dim]

    prediction = self.fc_out(output.squeeze(0))

    #prediction = [batch size, output dim]

    return prediction, hidden


In [16]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg):

    #src = [src len, batch size]
    #trg = [trg len, batch size]

    batch_size = trg.shape[1]
    trg_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim

    # tensor to store decoder outputs
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

    # last hidden state of the encoder is used as the initial hidden state of the decoder
    hidden = self.encoder(src)

    # first input to the decoder is the  tokens
    input = trg[0,:]
    for t in range(1, trg_len):

      # insert input token embedding, previous hidden state
      # receive output tensor (predictions) and new hidden state
      output, hidden = self.decoder(input, hidden)

      # place predictions in a tensor holding predictions for each token
      outputs[t] = output
      input = trg[t,:]

    return outputs



In [17]:
#set hyperparameters
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

# instantiate Encoder and Decoder
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# instantiate Sequence-to-Sequence Model
model = Seq2Seq(enc, dec, device).to(device)

In [18]:
# Adam optimizer
optimizer = optim.Adam(model.parameters())

# pad token index
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# cross entropy loss with softmax
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [19]:
def train(model, iterator, optimizer, criterion):

  model.train()

  epoch_loss = 0

  for i, batch in notebook.tqdm(enumerate(iterator)):

    # set accumulated loss to zero
    optimizer.zero_grad()

    # get integer sequences (tensors)
    src = batch.rus[0]
    trg = batch.eng[0]

    # pass Russian tensor batch to the sequence-to-sequence model
    output = model(src, trg)

    #trg = [trg len, batch size]
    #output = [trg len, batch size, output dim]

    output_dim = output.shape[-1]

    output = output[1:].view(-1, output_dim)
    #output = [(trg len - 1) * batch size, output dim]

    trg = trg[1:].view(-1)
    #trg = [(trg len - 1) * batch size]

    # compute loss
    loss = criterion(output, trg)

    # backpropagate lossb
    loss.backward()

    # update weights
    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss / len(iterator)




In [21]:
def evaluate(model, iterator, criterion):

  model.eval()

  epoch_loss = 0

  with torch.no_grad():

    for i, batch in enumerate(iterator):

      # get integer sequences (tensors)
      src = batch.rus[0]
      trg = batch.eng[0]

      output = model(src, trg)

      #trg = [trg len, batch size]
      #output = [trg len, batch size, output dim]

      output_dim = output.shape[-1]

      output = output[1:].view(-1, output_dim)
      trg = trg[1:].view(-1)

      #trg = [(trg len - 1) * batch size]
      #output = [(trg len - 1) * batch size, output dim]
      loss = criterion(output, trg)

      epoch_loss += loss.item()

  return epoch_loss / len(iterator)



In [22]:
# function to compute time taken by an epoch (in mm:ss)
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs


In [23]:
N_EPOCHS = 10

best_valid_loss = float('inf')

# start model training
for epoch in range(N_EPOCHS):

  start_time = time.time()

  train_loss = train(model, train_iterator, optimizer, criterion)
  valid_loss = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # compare validation loss
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'best_model.pt')

  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


0it [00:00, ?it/s]

Epoch: 01 | Time: 0m 51s
	Train Loss: 2.911 | Train PPL:  18.383
	 Val. Loss: 2.048 |  Val. PPL:   7.755


0it [00:00, ?it/s]

Epoch: 02 | Time: 0m 50s
	Train Loss: 1.793 | Train PPL:   6.007
	 Val. Loss: 1.598 |  Val. PPL:   4.943


0it [00:00, ?it/s]

Epoch: 03 | Time: 0m 52s
	Train Loss: 1.441 | Train PPL:   4.226
	 Val. Loss: 1.421 |  Val. PPL:   4.141


0it [00:00, ?it/s]

Epoch: 04 | Time: 0m 48s
	Train Loss: 1.257 | Train PPL:   3.515
	 Val. Loss: 1.324 |  Val. PPL:   3.758


0it [00:00, ?it/s]

Epoch: 05 | Time: 0m 48s
	Train Loss: 1.141 | Train PPL:   3.131
	 Val. Loss: 1.285 |  Val. PPL:   3.615


0it [00:00, ?it/s]

Epoch: 06 | Time: 0m 51s
	Train Loss: 1.061 | Train PPL:   2.888
	 Val. Loss: 1.249 |  Val. PPL:   3.486


0it [00:00, ?it/s]

Epoch: 07 | Time: 0m 49s
	Train Loss: 0.999 | Train PPL:   2.716
	 Val. Loss: 1.233 |  Val. PPL:   3.431


0it [00:00, ?it/s]

Epoch: 08 | Time: 0m 48s
	Train Loss: 0.950 | Train PPL:   2.586
	 Val. Loss: 1.223 |  Val. PPL:   3.398


0it [00:00, ?it/s]

Epoch: 09 | Time: 0m 48s
	Train Loss: 0.911 | Train PPL:   2.486
	 Val. Loss: 1.215 |  Val. PPL:   3.371


0it [00:00, ?it/s]

Epoch: 10 | Time: 0m 46s
	Train Loss: 0.878 | Train PPL:   2.406
	 Val. Loss: 1.208 |  Val. PPL:   3.346


In [24]:

# load saved model weights
path = 'best_model.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [25]:
# function to perform translation
def translate_sentence(sentence, model):

  # set model at evaluation modeb
  model.eval()

  # empty list to keep input sequence tokensb
  token_int = []

  # iterate over the input sequence
  doc = nlp_ru(sentence)
  for i in doc:
    # convert tokens to
    token_int.append(SRC.vocab.stoi[i.text])

  # convert list to a PyTorch tensor
  token_int = torch.tensor([token_int]).to(device)
  token_int = token_int.reshape(-1,1)

  # pass the tensor to the encoder and get the context vector (hidden)
  hidden = model.encoder(token_int)

  # initialize the list with the start token's index
  trg_indexes = [TRG.vocab.stoi[TRG.init_token]]

  pred_token = TRG.vocab.stoi[TRG.init_token]

  while pred_token != TRG.vocab.stoi[TRG.eos_token]:
    trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
    with torch.no_grad():
      # pass the context vector (hidden) to the decoder
      output, hidden = model.decoder(trg_tensor, hidden)

    # get index of the largest value
    pred_token = output.argmax(1).item()
    trg_indexes.append(pred_token)

  # covert integers to tokens
  trg_tokens = [TRG.vocab.itos[i] for i in trg_indexes]

  return " ".join(trg_tokens[1:-1])



In [26]:

# actual translation "is it working"
sent = "это работает"
translate_sentence(sent, model)

''

In [27]:
translations = [translate_sentence(sent, model) for sent in notebook.tqdm(test_df["rus"])]

  0%|          | 0/46668 [00:00<?, ?it/s]

In [28]:
#add translations to the test dataframe
test_df["translations"] = translations

In [29]:
test_df.sample(20)

Unnamed: 0,rus,eng,translations
13955,не умирай том,tom don't die,
27618,я только что это купил,i just bought this,
39226,ты смотришь телевизор каждый день,do you watch tv every day,
45117,я не понял анекдота,i didn't understand the joke,
2341,я жду своего молодого человека,i'm waiting for my boyfriend,
19807,между двумя странами разразилась война,a war broke out between the two countries,
45493,здесь том и живёт,that's where tom lives,
16319,том с мэри могли расстаться,tom and mary might break up,
31809,дом тома сгорел дотла,tom's house burned to the ground,
37762,я верю том невиновен,i believe tom is innocent,
