<a href="https://colab.research.google.com/github/Ramubala/text-translation/blob/main/Seq2Seq_GRU_decompression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#! git clone https://github.com/Ramubala/text-translation.git
#%rm -rf '/content/text-translation'

In [2]:
# !pip install -q condacolab
# import condacolab
# condacolab.install()
# !conda install cudnn=8.3.2

In [3]:
%ls
#%cd text-translation
#!unzip eng_-french.csv.zip -d eng_-french

eng_-french.csv  [0m[01;34msample_data[0m/  translation_model_epoch_7.pt


In [4]:
import pandas as pd
data = pd.read_csv('/content/eng_-french.csv')
data = data[:100000]
print(data.shape)
data.head()

(100000, 2)


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [5]:
data.shape, data.columns

((100000, 2),
 Index(['English words/sentences', 'French words/sentences'], dtype='object'))

# Train a simple seq2seq model based on GRU

### Data Preparation

In [6]:
# we use spacy tokenizers
import spacy
import torchtext as tt
import collections
import numpy as np
from torchtext.vocab import vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn

In [7]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm
english_tokenizer = spacy.load('en_core_web_sm')
french_tokenizer = spacy.load('fr_core_news_sm')

2022-09-23 03:15:46.906189: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 10.8 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
2022-09-23 03:16:00.242115: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.

In [8]:
def make_vocab(df: pd.Series, tokenizer):
  counter_obj = collections.Counter()
  for item in df:
    line = item.strip()
    tokenized_words = [token.text for token in tokenizer(line)]
    counter_obj.update(tokenized_words)
  return vocab(counter_obj, min_freq = 1, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [9]:
en_vocab = make_vocab(data.loc[:,'English words/sentences'], english_tokenizer)
fr_vocab = make_vocab(data.loc[:,'French words/sentences'], french_tokenizer)

In [10]:
#train, test, val split
train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.6*len(data)), int(.8*len(data))])
train.reset_index(drop=True,inplace=True)
validate.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

train.shape, validate.shape, test.shape

((60000, 2), (20000, 2), (20000, 2))

In [11]:
bos_id = en_vocab.__getitem__('<bos>')
eos_id = en_vocab.__getitem__('<eos>')
pad_id = en_vocab.__getitem__('<pad>')

In [12]:
eos_id, pad_id

(3, 1)

In [13]:
def prepare_source_target(df):
  tuple_list = []
  for i in range(len(df)):
    eng_sentence = df.loc[i,'English words/sentences'].strip()
    french_sentence = df.loc[i,'French words/sentences'].strip()
    src = torch.tensor([en_vocab.__getitem__(token.text) for token in english_tokenizer(eng_sentence)])
    target = torch.tensor([fr_vocab.__getitem__(token.text) for token in french_tokenizer(french_sentence)])
    tuple_list.append((src, target))
  return tuple_list

train_data = prepare_source_target(train)
val_data = prepare_source_target(validate)
test_data = prepare_source_target(test)

In [14]:
# prepare dataloaders
def collate_fn(batch_sampler_data):
  src_batch = []
  target_batch =[] 
  for i,(src, target) in enumerate(batch_sampler_data):
    src_tensor = torch.tensor(torch.cat([torch.tensor([bos_id]), src, torch.tensor([eos_id])], dim=0))
    target_tensor = torch.tensor(torch.cat([torch.tensor([bos_id]), target, torch.tensor([eos_id])], dim=0))
    src_batch.append(src_tensor)
    target_batch.append(target_tensor)
  src_batch = pad_sequence(src_batch, padding_value=pad_id)
  target_batch = pad_sequence(target_batch, padding_value=pad_id)

  return src_batch, target_batch

train_dataloader = DataLoader(train_data,batch_size=128, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_data,batch_size=128, shuffle=True,collate_fn=collate_fn)
test_dataloader = DataLoader(test_data,batch_size=128, shuffle=True,collate_fn=collate_fn)

In [15]:
class Encoder(torch.nn.Module):
  def __init__(self, embedding_size: int, vocab_size : int, gru_layers: int):
    super().__init__()
    self.embedding_size = embedding_size
    self.hidden_size = 2*self.embedding_size
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_size)
    self.GRU = nn.GRU(input_size=self.embedding_size, hidden_size=self.hidden_size,num_layers=gru_layers)

  def forward(self, src_batch):
    # src_len*batch_size
    src_embedding = self.embedding(src_batch)
    #print("Encoder:: src embedding shape: {}".format(src_embedding.shape))
    # src_len*batch_size* embedding_dim
    output, hidden = self.GRU(src_embedding)
    #print("Encoder:: output shape: {}, hidden shape {}".format(output.shape, hidden.shape))
    return hidden, output

In [16]:
class Decoder(torch.nn.Module):
  def __init__(self, embedding_size: int, target_vocab_size : int, gru_layers: int):
    super().__init__()
    self.embedding_size = embedding_size
    self.hidden_size = 2*self.embedding_size
    self.output_size = self.hidden_size
    self.embedding = nn.Embedding(num_embeddings=target_vocab_size, embedding_dim=self.embedding_size)
    self.GRU = nn.GRU(input_size=self.embedding_size+self.hidden_size, hidden_size=self.hidden_size,num_layers=gru_layers)
    self.output_layer = nn.Linear(in_features=self.embedding_size+2*self.hidden_size,out_features=target_vocab_size)

  def forward(self, hidden, previous_prediction, context):
    word_embedding = self.embedding(previous_prediction)
    word_embedding = torch.unsqueeze(word_embedding, dim=0)
    #print("Decoder:: src embedding shape: {}".format(word_embedding.shape))
    # 1*batch_size*embedding_length
    #print(word_embedding.shape, context.shape)
    concat_embedding_context = torch.cat((word_embedding, context), dim=2)
    # 1*batch_size*(embedding_length+hidden_dim)
    output, hidden = self.GRU(concat_embedding_context, hidden)
    #print("Decoder:: output shape: {}, hidden shape {}".format(output.shape, hidden.shape))
    # output shape == (1,batch_size,hidden_dim)
    # hidden shape == (num_layers*num_directions,batch_size, hidden_dim)
    output = torch.concat((output, concat_embedding_context),dim=2)
    output = self.output_layer(output.squeeze(0))
    #print("Decoder:: output shape: {} after linear layer".format(output.shape))
    # 1*batch_size*target_vocab_size
    return output, hidden

In [17]:
import random
class Seq2Seq(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.encoder = Encoder(embedding_size=100, vocab_size=len(en_vocab), gru_layers=1)
    self.decoder = Decoder(embedding_size=100, target_vocab_size=len(fr_vocab), gru_layers=1)

  def forward(self, src_batch, target_batch, teacher_forcing_ratio: int):
    hidden, output = self.encoder(src_batch)
    context = hidden

    # output_shape: src_len*num_directions, batch_size, hidden_dim
    # hidden: num_layers*num_directions,batch_size, hidden_dim

    batch_size = src_batch.shape[1]
    src_length = src_batch.shape[0]
    max_len = target_batch.shape[0]

    final_decoder_output = torch.zeros(size=(max_len, batch_size, len(fr_vocab)), device= device)
    input = target_batch[0,:]
    for i in range(1,max_len):
      decoder_output, hidden = self.decoder(hidden, input, context)
      # decoder_output: 1*batch_size*target_vocab_size
      random_probab = random.random() > teacher_forcing_ratio
      prediction_word = decoder_output.argmax(dim=1)
      final_decoder_output[i,:,:] = decoder_output  
      input = prediction_word if random_probab else target_batch[i,:]
      input = input.squeeze()
    return final_decoder_output

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = Seq2Seq().to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.05, 0.05)
        
model.apply(init_weights)

cpu


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9644, 100)
    (GRU): GRU(100, 200)
  )
  (decoder): Decoder(
    (embedding): Embedding(17025, 100)
    (GRU): GRU(300, 200)
    (output_layer): Linear(in_features=500, out_features=17025, bias=True)
  )
)

In [None]:
# define criterion and optimizer
import torch.optim as optim

criterion = nn.CrossEntropyLoss(ignore_index = pad_id)
optimizer = optim.Adam(model.parameters())

In [None]:
def train_model(dataloader):
  clip = 0.1
  model.train()
  epoch_loss = 0

  for i, (src_batch, target_batch) in enumerate(dataloader):
    prediction = model(src_batch=src_batch.cuda(), target_batch=target_batch.cuda(), teacher_forcing_ratio=0.7)
    prediction = prediction[1:].view(-1, len(fr_vocab))
    target_batch = target_batch[1:].view(-1).cuda()
    loss = criterion(prediction, target_batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
  
  return epoch_loss / len(dataloader)

In [None]:
def eval_model(dataloader):
  model.eval()
  epoch_loss = 0

  for i, (src_batch, target_batch) in enumerate(dataloader):
    prediction = model(src_batch=src_batch.cuda(), target_batch=target_batch.cuda(), teacher_forcing_ratio=0)
    prediction = prediction[1:].view(-1, len(fr_vocab))
    target_batch = target_batch[1:].view(-1).cuda()
    loss = criterion(prediction, target_batch)
    #loss.backward()
    #optimizer.step()
    epoch_loss += loss.item()
  
  return epoch_loss / len(dataloader)

In [None]:
import time
import math

N_EPOCHS = 50

best_valid_loss = float('inf')

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train_model(dataloader = train_dataloader)
    valid_loss = eval_model(dataloader = val_dataloader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/translation_model_epoch_{}.pt'.format(epoch))
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  
  import sys


Epoch: 01 | Time: 0m 33s
	Train Loss: 4.519 | Train PPL:  91.705
	 Val. Loss: 4.481 |  Val. PPL:  88.338
Epoch: 02 | Time: 0m 33s
	Train Loss: 3.174 | Train PPL:  23.906
	 Val. Loss: 4.102 |  Val. PPL:  60.439
Epoch: 03 | Time: 0m 33s
	Train Loss: 2.560 | Train PPL:  12.936
	 Val. Loss: 3.700 |  Val. PPL:  40.455
Epoch: 04 | Time: 0m 33s
	Train Loss: 2.072 | Train PPL:   7.942
	 Val. Loss: 3.430 |  Val. PPL:  30.864
Epoch: 05 | Time: 0m 33s
	Train Loss: 1.697 | Train PPL:   5.457
	 Val. Loss: 3.239 |  Val. PPL:  25.511
Epoch: 06 | Time: 0m 33s
	Train Loss: 1.395 | Train PPL:   4.035
	 Val. Loss: 3.159 |  Val. PPL:  23.540
Epoch: 07 | Time: 0m 34s
	Train Loss: 1.180 | Train PPL:   3.256
	 Val. Loss: 3.120 |  Val. PPL:  22.642
Epoch: 08 | Time: 0m 34s
	Train Loss: 1.006 | Train PPL:   2.735
	 Val. Loss: 3.106 |  Val. PPL:  22.322
Epoch: 09 | Time: 0m 34s
	Train Loss: 0.888 | Train PPL:   2.431
	 Val. Loss: 3.124 |  Val. PPL:  22.729
Epoch: 10 | Time: 0m 34s
	Train Loss: 0.787 | Train PPL

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-210-c80e6b670f66>", line 18, in <module>
    train_loss = train_model(dataloader = train_dataloader)
  File "<ipython-input-208-617dab170ab4>", line 14, in train_model
    epoch_loss += loss.item()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2040, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerfram

KeyboardInterrupt: ignored

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [23]:
model.load_state_dict(torch.load('/content/translation_model_epoch_7.pt',map_location=torch.device('cpu')))

def translate(dataloader):
   model.eval()
   translations=[]
   for i, (src_batch, target_batch) in enumerate(dataloader):
        x = model(src_batch, target_batch, 0)
        batch_size = x.size()[1]
        #x = .view(-1, batch_size, len(fr_vocab))
        src_batch = src_batch[1:-1].permute(1,0)
        target_batch = target_batch[1:-1].permute(1,0)
        prediction_word = x[1:-1].argmax(dim=2).permute(1,0)
        preds = []
        for i in range(prediction_word.shape[0]):
          output = prediction_word[i,:]
          input = src_batch[i,:]
          target = target_batch[i,:]
          input = [en_vocab.vocab.get_itos()[item] for item in input]
          output = [fr_vocab.vocab.get_itos()[item] for item in output]
          target = [fr_vocab.vocab.get_itos()[item] for item in target]
          output = output[: output.index('<eos>')] if '<eos>' in output else output
          input = input[: input.index('<eos>')] if '<eos>' in input else input
          target = target[: target.index('<eos>')] if '<eos>' in target else target
          translations.append((" ".join(input), " ".join(output) ," ".join(target)))
          #print((" ".join(input), " ".join(output) ," ".join(target)))
   return translations 

translations = translate(test_dataloader)

  
  import sys


In [24]:
from torchtext.data.metrics import bleu_score
bleu_score([x[1].split(" ") for x in translations], [[x[2].split(" ")] for x in translations])

0.2948518090110107