<a href="https://colab.research.google.com/github/Ramubala/text-translation/blob/main/Seq2Seq_GRU_decompression_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#! git clone https://github.com/Ramubala/text-translation.git
#%rm -rf '/content/text-translation'

In [2]:
!pip install -q condacolab
import condacolab
condacolab.install()
!conda install cudnn=8.3.2

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:33
🔁 Restarting kernel...
Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - 

In [None]:
%ls
#%cd text-translation
#!unzip eng_-french.csv.zip -d eng_-french

In [1]:
import pandas as pd
data = pd.read_csv('/content/eng_-french.csv')
data = data[:100000]
print(data.shape)
data.head()

(100000, 2)


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [2]:
data.shape, data.columns

((100000, 2),
 Index(['English words/sentences', 'French words/sentences'], dtype='object'))

# Train a simple seq2seq model based on GRU

### Data Preparation

In [3]:
# we use spacy tokenizers
import spacy
import torchtext as tt
import collections
import numpy as np
from torchtext.vocab import vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn

In [4]:
!spacy download en_core_web_sm
!spacy download fr_core_news_sm
english_tokenizer = spacy.load('en_core_web_sm')
french_tokenizer = spacy.load('fr_core_news_sm')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 8.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 8.4 MB/s 
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [5]:
def make_vocab(df: pd.Series, tokenizer):
  counter_obj = collections.Counter()
  for item in df:
    line = item.strip()
    tokenized_words = [token.text for token in tokenizer(line)]
    counter_obj.update(tokenized_words)
  return vocab(counter_obj, min_freq = 1, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [6]:
en_vocab = make_vocab(data.loc[:,'English words/sentences'], english_tokenizer)
fr_vocab = make_vocab(data.loc[:,'French words/sentences'], french_tokenizer)

In [7]:
#train, test, val split
train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.6*len(data)), int(.8*len(data))])
train.reset_index(drop=True,inplace=True)
validate.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

train.shape, validate.shape, test.shape

((60000, 2), (20000, 2), (20000, 2))

In [8]:
bos_id = en_vocab.__getitem__('<bos>')
eos_id = en_vocab.__getitem__('<eos>')
pad_id = en_vocab.__getitem__('<pad>')

In [9]:
eos_id, pad_id

(3, 1)

In [10]:
def prepare_source_target(df):
  tuple_list = []
  for i in range(len(df)):
    eng_sentence = df.loc[i,'English words/sentences'].strip()
    french_sentence = df.loc[i,'French words/sentences'].strip()
    src = torch.tensor([en_vocab.__getitem__(token.text) for token in english_tokenizer(eng_sentence)])
    target = torch.tensor([fr_vocab.__getitem__(token.text) for token in french_tokenizer(french_sentence)])
    tuple_list.append((src, target))
  return tuple_list

train_data = prepare_source_target(train)
val_data = prepare_source_target(validate)
test_data = prepare_source_target(test)

In [11]:
# prepare dataloaders
def collate_fn(batch_sampler_data):
  src_batch = []
  target_batch =[] 
  for i,(src, target) in enumerate(batch_sampler_data):
    src_tensor = torch.tensor(torch.cat([torch.tensor([bos_id]), src, torch.tensor([eos_id])], dim=0))
    target_tensor = torch.tensor(torch.cat([torch.tensor([bos_id]), target, torch.tensor([eos_id])], dim=0))
    src_batch.append(src_tensor)
    target_batch.append(target_tensor)
  src_batch = pad_sequence(src_batch, padding_value=pad_id)
  target_batch = pad_sequence(target_batch, padding_value=pad_id)

  return src_batch, target_batch

train_dataloader = DataLoader(train_data,batch_size=128, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_data,batch_size=128, shuffle=True,collate_fn=collate_fn)
test_dataloader = DataLoader(test_data,batch_size=128, shuffle=True,collate_fn=collate_fn)

In [12]:
class Encoder(torch.nn.Module):
  def __init__(self, embedding_size: int, vocab_size : int, gru_layers: int):
    super().__init__()
    self.embedding_size = embedding_size
    self.hidden_size = 2*self.embedding_size
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_size)
    self.GRU = nn.GRU(input_size=self.embedding_size, hidden_size=self.hidden_size,num_layers=gru_layers, bidirectional=True)
    self.l1_layer = nn.Linear(in_features=2*self.hidden_size,out_features=self.hidden_size)

  def forward(self, src_batch):
    # src_len*batch_size
    src_embedding = self.embedding(src_batch)
    #print("Encoder:: src embedding shape: {}".format(src_embedding.shape))
    # src_len*batch_size* embedding_dim
    output, hidden = self.GRU(src_embedding)
    context = torch.tanh(self.l1_layer(torch.cat((hidden[-2,:,:],hidden[-1,:,:]), dim=1)))
    #print("Encoder:: output shape: {}, hidden shape {}".format(output.shape, hidden.shape))
    return context, output

In [13]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        #repeat decoder hidden state src_len times
        
        hidden = hidden.permute(1, 0, 2)
        hidden = hidden.repeat(1, src_len, 1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return torch.nn.functional.softmax(attention, dim=1)

In [14]:
class Decoder(torch.nn.Module):
  def __init__(self, embedding_size: int, target_vocab_size : int, gru_layers: int):
    super().__init__()
    self.embedding_size = embedding_size
    self.hidden_size = 2*self.embedding_size
    self.output_size = self.hidden_size
    self.embedding = nn.Embedding(num_embeddings=target_vocab_size, embedding_dim=self.embedding_size)
    self.GRU = nn.GRU(input_size=self.embedding_size+2*self.hidden_size, hidden_size=self.hidden_size,num_layers=gru_layers)
    self.attention_layer = Attention(self.hidden_size,self.hidden_size)
    self.output_layer = nn.Linear(in_features=self.embedding_size+3*self.hidden_size,out_features=target_vocab_size)

  def forward(self, hidden, previous_prediction, context, encoder_outputs):
    word_embedding = self.embedding(previous_prediction)
    word_embedding = torch.unsqueeze(word_embedding, dim=0)
    #print("Decoder:: src embedding shape: {}".format(word_embedding.shape))
    # 1*batch_size*embedding_length
    #print(word_embedding.shape, context.shape)
    if hidden.dim()<3:
        hidden = hidden.unsqueeze(0)
    attention = self.attention_layer(hidden, encoder_outputs)
    # batch_size*src_len
    attention = attention.unsqueeze(1)

    # batch_size*1*src_len
    encoder_outputs = encoder_outputs.permute(1, 0, 2)

    # encoder_outputs batch_size*src_len*hidden_dim
    new_context = torch.bmm(attention, encoder_outputs).permute(1,0,2)
    concat_embedding_context = torch.cat((word_embedding, new_context), dim=2)
    # 1*batch_size*(embedding_length+hidden_dim)

    
    output, hidden = self.GRU(concat_embedding_context, hidden)
    #print("Decoder:: output shape: {}, hidden shape {}".format(output.shape, hidden.shape))
    # output shape == (1,batch_size,hidden_dim)
    # hidden shape == (num_layers*num_directions,batch_size, hidden_dim)
    output = torch.concat((output, concat_embedding_context),dim=2)
    output = self.output_layer(output.squeeze(0))
    #print("Decoder:: output shape: {} after linear layer".format(output.shape))
    # 1*batch_size*target_vocab_size
    return output, hidden

In [15]:
import random
class Seq2Seq(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.encoder = Encoder(embedding_size=100, vocab_size=len(en_vocab), gru_layers=1)
    self.decoder = Decoder(embedding_size=100, target_vocab_size=len(fr_vocab), gru_layers=1)

  def forward(self, src_batch, target_batch, teacher_forcing_ratio: int):
    hidden, output = self.encoder(src_batch)
    context = hidden

    # output_shape: src_len*num_directions, batch_size, hidden_dim
    # hidden: num_layers*num_directions,batch_size, hidden_dim

    batch_size = src_batch.shape[1]
    src_length = src_batch.shape[0]
    max_len = target_batch.shape[0]

    final_decoder_output = torch.zeros(size=(max_len, batch_size, len(fr_vocab)), device= device)
    input = target_batch[0,:]
    for i in range(1,max_len):
      decoder_output, hidden = self.decoder(hidden, input, context, output)
      # decoder_output: 1*batch_size*target_vocab_size
      random_probab = random.random() > teacher_forcing_ratio
      prediction_word = decoder_output.argmax(dim=1)
      final_decoder_output[i,:,:] = decoder_output  
      input = prediction_word if random_probab else target_batch[i,:]
      input = input.squeeze()
    return final_decoder_output

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = Seq2Seq().to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.05, 0.05)
        
model.apply(init_weights)

cuda:0


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9644, 100)
    (GRU): GRU(100, 200, bidirectional=True)
    (l1_layer): Linear(in_features=400, out_features=200, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(17025, 100)
    (GRU): GRU(500, 200)
    (attention_layer): Attention(
      (attn): Linear(in_features=600, out_features=200, bias=True)
      (v): Linear(in_features=200, out_features=1, bias=False)
    )
    (output_layer): Linear(in_features=700, out_features=17025, bias=True)
  )
)

In [17]:
# define criterion and optimizer
import torch.optim as optim

criterion = nn.CrossEntropyLoss(ignore_index = pad_id)
optimizer = optim.Adam(model.parameters())

In [18]:
def train_model(dataloader):
  clip = 0.1
  model.train()
  epoch_loss = 0

  for i, (src_batch, target_batch) in enumerate(dataloader):
    prediction = model(src_batch=src_batch.cuda(), target_batch=target_batch.cuda(), teacher_forcing_ratio=0.7)
    prediction = prediction[1:].view(-1, len(fr_vocab))
    target_batch = target_batch[1:].view(-1).cuda()
    loss = criterion(prediction, target_batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
  
  return epoch_loss / len(dataloader)

In [19]:
def eval_model(dataloader):
  model.eval()
  epoch_loss = 0

  for i, (src_batch, target_batch) in enumerate(dataloader):
    prediction = model(src_batch=src_batch.cuda(), target_batch=target_batch.cuda(), teacher_forcing_ratio=0)
    prediction = prediction[1:].view(-1, len(fr_vocab))
    target_batch = target_batch[1:].view(-1).cuda()
    loss = criterion(prediction, target_batch)
    #loss.backward()
    #optimizer.step()
    epoch_loss += loss.item()
  
  return epoch_loss / len(dataloader)

In [20]:
import time
import math

N_EPOCHS = 50

best_valid_loss = float('inf')

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train_model(dataloader = train_dataloader)
    valid_loss = eval_model(dataloader = val_dataloader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/translation_model_attention_epoch_{}.pt'.format(epoch))
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  
  import sys


Epoch: 01 | Time: 0m 45s
	Train Loss: 4.222 | Train PPL:  68.173
	 Val. Loss: 3.954 |  Val. PPL:  52.132
Epoch: 02 | Time: 0m 45s
	Train Loss: 2.576 | Train PPL:  13.140
	 Val. Loss: 3.201 |  Val. PPL:  24.559
Epoch: 03 | Time: 0m 45s
	Train Loss: 1.791 | Train PPL:   5.998
	 Val. Loss: 2.863 |  Val. PPL:  17.522
Epoch: 04 | Time: 0m 45s
	Train Loss: 1.341 | Train PPL:   3.822
	 Val. Loss: 2.765 |  Val. PPL:  15.886
Epoch: 05 | Time: 0m 45s
	Train Loss: 1.082 | Train PPL:   2.951
	 Val. Loss: 2.715 |  Val. PPL:  15.107
Epoch: 06 | Time: 0m 45s
	Train Loss: 0.891 | Train PPL:   2.437
	 Val. Loss: 2.756 |  Val. PPL:  15.738
Epoch: 07 | Time: 0m 45s
	Train Loss: 0.768 | Train PPL:   2.155
	 Val. Loss: 2.795 |  Val. PPL:  16.369
Epoch: 08 | Time: 0m 45s
	Train Loss: 0.676 | Train PPL:   1.966
	 Val. Loss: 2.850 |  Val. PPL:  17.289
Epoch: 09 | Time: 0m 45s
	Train Loss: 0.608 | Train PPL:   1.836
	 Val. Loss: 2.870 |  Val. PPL:  17.630
Epoch: 10 | Time: 0m 45s
	Train Loss: 0.556 | Train PPL

KeyboardInterrupt: ignored

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')