<a href="https://colab.research.google.com/github/Ramubala/text-translation/blob/main/Simple_seq2seq_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository; the CSV read in a later cell is presumably
# shipped inside it (verify against the repository contents).
! git clone https://github.com/Ramubala/text-translation.git

Cloning into 'simple-image-recognition'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 8 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


In [None]:
%ls
%cd simple-image-recognition
#!unzip eng_-french.csv.zip -d eng_-french

eng_-french.csv  LICENSE  README.md  [0m[01;34msimple-image-recognition[0m/  tut1-model.pt
/content/simple-image-recognition/simple-image-recognition


In [None]:
import pandas as pd

# Load the parallel corpus and keep only the first 100,000 sentence pairs
# so the rest of the notebook runs in reasonable time.
data = pd.read_csv('eng_-french.csv').iloc[:100000]
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [None]:
# Sanity check: number of rows/columns and the column names after truncation.
data.shape, data.columns

((100000, 2),
 Index(['English words/sentences', 'French words/sentences'], dtype='object'))

# Train a simple seq2seq model based on GRU

### Data Preparation

In [None]:
# we use spacy tokenizers
import spacy
import torchtext as tt
import collections
import numpy as np
from torchtext.vocab import vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn

In [None]:
# Download the small English and French spaCy pipelines, then load them.
# The loaded pipelines are used below as the per-language word tokenizers.
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm
english_tokenizer = spacy.load('en_core_web_sm')
french_tokenizer = spacy.load('fr_core_news_sm')

2022-09-07 16:52:27.507864: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
2022-09-07 16:52:37.348723: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4

In [None]:
def make_vocab(df: pd.Series, tokenizer):
  """Build a torchtext vocabulary from a collection of sentences.

  Args:
    df: Iterable of sentence strings (a pandas Series in this notebook).
    tokenizer: Callable that maps a string to an iterable of tokens, each
      exposing a ``.text`` attribute (a spaCy pipeline in this notebook).

  Returns:
    The vocabulary built from every token seen at least once, together with
    the specials ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``. Tokens that
    are not in the vocabulary resolve to the id of ``<unk>``.
  """
  counter_obj = collections.Counter()
  for item in df:
    # Surrounding whitespace would otherwise be tokenized as its own token.
    counter_obj.update(token.text for token in tokenizer(item.strip()))

  vocabulary = vocab(counter_obj, min_freq=1, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
  # Without a default index, looking up an unseen token raises an error
  # instead of falling back to <unk>.
  vocabulary.set_default_index(vocabulary['<unk>'])
  return vocabulary

In [None]:
# One vocabulary per language, each built from its own column of the corpus.
en_vocab = make_vocab(data['English words/sentences'], english_tokenizer)
fr_vocab = make_vocab(data['French words/sentences'], french_tokenizer)

In [None]:
#train, test, val split
# Shuffle once with a fixed seed, then cut positionally into 60% / 20% / 20%.
shuffled = data.sample(frac=1, random_state=42)
train_end, val_end = int(.6*len(data)), int(.8*len(data))

# Each split gets a fresh 0..n-1 index; later cells address rows by label.
train = shuffled.iloc[:train_end].reset_index(drop=True)
validate = shuffled.iloc[train_end:val_end].reset_index(drop=True)
test = shuffled.iloc[val_end:].reset_index(drop=True)

train.shape, validate.shape, test.shape

((60000, 2), (20000, 2), (20000, 2))

In [None]:
# Ids of the special tokens, looked up in the English vocabulary.
bos_id, eos_id, pad_id = (en_vocab[token] for token in ('<bos>', '<eos>', '<pad>'))

In [None]:
# Display the ids assigned to the <eos> and <pad> special tokens.
eos_id, pad_id

(3, 1)

In [None]:
def prepare_source_target(df):
  """Turn every sentence pair of ``df`` into a pair of token-id tensors.

  Rows are addressed by the labels 0..len(df)-1, so ``df`` must carry a
  freshly reset integer index.

  Returns:
    A list of ``(src, target)`` tuples: English token ids and French token
    ids for each row, in row order.
  """
  def encode(sentence, tokenizer, vocabulary):
    # Strip, tokenize, then map every token's text to its vocabulary id.
    return torch.tensor([vocabulary[tok.text] for tok in tokenizer(sentence.strip())])

  return [
      (
          encode(df.loc[row, 'English words/sentences'], english_tokenizer, en_vocab),
          encode(df.loc[row, 'French words/sentences'], french_tokenizer, fr_vocab),
      )
      for row in range(len(df))
  ]

# Encode the three splits, in order, into lists of (src, target) id tensors.
train_data, val_data, test_data = (
    prepare_source_target(split) for split in (train, validate, test)
)

In [None]:
# prepare dataloaders
def collate_fn(batch_sampler_data):
  """Assemble a list of (src, target) id tensors into two padded batches.

  Every sequence is wrapped as ``<bos> ... <eos>`` and the sequences of each
  side are then padded with ``pad_id`` to the longest one in the batch.

  Args:
    batch_sampler_data: Iterable of ``(src, target)`` 1-D id tensors.

  Returns:
    ``(src_batch, target_batch)``, each of shape (max_len, batch_size).
  """
  bos = torch.tensor([bos_id])
  eos = torch.tensor([eos_id])

  src_batch = []
  target_batch = []
  for src, target in batch_sampler_data:
    # torch.cat already returns a new tensor; wrapping it in torch.tensor()
    # only adds a copy and a UserWarning. `.long()` keeps ids integral even
    # for an empty sentence, whose tensor would otherwise default to float.
    src_batch.append(torch.cat([bos, src.long(), eos], dim=0))
    target_batch.append(torch.cat([bos, target.long(), eos], dim=0))

  src_batch = pad_sequence(src_batch, padding_value=pad_id)
  target_batch = pad_sequence(target_batch, padding_value=pad_id)

  return src_batch, target_batch

# Only the training set is reshuffled every epoch. Validation and test batches
# keep a fixed order so the per-batch-averaged losses are reproducible.
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=128, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=128, shuffle=False, collate_fn=collate_fn)

In [None]:
class Encoder(torch.nn.Module):
  """Embeds a batch of source token ids and runs it through a multi-layer GRU.

  The GRU hidden width equals the embedding width.
  """

  def __init__(self, embedding_size: int, vocab_size : int, gru_layers: int):
    super().__init__()
    self.embedding_size = embedding_size
    self.embedding = nn.Embedding(vocab_size, embedding_size)
    self.GRU = nn.GRU(input_size=embedding_size, hidden_size=embedding_size, num_layers=gru_layers)

  def forward(self, src_batch):
    """Encode ``src_batch`` of shape (src_len, batch_size).

    Returns:
      ``(hidden, output)``: the GRU's final hidden state followed by its
      per-step outputs. Note the order is the reverse of what nn.GRU returns.
    """
    per_step_outputs, final_hidden = self.GRU(self.embedding(src_batch))
    return final_hidden, per_step_outputs

In [None]:
class Decoder(torch.nn.Module):
  """Single-step GRU decoder.

  Embeds the previously produced token of every sequence in the batch,
  advances the GRU by one step and projects the result to vocabulary logits.
  Embedding, hidden and output widths are all equal.
  """

  def __init__(self, embedding_size: int, target_vocab_size : int, gru_layers: int):
    super().__init__()
    self.embedding_size = self.hidden_size = self.output_size = embedding_size
    self.embedding = nn.Embedding(target_vocab_size, self.embedding_size)
    self.GRU = nn.GRU(input_size=self.embedding_size, hidden_size=self.hidden_size, num_layers=gru_layers)
    self.output_layer = nn.Linear(self.hidden_size, target_vocab_size)

  def forward(self, hidden, previous_prediction):
    """Advance decoding by one token.

    Args:
      hidden: GRU state, (num_layers, batch_size, hidden_size).
      previous_prediction: ids of the last token per sequence, (batch_size,).

    Returns:
      ``(logits, hidden)``: logits of shape (batch_size, target_vocab_size)
      and the updated GRU state.
    """
    # Add a leading length-1 time axis: (1, batch_size, embedding_size).
    step_input = self.embedding(previous_prediction).unsqueeze(0)
    gru_out, hidden = self.GRU(step_input, hidden)
    # Drop the time axis again before projecting to the vocabulary.
    logits = self.output_layer(gru_out.squeeze(0))
    return logits, hidden

In [None]:
import random
class Seq2Seq(torch.nn.Module):
  """Encoder-decoder translation model with scheduled teacher forcing."""

  def __init__(self):
    super().__init__()
    self.encoder = Encoder(embedding_size=100, vocab_size=len(en_vocab), gru_layers=2)
    self.decoder = Decoder(embedding_size=100, target_vocab_size=len(fr_vocab), gru_layers=2)

  def forward(self, src_batch, target_batch, teacher_forcing_ratio: float):
    """Decode ``target_batch.shape[0]`` steps for every sequence in the batch.

    Args:
      src_batch: source ids, (src_len, batch_size).
      target_batch: target ids, (target_len, batch_size); row 0 seeds the
        decoder.
      teacher_forcing_ratio: probability in [0, 1] of feeding the ground-truth
        token, rather than the model's own prediction, into the next step.

    Returns:
      Logits of shape (target_len, batch_size, target_vocab_size). Row 0 is
      left at zero because nothing is predicted for the first position.
    """
    # The encoder's final state initialises the decoder; per-step outputs are unused.
    hidden, _ = self.encoder(src_batch)

    batch_size = src_batch.shape[1]
    max_len = target_batch.shape[0]
    target_vocab_size = self.decoder.output_layer.out_features

    # Allocate on the inputs' device so the module does not depend on a global.
    final_decoder_output = torch.zeros(
        size=(max_len, batch_size, target_vocab_size), device=src_batch.device)

    decoder_input = target_batch[0, :]
    for i in range(1, max_len):
      decoder_output, hidden = self.decoder(hidden, decoder_input)
      final_decoder_output[i] = decoder_output

      use_own_prediction = random.random() > teacher_forcing_ratio
      # Both candidates are already 1-D (batch_size,). Squeezing them would
      # collapse a batch of size 1 to a scalar and break the next step.
      decoder_input = decoder_output.argmax(dim=1) if use_own_prediction else target_batch[i, :]
    return final_decoder_output

In [None]:
# Prefer the first CUDA GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model = Seq2Seq().to(device)

def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with U(-0.05, 0.05) samples."""
    for _, param in m.named_parameters():
        nn.init.uniform_(param.data, a=-0.05, b=0.05)

# `apply` visits every submodule and then the model itself.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9644, 100)
    (GRU): GRU(100, 100, num_layers=2)
  )
  (decoder): Decoder(
    (embedding): Embedding(17025, 100)
    (GRU): GRU(100, 100, num_layers=2)
    (output_layer): Linear(in_features=100, out_features=17025, bias=True)
  )
)

In [None]:
# define criterion and optimizer
import torch.optim as optim

# Targets equal to the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
# Adam with the library's default hyperparameters.
optimizer = optim.Adam(params=model.parameters())

In [None]:
def train_model(dataloader):
  """Run one training epoch over ``dataloader`` and return the mean batch loss.

  Uses the notebook-level ``model``, ``criterion``, ``optimizer`` and
  ``device``. Decoding uses a teacher-forcing ratio of 0.7.

  Args:
    dataloader: Sized iterable of ``(src_batch, target_batch)`` id tensors,
      each of shape (seq_len, batch_size).

  Returns:
    The loss averaged over the batches of the epoch.
  """
  model.train()
  epoch_loss = 0

  for src_batch, target_batch in dataloader:
    # The model lives on `device`, so its inputs must be moved there too.
    src_batch = src_batch.to(device)
    target_batch = target_batch.to(device)

    # Clear the gradients left by the previous batch; otherwise backward()
    # keeps adding to them and every step uses a running sum.
    optimizer.zero_grad()

    prediction = model(src_batch=src_batch, target_batch=target_batch, teacher_forcing_ratio=0.7)
    # Position 0 is the <bos> seed and is never predicted, so drop it, then
    # flatten (time, batch) into one axis for the loss.
    prediction = prediction[1:].reshape(-1, prediction.shape[-1])
    flat_target = target_batch[1:].reshape(-1)

    loss = criterion(prediction, flat_target)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()

  return epoch_loss / len(dataloader)

In [None]:
def eval_model(dataloader):
  """Compute the mean batch loss over ``dataloader`` without updating weights.

  Uses the notebook-level ``model``, ``criterion`` and ``device``. The
  teacher-forcing ratio is 0, so the decoder is fed its own predictions.

  Args:
    dataloader: Sized iterable of ``(src_batch, target_batch)`` id tensors,
      each of shape (seq_len, batch_size).

  Returns:
    The loss averaged over the batches.
  """
  model.eval()
  epoch_loss = 0

  # Nothing is backpropagated here, so skip building the autograd graph.
  with torch.no_grad():
    for src_batch, target_batch in dataloader:
      # The model lives on `device`, so its inputs must be moved there too.
      src_batch = src_batch.to(device)
      target_batch = target_batch.to(device)

      prediction = model(src_batch=src_batch, target_batch=target_batch, teacher_forcing_ratio=0)
      # Drop the <bos> position and flatten (time, batch) for the loss.
      prediction = prediction[1:].reshape(-1, prediction.shape[-1])
      flat_target = target_batch[1:].reshape(-1)

      loss = criterion(prediction, flat_target)
      epoch_loss += loss.item()

  return epoch_loss / len(dataloader)

In [None]:
import time
import math

# Number of full passes over the training set.
N_EPOCHS = 10

# Lowest validation loss seen so far; starts at +infinity so that the first
# epoch always counts as an improvement.
best_valid_loss = math.inf

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - 60 * minutes)
    return minutes, seconds

for epoch in range(N_EPOCHS):
    # Wall-clock timing covers the training pass and the validation pass together.
    start_time = time.time()
    train_loss = train_model(dataloader=train_dataloader)
    valid_loss = eval_model(dataloader=val_dataloader)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves on the best so far.
    if valid_loss < best_valid_loss:
        torch.save(model.state_dict(), 'tut1-model.pt')
        best_valid_loss = valid_loss

    # Perplexity is the exponential of the mean cross-entropy loss.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  
  import sys


Epoch: 01 | Time: 13m 22s
	Train Loss: 12.594 | Train PPL: 294899.317
	 Val. Loss: 21.323 |  Val. PPL: 1821281801.282
