<a href="https://colab.research.google.com/github/ParijatSutradhar04/ML-learning/blob/main/Seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'bilingual-sentence-pairs:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1118439%2F1878727%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240628%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240628T124106Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0ec9650f8067fda440f9f8199e902018b7441be9baa153546b509ec4f0732d1dd7af68e4d1738d5a4845166ad84a566bfba5dd491ad09b49dbb358a085c4dc2d65255abfba6dc3f63876d06b603d70cd8a1be1734108f1312711abc8b6ce9f47eedf3e14dfe53424cf1c92e3bdba19efbcc364ebda7df80949ae649926a12a4c4966aa31ddc759e20992a8b2f64f9aa1a0e33d1f48f37c6ca524111f57ca6a4f30389ef002339b8c6330d9ef4bba1a026297b7bb1c3d70e50134c1957390b5acabceace26537138c40356d6ba4b2f49c5546c75c2db8333ec5d8a08937735fac039002a419b76a65b539572eac092538b2a6fed494dc212daafa498daf749d31'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A source that cannot be fetched is reported
# and skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry has the form '<directory>:<percent-encoded URL>'. Split on the
    # first colon only so any later colon stays part of the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without a positive value the
            # progress bar cannot be scaled, so only the byte count is shown.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to the file and rewind so the archive
            # readers below see the complete download from its start.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the module and break every later tar source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must come before OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string
import re
import matplotlib.pyplot as plt

In [None]:
def read_text(filename):
    """Return the entire contents of a UTF-8 text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()


def to_lines(text):
    """Split tab-separated text into one list of fields per line.

    Leading and trailing whitespace of the whole text is stripped first, then
    the text is cut on newlines and each line on tab characters.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

In [None]:
# Read the tab-separated sentence pairs and hold them as a NumPy array with
# one row per line of the file.
raw_text = read_text("../input/bilingual-sentence-pairs/deu.txt")
deu_eng = np.array(to_lines(raw_text))

print(deu_eng[:10])

In [None]:
# Normalise both text columns: delete the characters in string.punctuation and
# lowercase. The deletion table is built once instead of once per sentence.
punctuation_table = str.maketrans('', '', string.punctuation)

deu_eng[:, 0] = [s.translate(punctuation_table).lower() for s in deu_eng[:, 0]]
deu_eng[:, 1] = [s.translate(punctuation_table).lower() for s in deu_eng[:, 1]]

# Sentence lengths, counted in whitespace-separated tokens, for each column.
eng_l = [len(s.split()) for s in deu_eng[:, 0]]
deu_l = [len(s.split()) for s in deu_eng[:, 1]]

length_df = pd.DataFrame({'eng': eng_l, 'deu': deu_l})

# One histogram per column shows how sentence lengths are distributed.
length_df.hist(bins=30)
plt.show()

In [None]:
# Display the first ten rows of deu_eng as this cell's output.
deu_eng[:10]

In [None]:
# Install torchtext and spaCy, then download the spaCy pipelines
# en_core_web_sm and fr_core_news_sm.
# NOTE(review): the visible cells tokenize with str.split() and never import
# spaCy, so these downloads look unused — confirm before keeping them.
!pip install torchtext spacy
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.metrics import bleu_score
from torch.utils.data import DataLoader, Dataset, random_split

In [None]:
def tokenize_en(sentence):
    """Lowercase *sentence* and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    return lowered.split()

def tokenize_fr(sentence):
    """Lowercase *sentence* and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    return lowered.split()

def yield_tokens(data, tokenizer):
    """Lazily yield ``tokenizer(sentence)`` for every sentence in *data*, in order."""
    yield from map(tokenizer, data)

# Reserved tokens placed in both vocabularies: unknown word, padding,
# start-of-sentence and end-of-sentence.
special_tokens = ['<unk>', '<pad>', '<sos>', '<eos>']

# Source side: built from column 0 of deu_eng.
tokenizer_en = get_tokenizer(tokenize_en)
vocab_en = build_vocab_from_iterator(
    yield_tokens(deu_eng[:, 0], tokenizer_en),
    specials=list(special_tokens),
)
# Tokens missing from the vocabulary map to '<unk>'.
vocab_en.set_default_index(vocab_en['<unk>'])

# Target side: built from column 1 of deu_eng.
tokenizer_fr = get_tokenizer(tokenize_fr)
vocab_fr = build_vocab_from_iterator(
    yield_tokens(deu_eng[:, 1], tokenizer_fr),
    specials=list(special_tokens),
)
vocab_fr.set_default_index(vocab_fr['<unk>'])

In [None]:
class TranslationDataset(Dataset):
    """Dataset of parallel sentences encoded as vocabulary-index tensors.

    Item ``idx`` is a ``(src, trg)`` pair of 1-D tensors. Each one holds the
    '<sos>' index, the index of every token of the sentence, then the '<eos>'
    index. Sentences are tokenized and looked up on every access.
    """

    def __init__(self, english_sentences, french_sentences, vocab_en, vocab_fr, tokenizer_en, tokenizer_fr):
        # Source side.
        self.english_sentences = english_sentences
        self.vocab_en = vocab_en
        self.tokenizer_en = tokenizer_en
        # Target side.
        self.french_sentences = french_sentences
        self.vocab_fr = vocab_fr
        self.tokenizer_fr = tokenizer_fr

    def __len__(self):
        # The source sentences define the dataset size.
        return len(self.english_sentences)

    @staticmethod
    def _encode(sentence, vocab, tokenizer):
        """Return ``<sos> tokens... <eos>`` for *sentence* as an index tensor."""
        indices = [vocab['<sos>']]
        indices.extend(vocab[token] for token in tokenizer(sentence))
        indices.append(vocab['<eos>'])
        return torch.tensor(indices)

    def __getitem__(self, idx):
        src = self._encode(self.english_sentences[idx], self.vocab_en, self.tokenizer_en)
        trg = self._encode(self.french_sentences[idx], self.vocab_fr, self.tokenizer_fr)
        return src, trg

In [None]:
def collate_fn(batch):
    """Pad a list of ``(src, trg)`` tensor pairs into two batch tensors.

    Source sequences are padded with the '<pad>' index of ``vocab_en`` and
    target sequences with the '<pad>' index of ``vocab_fr``. Both vocabularies
    are read from module scope.

    Returns:
        Tuple ``(src_batch, trg_batch)`` as produced by ``pad_sequence``.
    """
    sources, targets = zip(*batch)
    padded_sources = pad_sequence(sources, padding_value=vocab_en['<pad>'])
    padded_targets = pad_sequence(targets, padding_value=vocab_fr['<pad>'])
    return padded_sources, padded_targets

# Wrap the sentence pairs (column 0 as source, column 1 as target).
dataset = TranslationDataset(deu_eng[:, 0], deu_eng[:, 1], vocab_en, vocab_fr, tokenizer_en, tokenizer_fr)

# Random 80/20 split into training and test subsets.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Both loaders share the same batching configuration.
loader_options = dict(batch_size=32, collate_fn=collate_fn, shuffle=True)
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [None]:
class Encoder(nn.Module):
    """Embed a source sequence and run it through a multi-layer LSTM.

    Args:
        input_dim: Number of rows in the embedding table.
        emb_dim: Size of each embedding vector.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return the LSTM's final ``(hidden, cell)`` state for *src*."""
        token_vectors = self.embedding(src)
        # The per-step outputs are discarded; only the final state is returned.
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: Number of rows in the embedding table and size of the output layer.
        emb_dim: Size of each embedding vector.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one step.

        Returns:
            Tuple ``(prediction, hidden, cell)``: the output-layer scores for
            this step and the updated LSTM state.
        """
        # Add a leading length-1 axis so the LSTM sees a one-step sequence.
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        rnn_out, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        # Drop the length-1 axis again before the output projection.
        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Module whose call returns the initial ``(hidden, cell)`` state.
        decoder: Module called as ``decoder(input, hidden, cell)``; it must
            expose ``fc_out.out_features`` as the target vocabulary size.
        device: Device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return decoder scores of shape ``(trg_len, batch_size, vocab_size)``.

        Position 0 of the result is never written and stays all zeros. At each
        later step the next decoder input is the ground-truth token with
        probability *teacher_forcing_ratio*, otherwise the decoder's own
        highest-scoring token.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        # The encoder's final state seeds the decoder.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row.
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = step_scores

            # One random draw per step decides the feedback for the whole batch.
            use_ground_truth = torch.rand(1).item() < teacher_forcing_ratio
            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

In [None]:
# Vocabulary sizes determine the embedding tables and the output layer.
INPUT_DIM = len(vocab_en)
OUTPUT_DIM = len(vocab_fr)

# Network hyper-parameters.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

# Target positions holding '<pad>' are excluded from the loss.
PAD_IDX = vocab_fr['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = optim.Adam(model.parameters())

In [None]:
# Print the model's module structure as a quick sanity check.
print(model)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over *iterator* and return the mean batch loss.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        model: Called as ``model(src, trg)``; returns scores whose last axis
            is the vocabulary size.
        iterator: Sized iterable of ``(src, trg)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(scores, targets)``.
        clip: Value passed to ``clip_grad_norm_`` as the maximum gradient norm.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Skip position 0 on both sides, then flatten the remaining
        # positions so the loss sees one row per target token.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

# Training schedule: N_EPOCHS full passes over train_dataloader. CLIP is the
# value train() hands to clip_grad_norm_.
N_EPOCHS = 30
train_losses = []  # mean training loss of each epoch, in epoch order
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    train_losses.append(train_loss)
    # Epochs are reported 1-based.
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.3f}')

In [None]:
# Plot the mean training loss recorded for each epoch (x-axis is 1-based).
epoch_numbers = range(1, N_EPOCHS + 1)

plt.figure(figsize=(10, 6))
plt.plot(epoch_numbers, train_losses, marker='o', label='Train Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Train Loss vs. Epochs')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Persist the trained model object to 'seq2seq.pth' in the current directory.
# NOTE(review): this serializes the whole module rather than
# model.state_dict(); loading it back presumably needs the Encoder, Decoder
# and Seq2Seq class definitions to be available — confirm this is intended.
torch.save(model, 'seq2seq.pth')

In [None]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss of *model* over *iterator* without training.

    The model is put in eval mode, gradients are disabled and the teacher
    forcing ratio is 0. Batches are moved to the module-level ``device``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)
            scores = model(src, trg, 0)  # Turn off teacher forcing

            # Skip position 0 on both sides and flatten to one row per token.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

def _tokens_until_eos(indices, vocab, eos_idx, pad_idx):
    """Return the tokens for *indices* up to, not including, the first '<eos>'.

    Padding indices met before the end marker are skipped.
    """
    tokens = []
    for idx in indices:
        idx = int(idx)  # plain int for the vocabulary lookup
        if idx == eos_idx:
            break
        if idx != pad_idx:
            tokens.append(vocab.lookup_token(idx))
    return tokens


def calculate_bleu(iterator, model, vocab_fr):
    """Return the corpus BLEU score of *model*'s predictions over *iterator*.

    Each batch is decoded with the teacher forcing ratio set to 0. Hypotheses
    and references both start after the '<sos>' position and stop at their
    first '<eos>', so tokens the model emits after the end of a sentence are
    not scored. Batches are moved to the module-level ``device``.

    Args:
        iterator: Iterable of ``(src, trg)`` batches.
        model: Called as ``model(src, trg, 0)``.
        vocab_fr: Target vocabulary providing ``lookup_token`` and the
            '<pad>' / '<eos>' indices.

    Returns:
        The value returned by ``bleu_score(candidates, references)``.
    """
    # Look the special indices up once, not once per token.
    pad_idx = vocab_fr['<pad>']
    eos_idx = vocab_fr['<eos>']

    trgs = []
    pred_trgs = []

    model.eval()

    with torch.no_grad():
        for (src, trg) in iterator:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, 0)  # Turn off teacher forcing

            # Row 0 is the '<sos>' position: the model never fills it and the
            # reference holds '<sos>' there, so both sides start at row 1.
            predicted = output.argmax(2)[1:].cpu().numpy()
            reference = trg[1:].cpu().numpy()

            for i in range(predicted.shape[1]):
                pred_trgs.append(_tokens_until_eos(predicted[:, i], vocab_fr, eos_idx, pad_idx))
                # bleu_score takes a list of references per candidate.
                trgs.append([_tokens_until_eos(reference[:, i], vocab_fr, eos_idx, pad_idx)])

    return bleu_score(pred_trgs, trgs)

def test(model, iterator, criterion, vocab_fr):
    """Print the mean loss and BLEU score of *model* over *iterator*.

    Args:
        model: Model passed on to ``evaluate`` and ``calculate_bleu``.
        iterator: Batches to score; it is iterated twice, once per metric.
        criterion: Loss function passed on to ``evaluate``.
        vocab_fr: Target vocabulary passed on to ``calculate_bleu``.
    """
    test_loss = evaluate(model, iterator, criterion)
    bleu = calculate_bleu(iterator, model, vocab_fr)

    print(f'Loss: {test_loss:.3f} | BLEU: {bleu:.2f}')


# Report loss and BLEU on the training split first, then on the test split.
for _banner, _loader in (
    ("Train Evaluation: ", train_dataloader),
    ("Test Evaluation: ", test_dataloader),
):
    print(_banner)
    test(model, _loader, criterion, vocab_fr)