<a href="https://colab.research.google.com/github/SaiSuvamPatnaik/Pytorch_Seq2Seq-Model/blob/main/Pytorch_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
# One seed shared by every random number generator this notebook touches.
SEED = 1234

# PyTorch: seed the default generator, then the CUDA generator.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# NumPy's global generator and Python's built-in `random` module.
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Shell out to spaCy's download command for the two packages that the
# spacy.load() calls in the next cell refer to by name:
# `en_core_web_sm` (English) and `de_core_news_sm` (German).
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [3]:
# spaCy pipelines whose `.tokenizer` is used by tokenize_en / tokenize_de below.
spacy_en = spacy.load('en_core_web_sm')    # English
spacy_de = spacy.load('de_core_news_sm')   # German

In [4]:
def tokenize_de(text):
    """Tokenize a German string and return the token strings in reverse order.

    Args:
        text: The German sentence to split, as a single string.

    Returns:
        A list with the `.text` of every token produced by
        `spacy_de.tokenizer(text)`, last token first.
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    # Flip the sequence so the final token of the sentence comes first.
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """Tokenize an English string into a list of token strings.

    Args:
        text: The English sentence to split, as a single string.

    Returns:
        A list with the `.text` of every token produced by
        `spacy_en.tokenizer(text)`, in their original order.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [5]:
# Source field (German): tokenized by tokenize_de (which also reverses the
# tokens), lowercased, with '<sos>' / '<eos>' as the init and end-of-sequence tokens.
SRC = Field(
    tokenize=tokenize_de,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# Target field (English): same settings, but tokenized by tokenize_en
# (token order is kept).
TRG = Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

In [6]:
# Load the Multi30k train / validation / test splits.
#   exts   -> which languages act as source and target (source goes first)
#   fields -> which Field handles the source and the target, in the same order
train_data, valid_data, test_data = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.de', '.en'),
)

downloading training.tar.gz


100%|██████████| 1.21M/1.21M [00:04<00:00, 300kB/s]


downloading validation.tar.gz


100%|██████████| 46.3k/46.3k [00:00<00:00, 92.5kB/s]


downloading mmt_task1_test2016.tar.gz


100%|██████████| 66.2k/66.2k [00:00<00:00, 88.2kB/s]


In [7]:
# Report how many examples each split holds, one line per split.
print(
    f"Number of training examples: {len(train_data.examples)}",
    f"Number of validation examples: {len(valid_data.examples)}",
    f"Number of testing examples: {len(test_data.examples)}",
    sep="\n",
)

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [8]:
# Print the attribute dict of the first *training* example (index 0 of
# train_data), i.e. its tokenized `src` and `trg` lists as shown in the output.
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [9]:
# Build both vocabularies from the training split only.
#   min_freq=2     -> a token must appear at least twice to get its own entry;
#                     tokens seen only once are converted to <unk> (unknown).
#   max_size=10000 -> upper bound on the vocabulary size.
SRC.build_vocab(
    train_data,
    min_freq=2,
    max_size=10000,
)
TRG.build_vocab(
    train_data,
    min_freq=2,
    max_size=10000,
)

In [10]:
# Report the size of each vocabulary built above, one line per language.
print(
    f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}",
    f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}",
    sep="\n",
)

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
# Bare expression: the notebook echoes which device was selected above.
device

device(type='cuda')

In [13]:
# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 128

# One BucketIterator per split, returned in the same order as the datasets
# tuple. `device` is forwarded so batches target the device selected earlier.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)