<a href="https://colab.research.google.com/github/SowjanyaKiran/Encoder_Decoder_Architecture/blob/main/Encoder_Decoder_Architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Please connect to a v5e-1 TPU runtime before running this notebook.

In [None]:
#Installing required libraries
!pip install tensorflow tensorflow_datasets tfds



In [None]:
#Importing important libraries
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input , LSTM , Embedding , Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow_datasets as tfds
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the TED pt->en translation corpus from TensorFlow Datasets.
# `with_info=True` makes the call return the dataset metadata as a second value;
# `as_supervised=True` requests 2-tuples per example
# (presumably (portuguese, english) -- confirm against the TFDS catalog).
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
train_examples = examples['train']
val_examples = examples['validation']

In [None]:
# Cap on the number of sentence pairs read from the dataset; kept small so the
# demonstration stays quick.
MAX_SAMPLES = 1_000

In [None]:
#Function for preprocessing the data
def preprocess(data , num_samples = MAX_SAMPLES):
  """Collect decoded (source, target) sentence pairs from a dataset.

  Args:
    data: Object exposing ``take(n)`` that yields ``(src, tgt)`` pairs; each
      item must provide ``.numpy()`` returning UTF-8 encoded bytes.
    num_samples: Maximum number of pairs to read from ``data``.

  Returns:
    A ``(src_texts, tgt_texts)`` tuple of equal-length lists of ``str``.
    Source sentences are returned as decoded; every target sentence is
    wrapped as ``'<start> ... <end>'``.
  """
  src_texts = []
  tgt_texts = []

  for src , tgt in data.take(num_samples):
    src_texts.append(src.numpy().decode('utf-8'))
    # The boundary markers have to be attached to the decoded string that is
    # actually stored; wrapping the loop variable after appending has no
    # effect on the returned list.
    tgt_text = tgt.numpy().decode('utf-8')
    tgt_texts.append('<start> ' + tgt_text + ' <end>')

  return src_texts , tgt_texts

In [None]:
# Build the parallel source/target sentence lists from the training split,
# using preprocess()'s default sample cap.
input_texts, target_texts = preprocess(train_examples)

In [None]:
#Tokenization
def tokenize(sentences , max_len = 20):
  """Fit a tokenizer on ``sentences`` and return fixed-length id sequences.

  Args:
    sentences: List of sentences (``str``) used both to fit the vocabulary
      and to be converted to integer ids.
    max_len: Length every sequence is padded or truncated to. Defaults to 20,
      the value this notebook has always used.

  Returns:
    A ``(tensor, tokenizer)`` tuple: ``tensor`` is the array produced by
    ``pad_sequences`` (one row per sentence, ``max_len`` columns) and
    ``tokenizer`` is the fitted ``Tokenizer``.
  """
  # filters = '' disables the Tokenizer's character filtering, so punctuation
  # and markers such as '<start>' stay part of the tokens.
  tokenizer = Tokenizer(filters = '')
  tokenizer.fit_on_texts(sentences)
  sequences = tokenizer.texts_to_sequences(sentences)
  # Both padding and truncation happen at the end of each sequence, so
  # sentences longer than max_len lose their trailing tokens.
  tensor = pad_sequences(sequences , padding = 'post' , maxlen = max_len , truncating = 'post')
  return tensor , tokenizer

In [None]:
# Tokenize each language independently: every side gets its own fitted
# tokenizer and its own padded id tensor.
input_tensor, input_tokenizer = tokenize(input_texts)
target_tensor, target_tokenizer = tokenize(target_texts)

In [None]:
# Vocabulary sizes: number of entries in each tokenizer's word_index plus one
# extra slot (presumably for the padding id 0, which the Keras Tokenizer never
# assigns to a word -- confirm against the Tokenizer docs).
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

In [None]:
# Padded sequence lengths: the size of axis 1 of each id tensor.
max_input_len = input_tensor.shape[1]
max_target_len = target_tensor.shape[1]

In [None]:
# Hyperparameters shared by the encoder and decoder:
#   embedding_dim -- output size of each Embedding layer
#   units         -- number of units in each LSTM layer
embedding_dim, units = 256, 1024

In [None]:
#Encoder
# Variable-length sequences of source token ids.
encoder_inputs = Input(shape = (None,))
# mask_zero = True treats id 0 as padding. The inputs are padded at the end
# (padding = 'post'), so without a mask the LSTM would keep stepping over the
# padding and the final state would no longer summarise the last real token.
enc_emb = Embedding(input_vocab_size , embedding_dim , mask_zero = True , trainable = True)(encoder_inputs)
# return_state = True exposes the final hidden and cell states; the per-step
# output (encoder_outputs) is kept only as a name and not used further here.
encoder_lstm = LSTM(units , return_state = True)
encoder_outputs , state_h , state_c = encoder_lstm(enc_emb)
# The final states are what gets handed to the decoder as its initial state.
encoder_states = [state_h , state_c]

In [None]:
#Decoder
# Variable-length sequences of target token ids (teacher-forcing input).
decoder_inputs = Input(shape = (None,))
# mask_zero = True treats id 0 as padding so the padded tail of each target
# sequence is skipped by the LSTM; per the Keras masking guide the mask is also
# propagated to the output, which should keep padded positions out of the loss.
dec_emb = Embedding(target_vocab_size , embedding_dim , mask_zero = True)(decoder_inputs)
# return_sequences = True yields one output per time step; the returned states
# are discarded during training.
decoder_lstm = LSTM(units , return_sequences = True , return_state = True)
# The decoder starts from the encoder's final hidden/cell states.
decoder_outputs , _ , _ = decoder_lstm(dec_emb , initial_state = encoder_states)
# Per-step probability distribution over the target vocabulary.
decoder_dense = Dense(target_vocab_size , activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Assemble the training model: it takes the source ids and the decoder input
# ids, and produces the decoder's per-step vocabulary distribution.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (targets are integer ids, not one-hot vectors), and accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Teacher forcing: the decoder is fed every target position except the last
# and is trained to predict the same sequence shifted one step to the left.
target_input = target_tensor[:, :-1]
# The trailing np.newaxis appends a length-1 axis to the shifted targets
# (same result as np.expand_dims(..., axis=-1)).
target_output = target_tensor[:, 1:][..., np.newaxis]

In [None]:
# Train for 3 epochs in mini-batches of 32. The call is left as the cell's
# last expression so the returned History object is shown as the cell output.
model.fit(
    x=[input_tensor, target_input],
    y=target_output,
    batch_size=32,
    epochs=3,
)

Epoch 1/3
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 4s/step - accuracy: 0.2970 - loss: 5.9672
Epoch 2/3
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 4s/step - accuracy: 0.3492 - loss: 4.2918
Epoch 3/3
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 4s/step - accuracy: 0.3644 - loss: 4.0581


<keras.src.callbacks.history.History at 0x796114dc67b0>

In [None]:
# Print the summary of the assembled encoder-decoder model.
model.summary()