To-Do
- [ ] Create Separate Functions for each task
- [ ] Evaluation of our model
- [ ] How to generalize the model?
- [ ] More about preprocessing
- [ ] Improve Handling of rare words

In [None]:
!pip install keras-preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


In [None]:
!pip install -U tensorflow-text==2.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text==2.11.0
  Downloading tensorflow_text-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.11.0


In [None]:
import numpy as np
import pathlib
from keras.utils import to_categorical
import pickle
import tensorflow as tf
import tensorflow_text as tf_text
from keras.models import Model, Sequential
from keras.layers import Input, LSTM, Dense, Embedding, SimpleRNN, RepeatVector, TimeDistributed

In [None]:
tf.__version__

'2.11.0'

## English to Spanish

In [None]:
import pathlib

path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [None]:
def load_data(path):
  """Read a tab-separated sentence-pair file into two parallel arrays.

  Args:
    path: object exposing ``read_text(encoding=...)`` (e.g. ``pathlib.Path``).
      Every line must hold exactly two tab-separated fields.

  Returns:
    A tuple ``(first_column, second_column)`` of numpy string arrays, in
    file order.
  """
  first_column, second_column = [], []
  rows = [row.split('\t') for row in path.read_text(encoding='utf-8').splitlines()]

  # Unpacking enforces the two-fields-per-line layout (ValueError otherwise).
  for left, right in rows:
    first_column.append(left)
    second_column.append(right)

  return np.array(first_column), np.array(second_column)

data = load_data(path_to_file)

In [None]:
inputs = np.array(data[0][:10000])
outputs = np.array(data[1][:10000])

In [None]:
# vectorizer = tf.keras.layers.TextVectorization(standardize="lower_and_strip_punctuation", output_sequence_length=src)
# text_dataset = tf.data.Dataset.from_tensor_slices(inputs)
# vectorizer.adapt(text_dataset)

In [None]:
# # dump(vectorizer,open('drive/MyDrive/Machine Translation/vectroizer.pkl','wb'))
# import pickle
# pickle.dump({'config': vectorizer.get_config(),
#              'weights': vectorizer.get_weights()}
#             , open("drive/MyDrive/Machine Translation/vectorizer.pkl", "wb"))

In [None]:

# saved = pickle.load(open('drive/MyDrive/Machine Translation/vectorizer.pkl','rb'))
# vectorizer = tf.keras.layers.TextVectorization.from_config(saved['config'])

# vectorizer.adapt(tf.data.Dataset.from_tensor_slices(['random']))
# vectorizer.set_weights(saved['weights'])

In [None]:
# spanish_vectorizer = tf.keras.layers.TextVectorization(standardize="lower_and_strip_punctuation", output_sequence_length=tar)
# text_dataset = tf.data.Dataset.from_tensor_slices(outputs)
# spanish_vectorizer.adapt(text_dataset)

In [None]:
# import pickle
# pickle.dump({'config': spanish_vectorizer.get_config(),
#              'weights': spanish_vectorizer.get_weights()}
#             , open("drive/MyDrive/Machine Translation/spanish_vectorizer.pkl", "wb"))

In [None]:
# saved = pickle.load(open('drive/MyDrive/Machine Translation/spanish_vectorizer.pkl','rb'))
# spanish_vectorizer = tf.keras.layers.TextVectorization.from_config(saved['config'])

# spanish_vectorizer.adapt(tf.data.Dataset.from_tensor_slices(['random']))
# spanish_vectorizer.set_weights(saved['weights'])

In [None]:
# train_X = vectorizer(inputs[indices])
# train_Y = spanish_vectorizer(outputs[indices])
# # model.fit(train_X, train_Y)

In [None]:
def create_tokenizer(source, target, num_words=5000, lower=True):
  """Fit one Keras ``Tokenizer`` per language.

  Args:
    source: iterable of source-language sentences.
    target: iterable of target-language sentences.
    num_words: passed to ``Tokenizer(num_words=...)``.
    lower: passed to ``Tokenizer(lower=...)``.

  Returns:
    ``(src_tokenizer, tar_tokenizer)``, each fitted on its own corpus.
  """
  fitted = []
  for corpus in (source, target):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, lower=lower)
    tokenizer.fit_on_texts(corpus)
    fitted.append(tokenizer)

  src_tokenizer, tar_tokenizer = fitted
  return src_tokenizer, tar_tokenizer

In [None]:
eng_tokenizer,spa_tokenizer = create_tokenizer(inputs, outputs)

In [None]:
def transform_data(source, target, src_tokenizer=None, tar_tokenizer=None, maxlen=8):
  """Convert raw sentences to fixed-length, post-padded integer sequences.

  The previous version ignored its ``source``/``target`` arguments and read
  the notebook globals ``inputs``/``outputs`` instead, so calling it with any
  other data silently returned the wrong sequences.

  Args:
    source: iterable of source-language sentences.
    target: iterable of target-language sentences.
    src_tokenizer: fitted tokenizer for ``source``; defaults to the
      notebook-level ``eng_tokenizer``.
    tar_tokenizer: fitted tokenizer for ``target``; defaults to the
      notebook-level ``spa_tokenizer``.
    maxlen: length every sequence is padded/truncated to (default 8, the
      value the models in this notebook are built for).

  Returns:
    ``(source_sequences, target_sequences)`` as 2-D integer arrays with
    ``maxlen`` columns.
  """
  # Fall back to the notebook globals so existing two-argument calls still work.
  if src_tokenizer is None:
    src_tokenizer = eng_tokenizer
  if tar_tokenizer is None:
    tar_tokenizer = spa_tokenizer

  source = src_tokenizer.texts_to_sequences(source)
  source = tf.keras.preprocessing.sequence.pad_sequences(source, maxlen=maxlen, padding='post')

  target = tar_tokenizer.texts_to_sequences(target)
  target = tf.keras.preprocessing.sequence.pad_sequences(target, maxlen=maxlen, padding='post')
  return source, target

def split_data(source, target, train_split=0.8, seed=None):
  """Randomly split aligned arrays into disjoint train and test parts.

  Fixes two defects of the previous version: the row count was taken from the
  notebook global ``inputs`` rather than from ``source``, and indices were
  drawn with ``np.random.randint`` (sampling *with* replacement), which put
  duplicate rows in the training set and left far more than
  ``1 - train_split`` of the rows in the test set.

  Args:
    source: numpy array whose first axis indexes examples.
    target: numpy array aligned with ``source`` along the first axis.
    train_split: fraction of rows assigned to the training set.
    seed: optional integer for a reproducible shuffle; when ``None`` the
      global numpy random state is used.

  Returns:
    ``(trainX, trainY, testX, testY)``. Training rows are in shuffled order;
    test rows keep their original relative order.
  """
  size = source.shape[0]
  n_train = int(size * train_split)

  if seed is None:
    order = np.random.permutation(size)
  else:
    order = np.random.default_rng(seed).permutation(size)
  indices = order[:n_train]  # unique by construction

  trainX = source[indices]
  trainY = target[indices]

  # Everything not picked for training goes to the test set.
  mask = np.ones(size, bool)
  mask[indices] = False
  testX = source[mask]
  testY = target[mask]
  return trainX, trainY, testX, testY

source, target = transform_data(inputs, outputs)
trainX, trainY, testX, testY = split_data(source, target, 0.8)

In [None]:
def encode_output(sequences, vocab_size):
  """One-hot encode a batch of integer token sequences.

  Replaces the per-sequence Python loop (and the redundant reshape that
  followed it) with a single vectorised numpy operation.

  Args:
    sequences: integer array of token ids, e.g. shape (num_samples, seq_len).
    vocab_size: number of classes; every id must be smaller than this.

  Returns:
    float32 array of shape ``sequences.shape + (vocab_size,)`` holding 1.0
    at each token id and 0.0 elsewhere.
  """
  token_ids = np.asarray(sequences, dtype=int)
  y = np.zeros(token_ids.shape + (vocab_size,), dtype=np.float32)
  # Write a 1 at position `token_id` along the last (class) axis.
  np.put_along_axis(y, token_ids[..., np.newaxis], 1.0, axis=-1)
  return y

trainY = encode_output(trainY, len(spa_tokenizer.word_index)+1)

In [None]:
tar_vocab = len(spa_tokenizer.word_index)+1
src_vocab = len(eng_tokenizer.word_index)+1
src = max(len(line.split()) for line in inputs)
tar = max(len(line.split()) for line in outputs)

In [None]:
# Encoder: embed source tokens, then compress the sentence into one vector.
# The embedding table must cover every id the tokenizer can emit; a hard-coded
# size of 1000 is smaller than the vocabulary (the tokenizer keeps up to 5000
# words), which makes lookups for ids >= 1000 go out of range.
layer1 = Embedding(src_vocab, 16, input_length=8)
layer2 = LSTM(256)
model = Sequential()
model.add(layer1)
model.add(layer2)

# we want to replicate the context vector for each time step
model.add(RepeatVector(8))
model.add(LSTM(256, return_sequences=True))

# converting decoder output to our desired sequence format
model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 16)             16000     
                                                                 
 lstm (LSTM)                 (None, 256)               279552    
                                                                 
 repeat_vector (RepeatVector  (None, 8, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 8, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 8, 4961)          1274977   
 ibuted)                                                         
                                                                 
Total params: 2,095,841
Trainable params: 2,095,841
Non-

In [None]:
model.fit(trainX, trainY, epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fc6b78e8dc0>

In [None]:
def inference(model, input, source_tokenizer, tar_token_to_word, maxlen=8):
  """Translate the first sentence of ``input`` with a trained model.

  The docstring used to promise string input while the code only accepted
  pre-tokenised arrays (and never used ``source_tokenizer``). Both forms are
  now supported.

  Args:
    model: object with a ``predict`` method returning an array of shape
      (batch, time, vocab).
    input: a source-language string, a sequence of such strings, or an
      already tokenised and padded integer array of shape (batch, maxlen).
    source_tokenizer: fitted tokenizer used only when ``input`` is text.
    tar_token_to_word: dict mapping target token id -> word.
    maxlen: padding length applied to text input (default 8).

  Returns:
    The decoded sentence for the first item of the batch; decoding stops at
    the first padding token (id 0).
  """
  # A bare string must be wrapped, otherwise the tokenizer would iterate
  # over its characters.
  if isinstance(input, str):
    input = [input]
  if len(input) > 0 and isinstance(input[0], str):
    tokenized = source_tokenizer.texts_to_sequences(input)
    input = tf.keras.preprocessing.sequence.pad_sequences(tokenized, maxlen=maxlen, padding='post')

  prediction = model.predict(input)

  # Most likely token id at every time step of the first sentence.
  output = [np.argmax(vector) for vector in prediction[0]]

  output_list = []
  for i in output:
    if i == 0:
      break
    output_list.append(tar_token_to_word[i])

  return ' '.join(output_list)

eng_word_to_token = eng_tokenizer.word_index
eng_token_to_word = {token:word for word, token in eng_word_to_token.items()}

spa_word_to_token = spa_tokenizer.word_index
spa_token_to_word = {token:word for word, token in spa_word_to_token.items()}

In [None]:
inference(model, trainX[1:3], eng_tokenizer, spa_token_to_word)



'me llamé'

## Eng to Hin (Incomplete)

In [None]:
# Read the English-Hindi pairs; the encoding is given explicitly because the
# file contains Devanagari text and the platform default is not always UTF-8.
with open('hin.txt', encoding='utf-8') as f:
    lines = f.readlines()

In [None]:
len(lines)

2909

In [None]:
def prepare_data(lines):
  """Split tab-separated lines into source and target sentence arrays.

  The first field of each line is the source sentence and the second the
  target; any further fields are ignored. Lines with fewer than two fields
  (e.g. blank lines) are skipped instead of raising ``ValueError`` as the
  strict three-way unpacking did.

  Args:
    lines: iterable of strings, each ``source<TAB>target[<TAB>extra...]``.

  Returns:
    ``(inputs, outputs)`` as numpy string arrays of equal length.
  """
  inputs = []
  outputs = []

  for line in lines:
    # Drop the line terminator so a two-field line has a clean target.
    fields = line.rstrip("\r\n").split("\t")
    if len(fields) < 2:
      continue
    inputs.append(fields[0])
    outputs.append(fields[1])

  inputs = np.array(inputs)
  outputs = np.array(outputs)
  return inputs, outputs

inputs, outputs = prepare_data(lines)

In [None]:
inputs.shape, outputs.shape

((2909,), (2909,))

In [None]:
eng_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000,lower=True)
hin_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000,lower=True)

eng_tokenizer.fit_on_texts(inputs)
hin_tokenizer.fit_on_texts(outputs)

trainX = eng_tokenizer.texts_to_sequences(inputs)
trainX = tf.keras.preprocessing.sequence.pad_sequences(trainX, maxlen=8, padding='post')

trainY = hin_tokenizer.texts_to_sequences(outputs)
trainY = tf.keras.preprocessing.sequence.pad_sequences(trainY, maxlen=8, padding='post')

def encode_output(sequences, vocab_size):
  """One-hot encode a batch of integer token sequences.

  Uses a single vectorised numpy operation instead of a per-sequence Python
  loop followed by a redundant reshape.

  Args:
    sequences: integer array of token ids, e.g. shape (num_samples, seq_len).
    vocab_size: number of classes; every id must be smaller than this.

  Returns:
    float32 array of shape ``sequences.shape + (vocab_size,)`` holding 1.0
    at each token id and 0.0 elsewhere.
  """
  token_ids = np.asarray(sequences, dtype=int)
  y = np.zeros(token_ids.shape + (vocab_size,), dtype=np.float32)
  # Write a 1 at position `token_id` along the last (class) axis.
  np.put_along_axis(y, token_ids[..., np.newaxis], 1.0, axis=-1)
  return y

trainY = encode_output(trainY[:9000], len(hin_tokenizer.word_index)+1)

In [None]:
len(hin_tokenizer.word_index)

3012

In [None]:
tar_vocab = len(hin_tokenizer.word_index)+1
src_vocab = len(eng_tokenizer.word_index)+1
src = max(len(line.split()) for line in inputs)
tar = max(len(line.split()) for line in outputs)

In [None]:
# Encoder: the embedding table is sized by the source vocabulary (src_vocab)
# and expects sequences of length 8, matching the padding used for trainX.
layer1 = Embedding(src_vocab, 16, input_length=8)
# LSTM without return_sequences: only the final output is passed on.
layer2 = LSTM(256)
model = Sequential()
model.add(layer1)
model.add(layer2)

# we want to replicate the context vector for each time step
model.add(RepeatVector(8))
model.add(LSTM(256, return_sequences=True))

# converting decoder output to our desired sequence format
# (one softmax over the tar_vocab classes per time step, to pair with the
# one-hot targets and categorical cross-entropy loss)
model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 16)             38336     
                                                                 
 lstm (LSTM)                 (None, 256)               279552    
                                                                 
 repeat_vector (RepeatVector  (None, 8, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 8, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 8, 3013)          774341    
 ibuted)                                                         
                                                                 
Total params: 1,617,541
Trainable params: 1,617,541
Non-

In [None]:
trainX.shape

(2909, 8)

In [None]:
!pip install tensorflow-addons==0.16.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons==0.16.1
  Downloading tensorflow_addons-0.16.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.16.1


In [None]:
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [None]:
# Bidirectional encoder with self-attention, built with the functional API.
# `tf.keras.layers.Attention` must be called on a [query, value] list, so it
# cannot be stacked inside a `Sequential` model (that is what raised the
# ValueError). Its first positional argument is `use_scale`, not a unit
# count, so the stray `256` is dropped as well.
bi_inputs = Input(shape=(8,))
bi_hidden = Embedding(src_vocab, 16, input_length=8)(bi_inputs)
bi_hidden = tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True))(bi_hidden)

# Self-attention: the encoder states serve as both query and value.
bi_hidden = tf.keras.layers.Attention()([bi_hidden, bi_hidden])

bi_hidden = LSTM(256, return_sequences=True)(bi_hidden)
bi_outputs = TimeDistributed(Dense(tar_vocab, activation='softmax'))(bi_hidden)

bi_model = Model(bi_inputs, bi_outputs)
bi_model.compile(optimizer='adam', loss='categorical_crossentropy')
bi_model.summary()

ValueError: ignored

In [None]:
trainY.shape

(2909, 8, 3013)

In [None]:
bi_model.fit(trainX, trainY, epochs=250, batch_size=64)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x7f6462434130>

In [None]:
predictions = bi_model.predict(trainX)



In [None]:
predictions.shape

(2909, 8, 3013)

In [None]:
[np.argmax(vector) for vector in predictions[0]]

[775, 0, 0, 0, 0, 0, 0, 0]

In [None]:
model.fit(trainX, trainY, epochs=250, batch_size=64)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x7f37283a7c40>

In [None]:
input = trainX[121:528]

In [None]:
# mapping tokens to words and vice-versa for both source and the target
eng_word_to_token = eng_tokenizer.word_index
eng_token_to_word = {token:word for word, token in eng_word_to_token.items()}

hin_word_to_token = hin_tokenizer.word_index
hin_token_to_word = {token:word for word, token in hin_word_to_token.items()}

In [None]:
def inference(model, input, source_tokenizer, tar_token_to_word, maxlen=8):
  """Translate the first sentence of ``input`` with a trained model.

  The previous version tokenised the local variable ``test`` before it was
  assigned, so every call raised ``UnboundLocalError``; the text to tokenise
  is ``input``.

  Args:
    model: object with a ``predict`` method returning an array of shape
      (batch, time, vocab).
    input: a source-language string, a sequence of such strings, or an
      already tokenised and padded integer array of shape (batch, maxlen).
    source_tokenizer: fitted tokenizer used only when ``input`` is text.
    tar_token_to_word: dict mapping target token id -> word.
    maxlen: padding length applied to text input (default 8).

  Returns:
    The decoded sentence for the first item of the batch; decoding stops at
    the first padding token (id 0).
  """
  # A bare string must be wrapped, otherwise the tokenizer would iterate
  # over its characters.
  if isinstance(input, str):
    input = [input]
  if len(input) > 0 and isinstance(input[0], str):
    tokenized = source_tokenizer.texts_to_sequences(input)
    test = tf.keras.preprocessing.sequence.pad_sequences(tokenized, maxlen=maxlen, padding='post')
  else:
    # Already tokenised and padded (e.g. a slice of trainX).
    test = input
  prediction = model.predict(test)

  # Most likely token id at every time step of the first sentence.
  output = [np.argmax(vector) for vector in prediction[0]]

  output_list = []
  for i in output:
    if i == 0:
      break
    output_list.append(tar_token_to_word[i])

  output_sentence = ' '.join(output_list)
  return output_sentence