In [3]:
!pip install keras-preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


In [4]:
!pip install -U tensorflow-text==2.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text==2.11.0
  Downloading tensorflow_text-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.11.0


In [5]:
import numpy as np
import pathlib
from keras.utils import to_categorical
import pickle
import tensorflow as tf
import tensorflow_text as tf_text
from keras.models import Model, Sequential
from keras.layers import Input, LSTM, Dense, Embedding, SimpleRNN, RepeatVector, TimeDistributed

In [6]:
tf.__version__

'2.11.0'

In [7]:
import pathlib

path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [8]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  context = np.array([context for target, context in pairs])
  target = np.array([target for target, context in pairs])

  return target, context

data = load_data(path_to_file)

In [9]:
inputs = np.array(data[0][:10000])
outputs = np.array(data[1][:10000])

In [10]:
# vectorizer = tf.keras.layers.TextVectorization(standardize="lower_and_strip_punctuation", output_sequence_length=src)
# text_dataset = tf.data.Dataset.from_tensor_slices(inputs)
# vectorizer.adapt(text_dataset)

In [11]:
# # dump(vectorizer,open('drive/MyDrive/Machine Translation/vectroizer.pkl','wb'))
# import pickle
# pickle.dump({'config': vectorizer.get_config(),
#              'weights': vectorizer.get_weights()}
#             , open("drive/MyDrive/Machine Translation/vectorizer.pkl", "wb"))

In [12]:

# saved = pickle.load(open('drive/MyDrive/Machine Translation/vectorizer.pkl','rb'))
# vectorizer = tf.keras.layers.TextVectorization.from_config(saved['config'])

# vectorizer.adapt(tf.data.Dataset.from_tensor_slices(['random']))
# vectorizer.set_weights(saved['weights'])

In [13]:
# spanish_vectorizer = tf.keras.layers.TextVectorization(standardize="lower_and_strip_punctuation", output_sequence_length=tar)
# text_dataset = tf.data.Dataset.from_tensor_slices(outputs)
# spanish_vectorizer.adapt(text_dataset)

In [14]:
# import pickle
# pickle.dump({'config': spanish_vectorizer.get_config(),
#              'weights': spanish_vectorizer.get_weights()}
#             , open("drive/MyDrive/Machine Translation/spanish_vectorizer.pkl", "wb"))

In [15]:
# saved = pickle.load(open('drive/MyDrive/Machine Translation/spanish_vectorizer.pkl','rb'))
# spanish_vectorizer = tf.keras.layers.TextVectorization.from_config(saved['config'])

# spanish_vectorizer.adapt(tf.data.Dataset.from_tensor_slices(['random']))
# spanish_vectorizer.set_weights(saved['weights'])

In [16]:
size = inputs.shape[0]
indices = np.random.randint(0, size, size=(int(size*0.8),))

In [17]:
# train_X = vectorizer(inputs[indices])
# train_Y = spanish_vectorizer(outputs[indices])
# # model.fit(train_X, train_Y)

In [18]:
eng_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000,lower=True)
spa_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000,lower=True)

In [19]:
eng_tokenizer.fit_on_texts(inputs)
spa_tokenizer.fit_on_texts(outputs)

In [20]:
trainX = eng_tokenizer.texts_to_sequences(inputs[indices])
trainX = tf.keras.preprocessing.sequence.pad_sequences(trainX, maxlen=8, padding='post')

In [21]:
trainY = spa_tokenizer.texts_to_sequences(outputs[indices])
trainY = tf.keras.preprocessing.sequence.pad_sequences(trainY, maxlen=8, padding='post')

In [22]:
def encode_output(sequences, vocab_size):
 ylist = list()
 for sequence in sequences:
  encoded = tf.keras.utils.to_categorical(sequence, num_classes=vocab_size)
  ylist.append(encoded)
 y = np.array(ylist)
 y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
 return y

# spa_vocab = len(spanish_vectorizer.get_vocabulary())
trainY = encode_output(trainY[:9000], len(spa_tokenizer.word_index)+1)

In [24]:
tar_vocab = len(spa_tokenizer.word_index)+1
src_vocab = len(eng_tokenizer.word_index)+1
src = max(len(line.split()) for line in inputs)
tar = max(len(line.split()) for line in outputs)

In [25]:
layer1 = Embedding(1000, 16, input_length=8)
layer2 = LSTM(256)
model = Sequential()
model.add(layer1)
model.add(layer2)

# we want to replicate the context vector for each time step
model.add(RepeatVector(8))
model.add(LSTM(256, return_sequences=True))

# converting decoder output to our desired sequence format
model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 16)             16000     
                                                                 
 lstm (LSTM)                 (None, 256)               279552    
                                                                 
 repeat_vector (RepeatVector  (None, 8, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 8, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 8, 4961)          1274977   
 ibuted)                                                         
                                                                 
Total params: 2,095,841
Trainable params: 2,095,841
Non-

In [33]:
trainY.shape

(8000, 8, 4961)

In [27]:
model.fit(trainX[:9000], trainY, epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f2db8726b20>

In [31]:
prediction = model.predict(trainX[15:16])



In [32]:
import numpy as np
[np.argmax(vector) for vector in prediction[0]]

[14, 112, 0, 0, 0, 0, 0, 0]