In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install Keras-Preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [None]:
import tensorflow as tf
tf.config.run_functions_eagerly(True)
import pathlib
import numpy as np
import re
import gensim
import typing
from typing import Any, Tuple
import os
import pickle

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


from keras_preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

In [None]:
# Fetch the Spanish-English archive (cached by Keras after the first download)
# and have Keras unpack it next to the downloaded zip.
path_to_zip = tf.keras.utils.get_file(
    fname='spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)

# The sentence pairs live in spa-eng/spa.txt beside the archive.
path_to_file = pathlib.Path(path_to_zip).parent / 'spa-eng' / 'spa.txt'

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [None]:
print(path_to_file)

/root/.keras/datasets/spa-eng/spa.txt


In [None]:
def load_data(path):
  """Read a tab-separated file of sentence pairs.

  Args:
    path: object with a ``read_text(encoding=...)`` method (e.g. a
      ``pathlib.Path``) pointing at a UTF-8 file with exactly two
      tab-separated fields per line.

  Returns:
    A ``(target, context, lines)`` tuple: the first field of every line, the
    second field of every line, and the raw unsplit lines.

  Raises:
    ValueError: if a line does not split into exactly two fields.
  """
  lines = path.read_text(encoding='utf-8').splitlines()

  target, context = [], []
  for line in lines:
    first_field, second_field = line.split('\t')
    target.append(first_field)
    context.append(second_field)

  return target, context, lines

In [None]:
target_raw, context_raw,lines = load_data(path_to_file)


In [None]:
np.shape(context_raw)

(118964,)

In [None]:
# Normalise and tokenise both sides of the corpus in place.
def _normalize_sentence(sentence):
  """Lowercase, strip '.', ',' and digit runs, split on whitespace, add markers.

  Args:
    sentence: a raw sentence string, or an already-tokenised list.

  Returns:
    ``['<sos>', *words, '<eos>']`` for a string input. Other punctuation
    (e.g. '?') is left attached to its word. A non-string input is returned
    unchanged, so re-running this cell does not crash on (or double-wrap)
    sentences that were already processed.
  """
  if not isinstance(sentence, str):
    return sentence
  cleaned = sentence.lower().replace(".", "").replace(",", "")
  cleaned = re.sub(r'\d+', '', cleaned)
  return ['<sos>', *cleaned.split(), '<eos>']


# The same normalisation is applied to the English (target) and Spanish
# (context) sentence at each position; the lists are updated element by element.
for i in range(len(target_raw)):
  target_raw[i] = _normalize_sentence(target_raw[i])
  context_raw[i] = _normalize_sentence(context_raw[i])

In [None]:
print(target_raw[54325])

['<sos>', 'they', 'died', 'one', 'after', 'another', '<eos>']


In [None]:
# Load the contraction lookup table from Drive; it is queried with
# `.get(token)` further down to replace individual English tokens.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
with open("/content/drive/MyDrive/contraction_expansion.txt", 'rb') as fp:
    contractions= pickle.load(fp)

In [None]:
print(target_raw[0])
len(target_raw)

['<sos>', 'go', '<eos>']


118964

In [None]:
# Expand contractions: every English token that has a (non-None) entry in
# `contractions` is overwritten in place with that entry.
for tokens in target_raw:
  for position, token in enumerate(tokens):
    expansion = contractions.get(token)
    if expansion is not None:
      tokens[position] = expansion



In [None]:
len(target_raw)

118964

In [None]:
print(lines[1])
print(context_raw[8688])

Go.	Vete.
['<sos>', 'mary', 'está', 'radiante', '<eos>']


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Exploratory tokenizer fitted on the English (target) token lists only.
# A later cell builds `eng_tokenizer` / `spanish_tokenizer` through
# `tokenize_sent`, which is what the rest of the notebook uses.
tokenizer = Tokenizer()

# Build the word -> integer index from the English token lists
tokenizer.fit_on_texts(target_raw)

# Encode the English token lists as integer sequences
# (the Spanish encoding below is intentionally left disabled here)
#context_seq = tokenizer.texts_to_sequences(context_raw)
target_seq = tokenizer.texts_to_sequences(target_raw)



In [None]:
target_seq[0]

[1, 42, 2]

In [None]:
def tokenize_sent(text):
  """Fit a fresh ``Tokenizer`` on ``text`` and encode ``text`` with it.

  Args:
    text: the texts to fit on; passed unchanged to ``fit_on_texts`` and
      ``texts_to_sequences``.

  Returns:
    A ``(tokenizer, encoded)`` tuple: the fitted tokenizer and the result of
    ``texts_to_sequences(text)``.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(text)
  encoded = fitted.texts_to_sequences(text)
  return fitted, encoded


In [None]:
# Tokenize English (target) and Spanish (context) sentences, one tokenizer per language
eng_tokenizer, eng_encoded= tokenize_sent(text= target_raw)
spanish_tokenizer, spanish_encoded= tokenize_sent(text= context_raw)

In [None]:

# English index --> word dictionary
eng_index_word = eng_tokenizer.index_word

# English word --> index dictionary
eng_word_index= eng_tokenizer.word_index

# Size of the English vocabulary (+1 so that id 0, used for padding, has a slot).
# English is the target side (y) of the split below.
ENG_VOCAB_SIZE = len(eng_tokenizer.word_counts)+1

# Spanish word --> index dictionary
spanish_word_index= spanish_tokenizer.word_index

# Spanish index --> word dictionary
spanish_index_word = spanish_tokenizer.index_word

# Size of the Spanish vocabulary (+1 for the padding id).
# Spanish is the source side (X) of the split below.
SPA_VOCAB_SIZE=len(spanish_tokenizer.word_counts)+1

# Longest encoded sentence per language (0 if the corpus is empty).
max_eng_len = max((len(seq) for seq in eng_encoded), default=0)

# Bug fix: the Spanish maximum used to be driven by the *English* lengths
# (`len(eng_encoded[i]) > max_spa_len`), so it was not guaranteed to be the
# true maximum and longer Spanish sentences were truncated by pad_sequences.
# Anything that needs the padded Spanish length should read `max_spa_len`
# rather than assume a literal value.
max_spa_len = max((len(seq) for seq in spanish_encoded), default=0)


# Pad both sides with trailing zeros up to the per-language maximum
eng_padded = pad_sequences(eng_encoded, maxlen=max_eng_len, padding='post')
spa_padded = pad_sequences(spanish_encoded, maxlen=max_spa_len, padding='post')

# Convert to array
eng_padded= np.array(eng_padded)
spa_padded= np.array(spa_padded)

# Split data into train and test set (X = Spanish source, y = English target);
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(spa_padded, eng_padded, test_size=0.1, random_state=0)


In [None]:
X_train

array([[   1,   23, 2480, ...,    0,    0,    0],
       [   1,   24,   42, ...,    0,    0,    0],
       [   1,   13, 1407, ...,    0,    0,    0],
       ...,
       [   1, 4683,    9, ...,    0,    0,    0],
       [   1,  881,  613, ...,    0,    0,    0],
       [   1,   15,    6, ...,    0,    0,    0]], dtype=int32)

In [None]:
path="/content/drive/MyDrive/GoogleNews-vectors-negative300.bin"

In [None]:
word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
EMBEDDING_SIZE = 300
# One row per tokenizer id, plus one extra row for id 0.
VOCABULARY_SIZE = len(eng_tokenizer.word_index) + 1

# Start from an all-zero matrix; rows are filled from the pretrained vectors.
english_embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

# word -> id mapping of the English tokenizer
word2id = eng_tokenizer.word_index
for word, index in word2id.items():
  try:
    vector = word2vec[word]
  except KeyError:
    # Word has no pretrained vector: its row stays all zeros.
    continue
  english_embedding_weights[index, :] = vector

# check embedding dimension
print("Embeddings shape: {}".format(english_embedding_weights.shape))

Embeddings shape: (16546, 300)


In [None]:
VOCABULARY_SIZE_ENGLISH = len(eng_tokenizer.word_index) + 1

In [None]:
# Load the pretrained vectors used below to fill the Spanish embedding matrix.
# NOTE(review): despite the variable name `glove`, this goes through
# load_word2vec_format and the file name suggests SBW word2vec vectors -- confirm.
glove = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/SBW-vectors-300-min5.txt')

In [None]:
EMBEDDING_SIZE = 300
# One row per tokenizer id, plus one extra row for id 0.
VOCABULARY_SIZE = len(spanish_tokenizer.word_index) + 1

# Start from an all-zero matrix; rows are filled from the pretrained vectors.
spanish_embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

# word -> id mapping of the Spanish tokenizer
word2id = spanish_tokenizer.word_index
for word, index in word2id.items():
  try:
    vector = glove[word]
  except KeyError:
    # Word has no pretrained vector: its row stays all zeros.
    continue
  spanish_embedding_weights[index, :] = vector

# check embedding dimension
print("Embeddings shape: {}".format(spanish_embedding_weights.shape))

Embeddings shape: (31034, 300)


In [None]:
VOCABULARY_SIZE_SPANISH = len(spanish_tokenizer.word_index) + 1

In [None]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Concatenate
from tensorflow.keras import Input, Model

# Encoder input: one padded Spanish id sequence of length max_spa_len per example
encoder_inputs = Input(shape=(max_spa_len,))

print(encoder_inputs.shape)

# Embedding layer, initialised from the pretrained Spanish embedding matrix
enc_emb = Embedding(VOCABULARY_SIZE_SPANISH, EMBEDDING_SIZE, weights=[spanish_embedding_weights])(encoder_inputs)

print(enc_emb.shape)

# Bidirectional LSTM (256 units per direction) that returns the per-timestep
# outputs as well as the final h/c state of each direction
enc_lstm1 = Bidirectional(LSTM(256,return_sequences=True,return_state=True))


# Unpack: sequence outputs, then forward (h, c), then backward (h, c)
encoder_outputs1, forw_state_h, forw_state_c, back_state_h, back_state_c = enc_lstm1(enc_emb)


# Debug prints of the symbolic tensors (shapes are visible in the cell output)
print(forw_state_h)
print(forw_state_c)
print(encoder_outputs1)

# Concatenate the forward and backward states: 256 + 256 -> 512 features each
final_enc_h = Concatenate()([forw_state_h,back_state_h])
print(np.shape(final_enc_h))
final_enc_c = Concatenate()([forw_state_c,back_state_c])

# Context vector: the [h, c] pair later passed to the decoder LSTM as its initial state
encoder_states =[final_enc_h, final_enc_c]

print(encoder_states)

(None, 51)
(None, 51, 300)
KerasTensor(type_spec=TensorSpec(shape=(None, 256), dtype=tf.float32, name=None), name='bidirectional/forward_lstm/PartitionedCall:2', description="created by layer 'bidirectional'")
KerasTensor(type_spec=TensorSpec(shape=(None, 256), dtype=tf.float32, name=None), name='bidirectional/forward_lstm/PartitionedCall:3', description="created by layer 'bidirectional'")
KerasTensor(type_spec=TensorSpec(shape=(None, 51, 512), dtype=tf.float32, name=None), name='bidirectional/concat:0', description="created by layer 'bidirectional'")
(None, 512)
[<KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'concatenate')>, <KerasTensor: shape=(None, 512) dtype=float32 (created by layer 'concatenate_1')>]


In [None]:
print(enc_emb.shape)

(None, 51, 300)


In [None]:
max_spa_len

51

In [None]:
#call attention using:
from tensorflow.keras.layers import Attention




In [None]:
# Decoder input: English id sequences of variable length (time dimension left as None)
decoder_inputs = Input(shape=(None,))

# Decoder embedding (300-d, same width as the encoder embedding), initialised
# from the pretrained English embedding matrix
dec_emb_layer = Embedding(VOCABULARY_SIZE_ENGLISH, 300,weights=[english_embedding_weights])
dec_emb = dec_emb_layer(decoder_inputs)   # keep the layer object separately: the inference decoder reuses it

# The encoder is bidirectional (2 x 256 units), so its concatenated states are
# 512 wide; the single decoder LSTM therefore needs 512 units to accept them.
# The decoder LSTM starts from the encoder's final states.
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
# Only the per-timestep outputs are needed for training; the returned states are discarded
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

print(encoder_outputs1.shape)
print(decoder_outputs.shape)



(None, 51, 512)
(None, None, 512)


In [None]:
print(decoder_outputs.shape)

(None, None, 16546)


In [None]:
# Attention layer: the decoder LSTM outputs are the query and the encoder
# outputs are the value, so they are passed in that order.
# `attention_layer` now names the instance that is actually wired into the
# model (it used to point at a second, unused `Attention()` while an anonymous
# `Attention(use_scale=True)` did the work), so the trained layer stays reachable.
attention_layer = Attention(use_scale=True)
attention_result = attention_layer([decoder_outputs, encoder_outputs1])

# Concat attention output and decoder LSTM output along the feature axis
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_result])

# Dense layer with softmax over the English vocabulary
decoder_dense = Dense(VOCABULARY_SIZE_ENGLISH, activation='softmax')
# NOTE: this rebinds `decoder_outputs` from the LSTM outputs to the softmax
# outputs. Re-run the decoder cell before re-running this cell, otherwise the
# attention above would be fed the softmax outputs instead of the LSTM outputs.
decoder_outputs = decoder_dense(decoder_concat_input)


# Define the training model: (Spanish ids, shifted English ids) -> next-word distribution
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
attention_result.shape

TensorShape([None, None, 512])

In [None]:
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 51)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 51, 300)      9310200     ['input_1[0][0]']                
                                                                                                  
 input_24 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 51, 512),    1140736     ['embedding[0][0]']              
                                 (None, 256),                                              

In [None]:
# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training
encoder_input_data = X_train
# Decoder input: every position except the last one, so that it has the same
# length as the shifted target below
decoder_input_data = y_train[:,:-1]
# Decoder target: the same sequences shifted one step ahead (position t of the
# input is trained to predict position t+1)
decoder_target_data =  y_train[:,1:]

# Testing: same input/target slicing applied to the held-out split
encoder_input_test = X_test
decoder_input_test = y_test[:,:-1]
decoder_target_test=  y_test[:,1:]

In [None]:
EPOCHS= 1

In [None]:
history = model.fit([encoder_input_data, decoder_input_data],decoder_target_data,
                    epochs=EPOCHS,
                    batch_size=128,
                    validation_data = ([encoder_input_test, decoder_input_test],decoder_target_test),
                    )



In [None]:
model.save_weights("/content/drive/MyDrive/NLP/achaModel.h5")

In [None]:
model.load_weights("/content/drive/MyDrive/NLP/achaModel.h5")

In [None]:
# Inference encoder: Spanish ids -> (per-timestep encoder outputs, final h, final c)
encoder_model = Model(encoder_inputs, outputs = [encoder_outputs1, final_enc_h, final_enc_c])

# Inference decoder inputs: the previous LSTM states and the encoder outputs
decoder_state_h = Input(shape=(512,))
decoder_state_c = Input(shape=(512,))
decoder_hidden_state_input = Input(shape=(max_spa_len,512))

dec_states = [decoder_state_h, decoder_state_c]

# Reuse the trained embedding and LSTM layers
dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states)

print(decoder_outputs2.shape)
print(decoder_hidden_state_input.shape)

# Attention inference.
# Bug fix: a brand-new `Attention(use_scale=True)` was created here, so the
# `scale` weight learned during training was never used at inference time.
# Look up the attention layer that is part of the trained `model` and share it,
# exactly like the embedding, LSTM and dense layers are shared.
trained_attention_layers = [layer for layer in model.layers if isinstance(layer, Attention)]
if len(trained_attention_layers) != 1:
    raise RuntimeError(
        "Expected exactly one Attention layer in `model`, found {}".format(len(trained_attention_layers)))
attention_result_inf = trained_attention_layers[0]([decoder_outputs2, decoder_hidden_state_input])

print(attention_result_inf.shape)

decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])

# New LSTM states, returned so the caller can feed them into the next step
dec_states2= [state_h2, state_c2]

# Reuse the trained softmax layer
decoder_outputs2 = decoder_dense(decoder_concat_input_inf)

# Inference decoder: (previous token, encoder outputs, h, c) -> (word distribution, new h, new c)
decoder_model= Model(
                    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_h, decoder_state_c],
                     [decoder_outputs2]+ dec_states2)

(None, None, 512)
(None, 51, 512)
(None, None, 512)


In [None]:
encoder_outputs1.shape
decoder_hidden_state_input.shape
attention_result_inf.shape

TensorShape([None, None, 512])

In [None]:


decoder_model.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 300)    4963800     ['input_2[0][0]']                
                                                                                                  
 input_18 (InputLayer)          [(None, 512)]        0           []                               
                                                                                                  
 input_19 (InputLayer)          [(None, 512)]        0           []                               
                                                                                           

In [None]:
def get_predicted_sentence(input_seq, max_len=10):
    """Greedily decode an English sentence for one encoded Spanish sentence.

    Args:
        input_seq: input passed to ``encoder_model.predict``. The decoding
            loop works on a single example (batch of size 1).
        max_len: stop once the decoded text contains at least this many
            whitespace-separated words. Defaults to 10, the limit that was
            previously hard-coded.

    Returns:
        The decoded words, each preceded by one space (a non-empty result
        therefore starts with a space). The ``'<eos>'`` marker is never
        included. Decoding also stops if the model predicts id 0.
    """
    # Encode the input: per-timestep encoder outputs plus the initial decoder states.
    # verbose=0 keeps Keras from printing a progress bar for every call.
    enc_output, state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, primed with the start-of-sentence token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = eng_word_index['<sos>']

    decoded_sentence = ''

    while len(decoded_sentence.split()) < max_len:
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq, enc_output, state_h, state_c], verbose=0)

        # Greedy choice: most probable word id at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Id 0 has no entry in the index -> word dictionary; stop decoding.
        if sampled_token_index == 0:
            break

        # Convert the chosen id to its English word.
        sampled_word = eng_index_word[sampled_token_index]
        if sampled_word == '<eos>':
            break

        decoded_sentence += ' ' + sampled_word

        # Feed the word just produced back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence

In [None]:
eng_word_index['<eos>']

2

In [None]:
def get_spanish_sentence(input_sequence):
    """Turn a padded id sequence into text using the English vocabulary.

    NOTE(review): despite its name, this helper looks ids up in
    ``eng_index_word``; the notebook calls it on English sequences. The name is
    kept because callers rely on it.

    Args:
        input_sequence: iterable of word ids; ids equal to 0 are skipped.

    Returns:
        The words joined into one string, each followed by a single space.
    """
    return ''.join(
        eng_index_word[word_id] + ' '
        for word_id in input_sequence
        if word_id != 0
    )

def get_english_sentence(input_sequence):
    """Turn a padded id sequence into text using the Spanish vocabulary.

    NOTE(review): despite its name, this helper looks ids up in
    ``spanish_index_word``; the notebook calls it on Spanish sequences. The
    name is kept because callers rely on it.

    Args:
        input_sequence: iterable of word ids; ids equal to 0 are skipped.

    Returns:
        The words joined into one string, each followed by a single space.
    """
    return ''.join(
        spanish_index_word[word_id] + ' '
        for word_id in input_sequence
        if word_id != 0
    )

In [None]:
eng_word_index['<eos>']

2

In [None]:
i=100
# NOTE: the helper names are swapped relative to the vocabulary they use --
# get_english_sentence reads the Spanish index and get_spanish_sentence the
# English one -- so the labels below are the ones that match the data.
print("Spanish Sentence:",get_english_sentence(X_test[i]))
print("Actual English Sentence:",get_spanish_sentence(y_test[i]))
# The encoder expects a batch dimension; -1 lets NumPy infer the padded length
# instead of hard-coding 51, so this keeps working if the padding length changes.
print("Predicted English Translation:",get_predicted_sentence(X_test[i].reshape(1,-1)))
print("----------------------------------------------------------------------------------------")

Spanish Sentence: <sos> ¿cómo se puede parar esto? <eos> 
Actual English Sentence: <sos> how can this be stopped? <eos> 




Predicted English Translation:  how can stop this?
----------------------------------------------------------------------------------------


In [None]:
spanish_index_word[14291]

'ovni?'

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0


In [None]:
from datasets import load_metric
bleu=load_metric("bleu")

  bleu=load_metric("bleu")


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [None]:
("<sos> you guys are wrong <eos>").split(" ")

['<sos>', 'you', 'guys', 'are', 'wrong', '<eos>']

In [None]:
actual=[["<sos>", "how", "can", "this", "be","stopped","?", "<eos>"]]

In [None]:
prediction=["how","can","stop","this"]

In [None]:
# Earlier scratch example, kept for reference:
# prediction=[("you are wrong").split(" ")]
# actual=[("<sos> you guys are wrong <eos>").split(" ")]
# One prediction scored against its list of references (here a single reference).
# NOTE(review): `actual` still contains the '<sos>'/'<eos>' markers while
# `prediction` does not, which is why the output reports reference_length 8
# against translation_length 4 -- consider stripping the markers from the
# reference before scoring.
bleu.compute(predictions=[prediction],references=[actual],max_order=4,smooth=True)

{'bleu': 0.18693159143202892,
 'precisions': [0.8, 0.5, 0.3333333333333333, 0.5],
 'brevity_penalty': 0.36787944117144233,
 'length_ratio': 0.5,
 'translation_length': 4,
 'reference_length': 8}