In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import pickle as pkl
import numpy as np

In [3]:
with open('/content/drive/MyDrive/deu.txt','r') as f:
  transalte = f.read()

uncleaned_data_list = transalte.split('\n')
len(uncleaned_data_list)
uncleaned_data_list = uncleaned_data_list[:10000]
len(uncleaned_data_list)
english_word = []
german_word = []
cleaned_data_list = []
for word in uncleaned_data_list:
  english_word.append(word.split('\t')[:-1][0])
  german_word.append(word.split('\t')[:-1][1])
language_data = pd.DataFrame(columns=['English','German'])
language_data['English'] = english_word
language_data['German'] = german_word
language_data.to_csv('language_data.csv', index=False)

In [4]:
language_data.tail()

Unnamed: 0,English,German
9995,She has a bike.,Sie hat ein Rad.
9996,She has a book.,Sie hat ein Buch.
9997,She has a cold.,Sie ist erkältet.
9998,She has a cold.,Sie hat eine Erkältung.
9999,She has brains.,Sie ist ein gescheites Köpfchen.


# **Cleaning Data**

In [5]:
#lower case
language_data['English']=language_data['English'].str.lower()
language_data['German']=language_data['German'].str.lower()

#removing url
def remove_URL(text):
    import re
    text = re.sub(r'http\S+', '', str(text))
    return text
language_data['English'] = language_data['English'].apply(lambda x: remove_URL(x))
language_data['German'] = language_data['German'].apply(lambda x: remove_URL(x))

#remove punctutaions
def remove_punct(text):
    import re
    text = re.sub(r'[^\w\s]', '',str(text) )
    return text
language_data['English'] = language_data['English'].apply(lambda x: remove_punct(x))
language_data['German'] = language_data['German'].apply(lambda x: remove_punct(x))

#remove non-asci
def remove_non_ascii(text):
    import re
    text = re.sub(r'[^\x00-\x7F]', '',str(text) )
    return text
language_data['English'] = language_data['English'].apply(lambda x: remove_non_ascii(x))
language_data['German'] = language_data['German'].apply(lambda x: remove_non_ascii(x))

#remove html tags
def remove_html(text):
    import re
    text = re.sub('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '',str(text) )
    return text
language_data['English'] = language_data['English'].apply(lambda x: remove_html(x))
language_data['German'] = language_data['German'].apply(lambda x: remove_html(x))

In [6]:
english_text = language_data['English'].values
german_text = language_data['German'].values
len(english_text), len(german_text)

(10000, 10000)

In [7]:
language_data.tail()

Unnamed: 0,English,German
9995,she has a bike,sie hat ein rad
9996,she has a book,sie hat ein buch
9997,she has a cold,sie ist erkltet
9998,she has a cold,sie hat eine erkltung
9999,she has brains,sie ist ein gescheites kpfchen


## **Split the data in test and train**

In [8]:
X = english_text
Y = german_text
X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.2)

# **Tokenization**

In [9]:
englishTokenizer = Tokenizer()
englishTokenizer.fit_on_texts(X_train)
Eword2index = englishTokenizer.word_index

vocab_size_source = len(Eword2index) + 1
X_train = englishTokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, padding='post')

X_test = englishTokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, padding='post')

germanTokenizer = Tokenizer()
germanTokenizer.fit_on_texts(y_train)
Gword2index = germanTokenizer.word_index

vocab_size_target = len(Gword2index) + 1
y_train = germanTokenizer.texts_to_sequences(y_train)

y_train = pad_sequences(y_train, padding='post')
y_test = germanTokenizer.texts_to_sequences(y_test)
y_test = pad_sequences(y_test, padding='post')
vocab_size_source, vocab_size_target
Eword2index = dict([(value, key) for key, value in Eword2index.items()])
Gword2index = dict([(value, key) for key, value in Gword2index.items()])
print(Eword2index)
print(Gword2index)

{1: 'i', 2: 'tom', 3: 'it', 4: 'you', 5: 'im', 6: 'is', 7: 'a', 8: 'me', 9: 'its', 10: 'was', 11: 'he', 12: 'we', 13: 'go', 14: 'do', 15: 'can', 16: 'to', 17: 'are', 18: 'that', 19: 'dont', 20: 'this', 21: 'ill', 22: 'come', 23: 'were', 24: 'youre', 25: 'get', 26: 'my', 27: 'have', 28: 'up', 29: 'be', 30: 'the', 31: 'here', 32: 'like', 33: 'she', 34: 'in', 35: 'not', 36: 'who', 37: 'toms', 38: 'on', 39: 'love', 40: 'him', 41: 'stop', 42: 'did', 43: 'now', 44: 'they', 45: 'keep', 46: 'no', 47: 'take', 48: 'us', 49: 'need', 50: 'home', 51: 'am', 52: 'let', 53: 'one', 54: 'how', 55: 'hes', 56: 'lets', 57: 'well', 58: 'thats', 59: 'lost', 60: 'know', 61: 'saw', 62: 'try', 63: 'away', 64: 'help', 65: 'want', 66: 'see', 67: 'too', 68: 'just', 69: 'got', 70: 'so', 71: 'there', 72: 'may', 73: 'out', 74: 'has', 75: 'down', 76: 'please', 77: 'look', 78: 'theyre', 79: 'eat', 80: 'busy', 81: 'wait', 82: 'hate', 83: 'cant', 84: 'will', 85: 'back', 86: 'ok', 87: 'for', 88: 'stay', 89: 'give', 90: 'w

In [10]:
max_eng_len = 0
for i in range(len(X_train)):
  if len(X_train[i]) > max_eng_len:
    max_eng_len= len(X_train[i])

max_ger_len = 0
for i in range(len(y_train)):
  if len(y_train[i]) > max_ger_len:
    max_ger_len= len(y_train[i])

In [11]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K
 
 
class AttentionLayer(Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
    There are three sets of weights introduced W_a, U_a, and V_a
     """
 
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
 
    def build(self, input_shape):
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.
 
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)
 
        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end
 
    def call(self, inputs, verbose=False):
        """
        inputs: [encoder_output_sequence, decoder_output_sequence]
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)
 
        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """
 
            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg
 
            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]
 
            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size*en_seq_len, latent_dim
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)
 
            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)
 
            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)
 
            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)
 
            if verbose:
                print('ei>', e_i.shape)
 
            return e_i, [e_i]
 
        def context_step(inputs, states):
            """ Step function for computing ci using ei """
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]
 
        def create_inital_state(inputs, hidden_size):
            # We are not using initial states, but need to pass something to K.rnn funciton
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, latent_dim
            return fake_state
 
        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len, latent_dim
 
        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )
 
        """ Computing context vectors """
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )
 
        return c_outputs, e_outputs
 
    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [12]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Concatenate
from tensorflow.keras import Input, Model

# Encoder input
encoder_inputs = Input(shape=(max_eng_len,)) 

# Embedding layer- i am using 1024 output-dim for embedding you can try diff values 100,256,512,1000
enc_emb = Embedding(vocab_size_source, 1024)(encoder_inputs)

# Bidirectional lstm layer
enc_lstm1 = Bidirectional(LSTM(256,return_sequences=True,return_state=True))
encoder_outputs1, forw_state_h, forw_state_c, back_state_h, back_state_c = enc_lstm1(enc_emb)

# Concatenate both h and c 
final_enc_h = Concatenate()([forw_state_h,back_state_h])
final_enc_c = Concatenate()([forw_state_c,back_state_c])

# get Context vector
encoder_states =[final_enc_h, final_enc_c]

In [13]:
#  decoder input
decoder_inputs = Input(shape=(None,)) 

# decoder embedding with same number as encoder embedding
dec_emb_layer = Embedding(vocab_size_target, 1024) 
dec_emb = dec_emb_layer(decoder_inputs)   # apply this way because we need embedding layer for prediction 

# In encoder we used Bidirectional so it's having two LSTM's so we have to take double units(256*2=512) for single decoder lstm
# LSTM using encoder's final states as initial state
decoder_lstm = LSTM(512, return_sequences=True, return_state=True) 
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

"""# Using Attention Layer
attention_layer = AttentionLayer()
attention_result, attention_weights = attention_layer([encoder_outputs1, decoder_outputs])

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_result])
"""
# Dense layer with softmax
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 5)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 5, 1024)      2035712     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 5, 512),     2623488     ['embedding[0][0]']              
                                 (None, 256),                                                 

In [15]:
# compile model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint = ModelCheckpoint("give Your path to save check points", monitor='val_accuracy')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
callbacks_list = [checkpoint, early_stopping]

# Training set
encoder_input_data = X_train
# To make same as target data skip last number which is just padding
decoder_input_data = y_train[:,:-1]
# Decoder target data has to be one step ahead so we are taking from 1 as told in keras docs
decoder_target_data =  y_train[:,1:]

# devlopment set
encoder_input_test = X_test
decoder_input_test = y_test[:,:-1]
decoder_target_test=  y_test[:,1:]

history = model.fit([encoder_input_data, decoder_input_data],decoder_target_data, 
                    epochs= 5, 
                    batch_size=128)

# Don't forget to save weights of trained model 
model.save_weights("model.h5") # can give whole path to save model

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
model.load_weights("model.h5")

# INFERENCE MODEL
# encoder Inference model
encoder_model = Model(encoder_inputs, outputs = [encoder_outputs1, final_enc_h, final_enc_c])

# Decoder Inference
decoder_state_h = Input(shape=(512,)) # This numbers has to be same as units of lstm's on which model is trained
decoder_state_c = Input(shape=(512,))

# we need hidden state for attention layer
# 36 is maximum length if english sentence It has to same as input taken by attention layer can see in model plot
decoder_hidden_state_input = Input(shape=(36,512)) 
# get decoder states
dec_states = [decoder_state_h, decoder_state_c]

# embedding layer 
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states)

"""# Attention inference
attention_result_inf, attention_weights_inf = attention_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])
"""
dec_states2= [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# get decoder model
decoder_model= Model(
                    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_h, decoder_state_c],
                     [decoder_outputs2]+ dec_states2)

In [17]:
def get_predicted_sentence(input_seq):
    # Encode the input as state vectors.
    enc_output, enc_h, enc_c = encoder_model.predict(input_seq)
  
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = mar_word_index['sos']
    
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [enc_output, enc_h, enc_c ])
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # convert max index number to marathi word
        sampled_char = mar_index_word[sampled_token_index]
        # aapend it to decoded sent
        decoded_sentence += ' '+sampled_char
        
        # Exit condition: either hit max length or find stop token.
        if (sampled_char == 'eos' or len(decoded_sentence.split()) >= 36):
            stop_condition = True
        
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        
        # Update states
        enc_h, enc_c = h, c
    
    return decoded_sentence

In [24]:
def get_german_sentence(input_sequence):
    sentence =''
    for i in input_sequence:
      if i!=0 :
        sentence =sentence + Gword2index[i]+' '
    return sentence 

# same as above we want input english sentence so create function
def get_english_sentence(input_sequence):
    sentence =''
    for i in input_sequence:
      if i!=0:
        sentence =sentence + Eword2index[i] +' '
    return sentence     

# using simple loop we will take 15 random numbers from x_test and get results
for i in np.random.randint(10, 1000, size=15):
  print("English Sentence:",get_english_sentence(X_test[i]))
  print("Actual German Sentence:",get_german_sentence(y_test[i])[4:-4])
  # Before passing input it has to be reshape as following
  print("Predicted German Translation:",get_predicted_sentence(X_test[i]))
  print("----------------------------------------------------------------------------------------")
   

English Sentence: please smile 
Actual German Sentence: e lch


ValueError: ignored