In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import os

# Data Preprocessing
We will convert the dataencoding characters and replace few characters with spaces that we dnt want to include in the dataset.

In [2]:
import unicodedata
def unicode_to_ascii(line):
  ascii_line=''.join(c  for c in unicodedata.normalize('NFD', line) if unicodedata.category(c) != 'Mn')
  return ascii_line

In [3]:
import re
def preprocessing(line):
  line = unicode_to_ascii(line)

  w = re.sub(r"([?.!,¿])", r" \1 ", line)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [4]:
english,german=[],[]
with open('deu.txt','r',encoding='utf-8',errors='ignore') as deu:
  deulines = deu.read().strip().split('\n')
  for lines in deulines:
    deu_sent=lines.split('\t')
    english.append(preprocessing(deu_sent[0]))
    german.append(preprocessing( deu_sent[1]))

    #print(lines)
    

In [5]:
english[-1]

'<start> Doubtless there exists in this world precisely the right woman for any given man to marry and vice versa but when you consider that a human being has the opportunity of being acquainted with only a few hundred people , and out of the few hundred that there are but a dozen or less whom he knows intimately , and out of the dozen , one or two friends at most , it will easily be seen , when we remember the number of millions who inhabit this world , that probably , since the earth was created , the right man has never yet met the right woman . <end>'

In [6]:
german[-1]

'<start> Ohne Zweifel findet sich auf dieser Welt zu jedem Mann genau die richtige Ehefrau und umgekehrt wenn man jedoch in Betracht zieht , dass ein Mensch nur Gelegenheit hat , mit ein paar hundert anderen bekannt zu sein , von denen ihm nur ein Dutzend oder weniger nahesteht , darunter hochstens ein oder zwei Freunde , dann erahnt man eingedenk der Millionen Einwohner dieser Welt leicht , dass seit Erschaffung ebenderselben wohl noch nie der richtige Mann der richtigen Frau begegnet ist . <end>'

In [7]:
len(english)

208486

In [8]:
##lets take 30k samples for training
english=english[:30000]
german= german[:30000]

In [9]:
len(english)

30000

In [10]:
### Tokenization and pad sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
def tokenize(language):
  token= Tokenizer(filters=' ')
  token.fit_on_texts(language)
  lang_seq=token.texts_to_sequences(language)
  lang_pad = pad_sequences(lang_seq, padding='post')
  return token, lang_pad

In [11]:
english_token, english_pad = tokenize(english)
german_token, german_pad = tokenize(german)

In [12]:
english_token.word_index['lady']

2808

In [13]:
english_pad[-255]

array([  1,  58,  25,   5, 304,   9,   6,   2,   0,   0], dtype=int32)

# dataset creation

In [14]:
batch_size = 64
buffer_size=len(english_pad)
steps_per_epoch= int( buffer_size // batch_size)
embed_dim = 256
units = 1024
english_vocab_size = len(english_token.word_index) + 1
german_vocab_size = len(german_token.word_index) + 1

In [15]:
dataset = tf.data.Dataset.from_tensor_slices((german_pad, english_pad)).shuffle(buffer_size)
dataset = dataset.batch(batch_size , drop_remainder =True)

In [16]:
german_dataset, english_dataset = next(iter(dataset))

In [17]:
german_dataset.shape, english_dataset.shape

(TensorShape([64, 14]), TensorShape([64, 10]))

# Encoder Decoder and Attention Layer
We are creating Encoder and decoder with attention layer

In [18]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embed_dim, units, batch_size):
    super(Encoder, self).__init__()
    ## assign variables
    self.vocab_size=vocab_size
    self.embed_dim=embed_dim
    self.units=units
    self.batch_size=batch_size

    self. embed=tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embed_dim)

    self.gru = tf.keras.layers.GRU(units, return_sequences = True, return_state= True,
                                   recurrent_initializer = 'glorot_uniform')
    
  def call(self, encoder_input, encoder_hidden_state):

    x = self.embed(encoder_input)

    output, encoder_hidden_state=self.gru(x, initial_state = encoder_hidden_state)

    return output, encoder_hidden_state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.units))

In [19]:
encoder = Encoder(german_vocab_size, embed_dim, units, batch_size)
encoder_hidden_state= encoder.initialize_hidden_state()

In [20]:
encoder_output, encoder_hidden_state= encoder(german_dataset, encoder_hidden_state)

In [21]:
encoder_output.shape, encoder_hidden_state.shape

(TensorShape([64, 14, 1024]), TensorShape([64, 1024]))

# Attention layer

In [22]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.units= units

    self.w1 = tf.keras.layers.Dense(units)
    self.w2= tf.keras.layers.Dense(units)
    self.v=tf.keras.layers.Dense(1)

  def call(self, encoder_hidden_state, encoder_output):

    ##expand dimensions
    expand_encoder_hidden_state = tf.expand_dims(encoder_hidden_state, axis=1)

    x = self.v(tf.nn.tanh(self.w1(expand_encoder_hidden_state) + self.w2(encoder_output)))

    attention_weights = tf.nn.softmax(x, axis=1)

    context_vector = attention_weights * encoder_output


    context_vector= tf.reduce_sum(context_vector, axis =1 )

    return context_vector, attention_weights

In [23]:
attention= BahdanauAttention(units)
context_vector, attention_weights = attention.call(encoder_hidden_state, encoder_output)

In [24]:
context_vector.shape, attention_weights.shape

(TensorShape([64, 1024]), TensorShape([64, 14, 1]))

# Decoder

In [25]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embed_dim, units, batch_size):
    super(Decoder, self).__init__()
    self.vocab_size=vocab_size
    self.embed_dim=embed_dim
    self.units=units
    self.batch_size=batch_size

    self.embed=tf.keras.layers.Embedding(input_dim = vocab_size, output_dim= embed_dim)

    self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True,
                                   recurrent_initializer='glorot_uniform')
    
    self.fc= tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(units)

  def call(self, decoder_input, encoder_hidden_state, encoder_output):
    context_vector, attention_weights = self.attention.call(encoder_hidden_state, encoder_output)

    expand_context_vector = tf.expand_dims(context_vector, axis=1)

    x=self.embed(decoder_input)

    x = tf.concat([expand_context_vector, x], axis=-1)

    decoder_output, decoder_hidden_state= self.gru(x)
    #print(decoder_output.shape[2])

    decoder_output= tf.reshape(decoder_output, (-1, decoder_output.shape[2]))

    output = self.fc(decoder_output)

    return output, decoder_output, attention_weights

    

In [26]:
decoder = Decoder(english_vocab_size, embed_dim, units, batch_size)

In [27]:
decoder_output, _,_=decoder.call(tf.random.uniform((batch_size,1)),
                                 encoder_hidden_state, encoder_output)

In [28]:
decoder_output.shape

TensorShape([64, 4557])

# Loss Function

In [29]:
optimizer= tf.keras.optimizers.Adam()
loss_object= tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(real, pred):
  mask= tf.math.logical_not(tf.math.equal(real, 0))

  loss = loss_object(real, pred)


  mask = tf.cast(mask, dtype=loss.dtype)

  loss *=mask

  loss = tf.reduce_mean(loss)

  return loss

In [30]:
##Create checkpoints
checkpoint_dir= './training_checkpoints'
checkpoint_prefix=os.path.join(checkpoint_dir,'ckpt')
checkpoint= tf.train.Checkpoint(optimizer=optimizer, encoder= encoder, decoder=decoder)


# Train model

In [31]:
def train_step(input_dataset, target_dataset, encoder_hidden_state):
  loss=0
  with tf.GradientTape() as tape:

    encoder_output, encoder_hidden_stae= encoder.call(input_dataset, encoder_hidden_state)

    dcoder_hidden_state= encoder_hidden_state

    decoder_input= tf.expand_dims([german_token.word_index['<start>']] * batch_size, axis=1)

    ##loop the target dataset
    for t in range(1, int(target_dataset.shape[1])):
      predictions, decoder_hidden_state, attention_weights = decoder.call(decoder_input, 
                                                                          dcoder_hidden_state,
                                                                          encoder_output)
      loss += loss_function(target_dataset[:,t], predictions)
      
      decoder_input= tf.expand_dims(target_dataset[:,t] ,axis=1)

    batch_loss= loss / int(target_dataset.shape[1])

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradient = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradient, variables))

    return batch_loss

# Model Evaluation

In [33]:
epochs=10

for epoch in range(epochs):
  encoder_hidden_state = encoder.initialize_hidden_state()
  total_loss =0

  for (batch,(input_dataset, target_dataset)) in enumerate(dataset.take(steps_per_epoch)):

    batch_loss= train_step(input_dataset, target_dataset,encoder_hidden_state )
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  if epoch %2 ==0 and epoch != 0:
    checkpoint.save(checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))

Epoch 1 Batch 0 Loss 2.0472
Epoch 1 Batch 100 Loss 1.7264
Epoch 1 Batch 200 Loss 1.7149
Epoch 1 Batch 300 Loss 1.5356
Epoch 1 Batch 400 Loss 1.3650
Epoch 2 Batch 0 Loss 1.2776
Epoch 2 Batch 100 Loss 1.1671
Epoch 2 Batch 200 Loss 1.0900
Epoch 2 Batch 300 Loss 1.0896
Epoch 2 Batch 400 Loss 0.9779
Epoch 3 Batch 0 Loss 0.7874
Epoch 3 Batch 100 Loss 0.7983
Epoch 3 Batch 200 Loss 0.8506
Epoch 3 Batch 300 Loss 0.7073
Epoch 3 Batch 400 Loss 0.6806
Epoch 3 Loss 0.7398
Epoch 4 Batch 0 Loss 0.5026
Epoch 4 Batch 100 Loss 0.5487
Epoch 4 Batch 200 Loss 0.4873
Epoch 4 Batch 300 Loss 0.4571
Epoch 4 Batch 400 Loss 0.4433
Epoch 5 Batch 0 Loss 0.2999
Epoch 5 Batch 100 Loss 0.3848
Epoch 5 Batch 200 Loss 0.3140
Epoch 5 Batch 300 Loss 0.2388
Epoch 5 Batch 400 Loss 0.3029
Epoch 5 Loss 0.3097
Epoch 6 Batch 0 Loss 0.2543
Epoch 6 Batch 100 Loss 0.1666
Epoch 6 Batch 200 Loss 0.2533
Epoch 6 Batch 300 Loss 0.2045
Epoch 6 Batch 400 Loss 0.2037
Epoch 7 Batch 0 Loss 0.1163
Epoch 7 Batch 100 Loss 0.1373
Epoch 7 Batch 

# Evaluate

In [56]:
def evaluate(sentence):
  sentence= preprocessing(sentence)
  sentence_seq= german_token.texts_to_sequences([sentence])
  
  sentence_pad = pad_sequences(sentence_seq, maxlen=len(target_dataset[1]),padding='post')
  sentence_pad = tf.convert_to_tensor(sentence_pad)
  #print(sentence_pad)
  result=' '

  hidden_state= tf.zeros((1, units))
  encoder_output, encoder_hidden_state=encoder.call(sentence_pad, hidden_state)
  decoder_hidden_state = encoder_hidden_state
  decoder_input = tf.expand_dims([german_token.word_index['<start>']], axis=1)

  for t in range (target_dataset.shape[1]):
    predictions, decoder_hidden_state, attention_weights =decoder(decoder_input, decoder_hidden_state, 
                                                                  encoder_output)
    prediction_id= tf.argmax(predictions[0]).numpy()

    if english_token.index_word[prediction_id] == '<end>':
      return result

    result += english_token.index_word[prediction_id] + ' '
 
    decoder_input = tf.expand_dims([prediction_id], axis=1)

  return result




In [1]:
evaluate(u'ich mag bier')

NameError: ignored