# Machine Translation:    ENGLISH TO GERMAN
# (Encoder-Decoder)

<!-- Add the image -->
![Alt Text](encode_decode.jpg)

# Import and preprocessing the data


In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
import numpy as np

# Load and preprocess the data
data = pd.read_csv('/kaggle/input/hey-buddy/GERMAN_ENGLISH_TRANSLATION.csv')
data = data.drop_duplicates(subset=['ENGLISH'])
data = data.head(20000)

english_sentences = data['ENGLISH'].str.lower().str.replace('[^\w\s]', '').tolist()
german_sentences = data['GERMAN'].str.lower().str.replace('[^\w\s]', '').apply(lambda x: '<start> ' + x + ' <end>').tolist()



  english_sentences = data['ENGLISH'].str.lower().str.replace('[^\w\s]', '').tolist()
  german_sentences = data['GERMAN'].str.lower().str.replace('[^\w\s]', '').apply(lambda x: '<start> ' + x + ' <end>').tolist()


### sample of data

In [3]:
print(data.head())

   Unnamed: 0 ENGLISH      GERMAN
0           0      hi       hallo
2           2     run        lauf
3           3     wow  potzdonner
5           5    fire       feuer
6           6    help       hilfe


In [4]:
print(german_sentences[0:5])

['<start> hallo <end>', '<start> lauf <end>', '<start> potzdonner <end>', '<start> feuer <end>', '<start> hilfe <end>']


# Generate Tokenization

In [5]:
# Integer encode sentences
eng_token = Tokenizer(filters='')
eng_token.fit_on_texts(english_sentences)
eng_token_ind = eng_token.texts_to_sequences(english_sentences)

ger_token = Tokenizer(filters='')
ger_token.fit_on_texts(german_sentences)
ger_token_ind= ger_token.texts_to_sequences(german_sentences)

# Pad encoded sentences
max_encoder_seq_length = max([len(seq) for seq in eng_token_ind])
max_decoder_seq_length = max([len(seq) for seq in ger_token_ind])
print(max_encoder_seq_length,max_decoder_seq_length)

6 13


### Create Encoder (input data) and decoder (input,target) data

In [6]:
encoder_input_data = pad_sequences(eng_token_ind, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(ger_token_ind, maxlen=max_decoder_seq_length, padding='post')

# target data for the decoder
decoder_target_data = []
for seq in ger_token_ind:
    decoder_target_data.append(seq[1:])
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_decoder_seq_length, padding='post')

num_decoder_tokens = len(ger_token.word_index) + 1
decoder_output = np.zeros((len(ger_token_ind), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
for i, seq in enumerate(decoder_target_data):
    for t, token in enumerate(seq):
        decoder_output[i, t, token] = 1

# Architecture- LSTM

In [7]:
# Create the model
latent_dim = 256
num_encoder_tokens = len(eng_token.word_index) + 1

eng_embedding_layer = Embedding(num_encoder_tokens, latent_dim)
ger_embedding_layer = Embedding(num_decoder_tokens, latent_dim)

encoder_inputs = Input(shape=(None,))
encoder_embedding = eng_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = ger_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Train the model

In [8]:

# Compile and train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_output,
          batch_size=64,
          epochs=60,
          validation_split=0.2)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x7da4d0490940>

# Create the encoder and decoder models for inference

In [9]:
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.save('encoder_model.h5')
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embedding_inference = ger_embedding_layer(decoder_inputs)
decoder_outputs_inference, state_h_inference, state_c_inference = decoder_lstm(decoder_embedding_inference,
                                                                               initial_state=decoder_states_inputs)
decoder_states_inference = [state_h_inference, state_c_inference]
decoder_outputs_inference = decoder_dense(decoder_outputs_inference)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs_inference] + decoder_states_inference)
decoder_model.save('decoder_model.h5')

In [10]:
import json

# Save eng_tokenizer
eng_token_dict = {"word_index": eng_token.word_index}
with open('eng_tokenizer.json', 'w') as f:
    json.dump(eng_token_dict, f)

# Save ger_tokenizer
ger_token_dict = {"word_index": ger_token.word_index}
with open('ger_tokenizer.json', 'w') as f:
    json.dump(ger_token_dict, f)


In [11]:
import json

# Save ger_token's index_word dictionary
ger_token_dict = {"index_word": ger_token.index_word}
with open('ger_token_index_word.json', 'w') as f:
    json.dump(ger_token_dict, f)


## Decode the sentence

In [12]:
# Function to decode a new sentence
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = ger_token.word_index['<start>']
    stop_condition=False
    decoded_sentence=''
    while not stop_condition:
        output_tokens,h,c=decoder_model.predict([target_seq]+states_value,verbose=0)
        sampled_token_index=np.argmax(output_tokens[0,-1,:])
        sampled_word=ger_token.index_word[sampled_token_index]
        if sampled_word != '<end>':
            decoded_sentence += ' '+sampled_word

        if (sampled_word == '<end>' or len(decoded_sentence.split()) > max_decoder_seq_length):
            stop_condition=True

        target_seq=np.zeros((1,1))
        target_seq[0,0]=sampled_token_index

        states_value=[h,c]
    return decoded_sentence

# Testing the model

In [13]:
listt=[
    'hello',
    'I won',
    'Go Away',
    'I gave up',
    'I am a man',
    'Tom really got a bad deal',
    'he will go with us',
    'what are you reading',
    'hi',
    'The reason does not matter',
    'I love your cat',
    'go',
    'get inside'
]
actual=[
    'hallo',
    'ich habe gewonnen',
    'geh weg',
    'ich gab auf',
    'ich bin ein Mann',
    'tom hat wirklich ein schlechtes Geschäft gemacht',
    'er wird mit uns gehen',
    'was liest du?',
    'hallo',
    'der Grund ist egal',
    'ich liebe deine Katze',
    'geh',
    'Komm herein'
]
for i in range(len(listt)):
    new_english_sentence=listt[i]
    new_english_sentence.lower().replace('[^\w\s]', '')
    new_eng_integer_encoded=eng_token.texts_to_sequences([new_english_sentence])
    new_encoder_input_data=pad_sequences(new_eng_integer_encoded,maxlen=max_encoder_seq_length,padding='post')
    decoded_sentence=decode_sequence(new_encoder_input_data)
    decoded_sentence=decoded_sentence.strip()
    print('Input sentence:', new_english_sentence)
    print('Actual sentence:', actual[i])
    print('Decoded sentence:', decoded_sentence)



Input sentence: hello
Actual sentence: hallo
Decoded sentence: hallo
Input sentence: I won
Actual sentence: ich habe gewonnen
Decoded sentence: ich hab gewonnen
Input sentence: Go Away
Actual sentence: geh weg
Decoded sentence: geh weg
Input sentence: I gave up
Actual sentence: ich gab auf
Decoded sentence: ich habe gekotzt
Input sentence: I am a man
Actual sentence: ich bin ein Mann
Decoded sentence: ich bin ein mann
Input sentence: Tom really got a bad deal
Actual sentence: tom hat wirklich ein schlechtes Geschäft gemacht
Decoded sentence: tom hat wirklich schlecht
Input sentence: he will go with us
Actual sentence: er wird mit uns gehen
Decoded sentence: er soll mit dem gehen
Input sentence: what are you reading
Actual sentence: was liest du?
Decoded sentence: was liest mich
Input sentence: hi
Actual sentence: hallo
Decoded sentence: hallo
Input sentence: The reason does not matter
Actual sentence: der Grund ist egal
Decoded sentence: der wirtschaft geht es mir schlecht
Input senten

# Bleu Score of model Translation

In [14]:
from nltk.translate.bleu_score import corpus_bleu
references = []
hypotheses = []
for i in range(len(listt)):
    input_seq=listt[i]
    input_seq.lower().replace('[^\w\s]', '')
    new_eng_integer_encoded=eng_token.texts_to_sequences([input_seq])
    new_encoder_input_data=pad_sequences(new_eng_integer_encoded,maxlen=max_encoder_seq_length,padding='post')
    actual_sentence = actual[i]
    decoded_sentence = decode_sequence(new_encoder_input_data)
    actual_sentence=actual_sentence.strip()
    decoded_sentence=decoded_sentence.strip()
    references.append([actual_sentence.split()])
    hypotheses.append(decoded_sentence.split())
bleu_score = corpus_bleu(references, hypotheses)
print('BLEU score:', bleu_score)

BLEU score: 0.3805647320367025


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
