# Translating English to Spanish 

https://www.kaggle.com/code/aiswaryaramachandran/english-to-hindi-neural-machine-translation/notebook

In [44]:
import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model


# Loading the data

In [5]:
df = pd.read_csv('/content/drive/MyDrive/AI_Research/English2Spanish.csv')
df.tail()

Unnamed: 0,english,spanish
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
118963,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."


In [6]:
df.shape

(118964, 2)

In [7]:
df.isnull().mean()

english    0.0
spanish    0.0
dtype: float64

# Preprocessing

In [9]:
# Convert to lover case
df['english'] = df['english'].map(lambda x: x.lower())
df['spanish'] = df['spanish'].map(lambda x: x.lower())

In [11]:
# Remove Quotes
df['english'] = df['english'].map(lambda x: re.sub("'",'',x))
df['spanish'] = df['spanish'].map(lambda x:re.sub("'",'',x))

In [13]:
# Remove all Special characters
exclude = set(string.punctuation)
df['english'] = df['english'].map(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['spanish'] = df['spanish'].map(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [21]:
# Remove all numbers from text 
remove_digits = str.maketrans('','',digits)
df['english'] = df['english'].map(lambda x: x.translate(remove_digits))
df['spanish'] = df['spanish'].map(lambda x: x.translate(remove_digits))

In [22]:
# Remove Extra Spaces
df['english'] = df['english'].map(lambda x: x.strip())
df['spanish'] = df['spanish'].map(lambda x: x.strip())

df['english'] = df['english'].map(lambda x: re.sub(" +"," ",x))
df['spanish'] = df['spanish'].map(lambda x: re.sub(" +"," ",x))

### Add Tokenns (START and END)

In [23]:
df['spanish'] = df['spanish'].map(lambda x: 'START_'+x+'_END')

In [24]:
# Get English and Spanish Vocabs

all_english_words = set()
for sent in df['english']:
  for word in sent.split():
    if word not in all_english_words:
      all_english_words.add(word)

all_spanish_words = set()
for sent in df['spanish']:
  for word in sent.split():
    if word not in all_spanish_words:
      all_spanish_words.add(word)
  

In [25]:
len(all_english_words),len(all_spanish_words)

(13370, 38376)

In [32]:
df['length_english_sentence']=df['english'].apply(lambda x:len(x.split(" ")))
df['length_spanish_sentence']=df['spanish'].apply(lambda x:len(x.split(" ")))

In [35]:
df[df['length_english_sentence']>30].shape

(14, 4)

In [36]:
df=df[df['length_english_sentence']<=20]
df=df[df['length_spanish_sentence']<=20]

In [37]:
df.shape

(118787, 4)

In [38]:
max_length_src=max(df['length_english_sentence'])
max_length_tar=max(df['length_spanish_sentence'])

## Next,we create 4 Python dictionaries to convert a given word into an integer index and vice-versa.

In [39]:
input_words = sorted(list(all_english_words))
target_words = sorted(list(all_spanish_words))

num_encoder_tokens = len(all_english_words)
num_decoder_tokens = len(all_spanish_words)

num_encoder_tokens,num_decoder_tokens

(13370, 38376)

In [41]:
num_decoder_tokens+=1 # for zero padding
num_decoder_tokens

38378

In [42]:
input_token_index = dict([(word,i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word,i+1) for i, word in enumerate(target_words)])

In [43]:
reverse_input_char_index = dict((i,word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i,word) for word, i in target_token_index.items())

In [45]:
df = shuffle(df)
df.head(10)

Unnamed: 0,english,spanish,length_english_sentence,length_spanish_sentence
67625,you mustnt stay out that late,START_no debes quedarte fuera tan tarde_END,6,6
31114,he is a man of ability,START_él es un hombre talentoso_END,6,5
23652,i cant afford a car,START_no puedo permitirme un coche_END,5,5
8777,please take one,START_coge una_END,3,2
33782,tom has never met mary,START_tom jamás ha visto a mary_END,5,6
59119,tom really misses his family,START_tom añora de veras a su familia_END,5,7
44515,i have no time to see you,START_no tengo tiempo para verte_END,7,5
11338,open the windows,START_abrí las ventanas_END,3,3
314,im full,START_ya me llené_END,2,3
35952,i got up early as usual,START_me levanté temprano como siempre_END,6,5


# Split the data

In [46]:
X,y = df['english'],df['spanish']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape

((95029,), (23758,))

### Saving this data

In [47]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

# Generate Batch

In [48]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

# Encoder-Decoder Architecture

In [49]:
latent_dim=300

## Encoder

In [50]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

## Decoder

In [52]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Create the Model

In [53]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [54]:
# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [55]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    4011000     ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 300)    11513400    ['input_3[0][0]']                
                                                                                            

In [57]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

train_samples,val_samples

(95029, 23758)

In [None]:
# Fit the model
import numpy as np
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


Epoch 1/100

# Saving Model Weights

In [None]:
model.save_weights('nmt_weights.h5')

# Making Predictions

In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [None]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])