# Neural Language Translation 

In [247]:
import pandas as pd
import numpy as np

In [248]:
f = open('Tamil.txt', 'r+', encoding="utf8")
x = f.readlines()

In [249]:
f = open('English.txt', 'r+', encoding="utf8")
y = f.readlines()

### I am using only 500 examples to train pretty fast (I'm lazy FYI)

In [250]:
x = x[-500:]

In [251]:
len(x)

500

In [252]:
y = y[-500:]

### Text Pre-processing

In [253]:
x[0]= x[0].strip('\ufeffMMA')
y[0]= y[0].strip('\ufeffMMA')

In [254]:
import string
exclude = set(string.punctuation)
for i in range(0,len(x)):
    x[i] = x[i].strip('\n')
    x[i] = ''.join(ch for ch in x[i] if ch not in exclude)

In [255]:
for i in range(0,len(y)):
    y[i] = y[i].lower()
    y[i] = y[i].strip('\n')
    y[i] = ''.join(ch for ch in y[i] if ch not in exclude)

### Cleaned text

In [256]:
print("Tamil Text:",x[1],"\n")
print("English Text:",y[1])

Tamil Text: தன்னுடைய இராஜிநாமாவை பற்றிய அறிக்கையில் பெக் சதித்திட்டங்கள் பற்றியும் அவர் எதிர்கொண்ட வேண்டுமென்றே கொடுக்கப்பட்ட தவறான தகவல்கள் பற்றியும் குறிப்பிட்டுள்ளார் 

English Text: in a personal statement about his resignation beck spoke about the plots and deliberate misinformation he had faced


In [257]:
len(x)

500

In [258]:
len(y)

500

In [259]:
english_words = []
for i in range(0,len(y)):
    english_words.append(y[i].split())   

In [260]:
english_words = [j for sub in english_words for j in sub]

In [261]:
print("Number of Unique English words:",len(set(english_words)))

Number of Unique English words: 3448


In [262]:
tamil_words = []
for i in range(0,len(x)):
    tamil_words.append(x[i].split())  
tamil_words = [j for sub in tamil_words for j in sub]

In [263]:
print("Number of Unique Tamil words:",len(set(tamil_words)))

Number of Unique Tamil words: 5260


In [264]:
tamilvocab = len(set(tamil_words))
engvocab = len(set(english_words))

### Looks like there are more number of unique words in Tamil.....as expected

In [265]:
length_tamil=[]
for i in range(0,len(x)):
    length_tamil.append(len(x[i].split()))

In [266]:
length_english=[]
for i in range(0,len(y)):
    length_english.append(len(y[i].split()))

### Average number of words in each sentence

In [267]:
sum(length_english)/len(length_english)

23.11

In [268]:
sum(length_tamil)/len(length_tamil)

16.052

In [270]:
print(max(length_english))
print(max(length_tamil))

69
53


In [271]:
import collections
english_words_counter = collections.Counter([word for sentence in y for word in sentence.split()])
tamil_words_counter = collections.Counter([word for sentence in x for word in sentence.split()])

### Most common words in both languages

In [272]:
english_words_counter.most_common(10)

[('the', 950),
 ('of', 442),
 ('and', 395),
 ('to', 314),
 ('in', 296),
 ('a', 205),
 ('that', 137),
 ('is', 129),
 ('for', 106),
 ('on', 95)]

In [273]:
tamil_words_counter.most_common(10)

[('ஒரு', 90),
 ('என்று', 69),
 ('மற்றும்', 56),
 ('இந்த', 44),
 ('என்ற', 37),
 ('அவர்', 30),
 ('அமெரிக்க', 26),
 ('நான்', 24),
 ('அவர்கள்', 23),
 ('என', 23)]

# Tokenizer:

Now that our corpus is ready we have to represent it in a way that the neural network can understand, So we convert the text representation to number representation. In words based representation each word his assigned a number abd in character based representation each character is assigned a number. I am using a word level model for its simpler complexity

Keras Tokenizer simplifies the representation process for us (This class allows to vectorize a text corpus, by turning each text into either a sequence of integers)


###  Setting gpu use to 0.3 as maximum gpu usage by CUDA results in internal error 

In [274]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

In [275]:
from keras.preprocessing.text import Tokenizer
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x) 
    return tokenizer.texts_to_sequences(x), tokenizer

In [276]:
x[0]

'தெற்கு ஈராக்கிலுள்ள பிரிட்டிஷ் படைகள் ஒரு எதிரி இராணுவத்தை எதிர்கொள்ளவில்லை ஆனால் தங்களது நாட்டை வெளிநாட்டவர் ஆக்கிரமிப்பு செய்திருப்பதற்கு ஆழ்ந்த எதிர்ப்பை தெரிவிக்கும் மக்களை அது எதிர்கொண்டுள்ளது'

In [277]:
z=(tokenize(x))
z[0][0]

[444,
 927,
 445,
 270,
 1,
 928,
 929,
 930,
 15,
 446,
 931,
 932,
 447,
 933,
 271,
 182,
 934,
 272,
 11,
 935]

In [278]:
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',]
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'over': 6, 'lazy': 7, 'dog': 8, 'by': 9, 'jove': 10, 'my': 11, 'study': 12, 'of': 13, 'lexicography': 14, 'won': 15, 'a': 16, 'prize': 17}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 3, 4, 5, 6, 1, 7, 8]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [9, 10, 11, 2, 12, 13, 14, 15, 16, 17]


# Padding:

When batching the sequence of word ids together, each sequence needs to be the same length. Since sentences are dynamic in length, we can add padding to the end of the sequences to make them the same length.




In [279]:
from keras.preprocessing.sequence import pad_sequences
def pad(x, length=None):
    return pad_sequences(x, maxlen=length, padding='post')

In [280]:
test_pad = pad(text_tokenized)
print("OUTPUT IS ALWAYS A LENGTH 10 ARRAY....FILLED BY 0s IN THE END")
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

OUTPUT IS ALWAYS A LENGTH 10 ARRAY....FILLED BY 0s IN THE END
Sequence 1 in x
  Input:  [1 2 3 4 5 6 1 7 8]
  Output: [1 2 3 4 5 6 1 7 8 0]
Sequence 2 in x
  Input:  [ 9 10 11  2 12 13 14 15 16 17]
  Output: [ 9 10 11  2 12 13 14 15 16 17]


### Apply all the tested preprocessing functions to our corpus

In [281]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    print('shape before: ', preprocess_y.shape)
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    print('shape after: ', preprocess_y.shape)
        
    return preprocess_x, preprocess_y, x_tk, y_tk

In [282]:
preprocess_x, x_tk = tokenize(x)
preprocess_x = pad(preprocess_x)
preprocess_x.shape

(500, 53)

In [283]:
preproc_tamil_sentences, preproc_english_sentences, tamil_tokenizer, english_tokenizer =\
    preprocess(x, y)

shape before:  (500, 69)
shape after:  (500, 69, 1)


## Assigning a number to each word

In [284]:
list(tamil_tokenizer.word_index.items())[:5]

[('ஒரு', 1), ('என்று', 2), ('மற்றும்', 3), ('இந்த', 4), ('என்ற', 5)]

In [285]:
list(english_tokenizer.word_index.items())[:5]

[('the', 1), ('of', 2), ('and', 3), ('to', 4), ('in', 5)]

# Logits to text

The neural network will be translating the input to words ids, which isn't the final form we want. We want the Tamil translation. The function logits_to_text will bridge the gab between the logits from the neural network to the Tamil translation.



In [286]:
def logits_to_text(logits, tokenizer):

    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [287]:
import os
from keras.models import load_model
import numpy as np

In [288]:
print("tamil_sentences shape: ", preproc_english_sentences.shape)
print("english_sentences  shape: ", preproc_tamil_sentences.shape)
print('output sequence length: ', preproc_english_sentences.shape[1])

tamil_sentences shape:  (500, 69, 1)
english_sentences  shape:  (500, 53)
output sequence length:  69


In [289]:
tmp_x = pad(preproc_tamil_sentences, preproc_english_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))

In [290]:
tmp_x.shape

(500, 69, 1)

In [291]:
from keras.layers import GRU, Input, Dense, TimeDistributed
from keras.models import Model, Sequential
from keras.layers import Activation
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy


def simple_model(input_shape, output_sequence_length, tamil_vocab_size, english_vocab_size, learning_rate=0.1):
    model = Sequential()
    model.add(GRU(128, dropout=0.1,input_shape=input_shape[1:], return_sequences=True) )
    model.add(TimeDistributed(Dense(english_vocab_size, activation='softmax') ))
    print('######## Summary ###########')
    model.summary()
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [292]:
simple_rnn_model = simple_model(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    engvocab+1,
    tamilvocab+1)
if os.path.exists(os.path.join("cache", "simple_model.h5"))== False:
    simple_rnn_model.fit(tmp_x, preproc_english_sentences, batch_size=1024, epochs=20, validation_split=0.2)
else:
    simple_rnn_model = load_model(os.path.join("cache", "simple_model.h5"))

######## Summary ###########
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_5 (GRU)                  (None, 69, 128)           49920     
_________________________________________________________________
time_distributed_5 (TimeDist (None, 69, 5261)          678669    
Total params: 728,589
Trainable params: 728,589
Non-trainable params: 0
_________________________________________________________________
Train on 400 samples, validate on 100 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Save the model 

In [298]:
simple_rnn_model.save(os.path.join("model", "translate.h5"))

In [299]:
score = simple_rnn_model.evaluate(tmp_x, preproc_english_sentences, verbose=0)
print("Train accurancy: ", score[1])

Train accurancy:  0.6741739158630371


In [300]:
score

[2.9074899463653563, 0.6741739158630371]

### Pretty decent for a small dataset

In [317]:
x[0]

'தெற்கு ஈராக்கிலுள்ள பிரிட்டிஷ் படைகள் ஒரு எதிரி இராணுவத்தை எதிர்கொள்ளவில்லை ஆனால் தங்களது நாட்டை வெளிநாட்டவர் ஆக்கிரமிப்பு செய்திருப்பதற்கு ஆழ்ந்த எதிர்ப்பை தெரிவிக்கும் மக்களை அது எதிர்கொண்டுள்ளது'

In [305]:
## PRETTY BAD TRANSLATION :(
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], english_tokenizer))

the the the the petroleum response response response a a a a a a a a a a the a of <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Pretty sure more data is necessary and also need to penalise stopwords

### Also it is possible to use word embeddings, encoder-decoder or even Bi-directional LSTM's

### Reference:

https://github.com/Barqawiz/aind2-nlp-capstone-translation/blob/master/machine_translation.ipynb

