In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


import os
# Enumerate every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
        

In [None]:
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Embedding,Dense, LSTM, Bidirectional,RepeatVector, GRU, Dropout, TimeDistributed
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.losses import SparseCategoricalCrossentropy

import re
from string import punctuation
from collections import Counter
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

In [None]:
# The original cell bound `df` to the pd.read_csv function itself (no call),
# so every later `df.` access failed.
# NOTE(review): the path below is assumed from the column names used later
# ('English words/sentences', 'French words/sentences') -- confirm it against
# the file list printed by the os.walk cell at the top of the notebook.
DATA_PATH = '/kaggle/input/language-translation-englishfrench/eng_-french.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Pull the two parallel text columns out of the frame as Series.
eng, fra = (
    df[column]
    for column in ('English words/sentences', 'French words/sentences')
)

In [None]:
# Tally whitespace-separated tokens of the raw English column in a single pass;
# the Counter then supplies the total, the distinct count and the top-10 list
# (the original re-tokenised the whole corpus just to compute the total).
eng_word_counter = Counter(word for sentence in eng for word in sentence.split())
print('Total count of English words: ', sum(eng_word_counter.values()))
print('Count of Distinct English words: ',len(eng_word_counter))
# Built with a comprehension rather than list(zip(*...))[0], which raised
# IndexError when the counter was empty.
print('10 Most Common words in English: ',tuple(word for word, _ in eng_word_counter.most_common(10)))

In [None]:
# Tally whitespace-separated tokens of the raw French column in a single pass;
# the Counter then supplies the total, the distinct count and the top-10 list
# (the original re-tokenised the whole corpus just to compute the total).
fra_word_counter = Counter(word for sentence in fra for word in sentence.split())
print('Total count of French words: ', sum(fra_word_counter.values()))
# Label typo fixed ("wors" -> "words").
print('Count of distinct French words: ',len(fra_word_counter))
# Built with a comprehension rather than list(zip(*...))[0], which raised
# IndexError when the counter was empty.
print('10 most common words in French: ',tuple(word for word, _ in fra_word_counter.most_common(10)))

In [None]:
# Translation table mapping every ASCII punctuation mark, French guillemet and
# digit to a space; built once instead of on every call.
_CLEAN_TABLE = str.maketrans({ch: " " for ch in punctuation + "«»" + "0123456789"})


def clean(string):
    """Normalise one sentence before tokenisation.

    Narrow no-break spaces (U+202F) become ordinary spaces, the text is
    lower-cased, punctuation / guillemets / digits are replaced by spaces, and
    every run of whitespace is collapsed to a single space. Leading and
    trailing whitespace is collapsed but not stripped.

    Args:
        string: Raw sentence text.

    Returns:
        The cleaned sentence.
    """
    string = string.replace("\u202f", " ").lower()
    # A single translate() pass replaces the per-character replace() loop.
    string = string.translate(_CLEAN_TABLE)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', string)

In [None]:
# Normalise every sentence in both languages with clean().
eng, fra = eng.apply(clean), fra.apply(clean)

In [None]:
# Word cloud built from all cleaned English sentences joined into one string.
plt.figure(figsize=(15, 12))
english_text = ' '.join(eng)
wc = WordCloud(width=600, height=300).generate(english_text)
plt.imshow(wc)
plt.show()

In [None]:
# Word cloud built from all cleaned French sentences joined into one string.
plt.figure(figsize=(15, 12))
french_text = ' '.join(fra)
wcf = WordCloud(width=600, height=300).generate(french_text)
plt.imshow(wcf)
plt.show()

In [None]:
def word_count(line):
    """Return the number of whitespace-separated tokens in ``line``."""
    tokens = line.split()
    return len(tokens)

In [None]:
# Add a per-row token count column for each raw text column.
for source_col, count_col in (
    ('English words/sentences', 'English_word_count'),
    ('French words/sentences', 'French_word_count'),
):
    df[count_col] = df[source_col].apply(word_count)

In [None]:
# Side-by-side distribution of words per sentence for each language.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
fig, axes = plt.subplots(nrows=1, ncols=2)
sns.histplot(df['English_word_count'], kde=True, stat='density', ax=axes[0])
sns.histplot(df['French_word_count'], kde=True, stat='density', ax=axes[1])
sns.despine()
plt.show()

In [None]:
#TEXT PREPROCESSING FUNCTIONS FOR MODEL TRAINING

#Tokenizing Text 
def create_tokenizer(sentences):
    """Build a Keras ``Tokenizer`` with default settings and fit it on ``sentences``.

    Args:
        sentences: Iterable of text strings to learn the vocabulary from.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(sentences)
    return fitted


#Finding the maximum sentence length of a language text
def max_sentence_length(lines):
    """Return the largest whitespace-token count among ``lines``.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        The token count of the longest sentence, or 0 when ``lines`` is empty
        (previously an empty input raised ValueError from ``max``).
    """
    return max((len(sentence.split()) for sentence in lines), default=0)


#Token sequencing and Padding
def encode_sequences(tokenizer,sentences,max_sent_len):
    """Convert sentences to integer id sequences of a fixed length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        sentences: Iterable of text strings to encode.
        max_sent_len: Length every output sequence is brought to.

    Returns:
        The result of ``pad_sequences`` with padding added at the front
        (``padding='pre'``) and ``maxlen=max_sent_len``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=max_sent_len,
        padding='pre',
    )

In [None]:
# English: fit the tokenizer, then report sentence-length and vocabulary stats.
eng_tokenizer = create_tokenizer(eng)
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
max_eng_sent_len = max_sentence_length(eng)
print('ENGLISH :')
print('Maximum length of sentence in English :', max_eng_sent_len)
print('English text vocabulary size :', eng_vocab_size)
print('--------------------------------------------')

# French: same steps on the French corpus.
fra_tokenizer = create_tokenizer(fra)
fra_vocab_size = 1 + len(fra_tokenizer.word_index)
max_fra_sent_len = max_sentence_length(fra)
print('FRENCH :')
print('Maximum length of sentence in French :',max_fra_sent_len)
print('French text vocabulary size :',fra_vocab_size)

In [None]:
# Override the measured maxima: both languages use a fixed sequence length of 25.
max_eng_sent_len = max_fra_sent_len = 25

In [None]:
# Encode both corpora as fixed-length integer sequences.
X = encode_sequences(eng_tokenizer, eng, max_eng_sent_len)
y = encode_sequences(fra_tokenizer, fra, max_fra_sent_len)

# The training cell uses X_train / X_test / y_train / y_test, which were never
# defined anywhere in the notebook; create them here with the already-imported
# train_test_split. A fixed random_state keeps the split reproducible.
# NOTE(review): the 80/20 ratio is an assumption -- adjust if a different
# hold-out size was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def create_model(inp_vocab_size, out_vocab_size, inp_maxlen, out_maxlen):
    """Assemble the (uncompiled) encoder-decoder translation network.

    Args:
        inp_vocab_size: Size of the source vocabulary (embedding input dim).
        out_vocab_size: Size of the target vocabulary (softmax width).
        inp_maxlen: Length of each source sequence.
        out_maxlen: Number of time steps the decoder emits.

    Returns:
        A ``Sequential`` model: embedding -> LSTM encoder -> repeated context
        vector -> sequence-returning LSTM decoder -> per-step dense head.
    """
    layers = [
        # Encoder: masked embedding followed by an LSTM that returns its final state.
        Embedding(inp_vocab_size, 512, input_length=inp_maxlen, mask_zero=True),
        LSTM(512),
        # Feed the encoder output to the decoder once per output time step.
        RepeatVector(out_maxlen),
        # Decoder: one output vector per time step.
        LSTM(512, return_sequences=True),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.3),
        # Per-step probability distribution over the target vocabulary.
        TimeDistributed(Dense(out_vocab_size, activation='softmax')),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model

In [None]:
# Instantiate the English -> French network and print its layer summary.
model = create_model(
    inp_vocab_size=eng_vocab_size,
    out_vocab_size=fra_vocab_size,
    inp_maxlen=max_eng_sent_len,
    out_maxlen=max_fra_sent_len,
)
model.summary()

In [None]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
# `metrics` is passed as a list: a bare string is rejected by newer Keras.
model.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer='adamax',
    metrics=['accuracy'],
)

# Stop when validation accuracy has not improved for 5 epochs.
es = EarlyStopping(monitor='val_accuracy',patience=5,mode='max',verbose=1)
# Cut the learning rate 10x after 3 stagnant epochs. The floor must sit below
# the optimizer's starting rate: Adamax defaults to 0.001, so the previous
# min_lr=0.001 meant the rate could never actually be reduced.
lr = ReduceLROnPlateau(monitor='val_accuracy',patience=3,mode='max',verbose=1,factor=0.1,min_lr=1e-5)

# Targets get a trailing axis of size 1: (samples, timesteps) -> (samples, timesteps, 1).
history = model.fit(X_train,
                    y_train.reshape(y_train.shape[0],y_train.shape[1],1),
                    epochs=6,
                    batch_size=512,
                    callbacks=[es,lr],
                    validation_data = (X_test,y_test.reshape(y_test.shape[0],y_test.shape[1],1))
                   )