In [2]:
!pip install gdown 

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
[0m

In [3]:
!gdown --id 16nkAYoYkGRQk_bFY8z4E1087z0dElQbo

Downloading...
From: https://drive.google.com/uc?id=16nkAYoYkGRQk_bFY8z4E1087z0dElQbo
To: /kaggle/working/eng_-french.csv
100%|███████████████████████████████████████| 12.5M/12.5M [00:00<00:00, 206MB/s]


In [240]:
import pandas as pd 
import numpy as np 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from tensorflow.keras import optimizers

In [241]:
data = pd.read_csv("/kaggle/working/eng_-french.csv")
data

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [242]:
data['English words/sentences count'] = data['English words/sentences'].str.split().str.len()

In [243]:
data['French words/sentences count'] = data['French words/sentences'].str.split().str.len()

In [244]:
data

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3
...,...,...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç...",34,47
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...,34,33
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...,37,47
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...,43,49


In [245]:
#removing all the sentences greater than 6 words  
data = data[(data['English words/sentences count']<6) & (data['French words/sentences count']<6)]

In [246]:
data

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3
...,...,...,...,...
153098,Tom's great-great-grandfather was a pirate.,L'arrière-arrière-grand-père de Tom était pirate.,5,5
153107,"Unfortunately, the information is accurate.","Malheureusement, l'information est exacte.",5,4
154934,They're having a going-out-of-business sale.,Ils ont une liquidation.,5,4
154962,This theory is scientifically controversial.,Cette théorie est scientifiquement controversée.,5,5


In [247]:
data['English words/sentences count'].max()

5

In [248]:
#hete the duplicates are zero 
data.duplicated().sum()

0

In [249]:
data = data.drop_duplicates()

# **Diving the Data in to Train Test Validate** 

In [250]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(data,test_size = 0.2,random_state = 42 )

In [251]:
train,valid = train_test_split(train,test_size= 0.2)

## Word Frequency

In [252]:
freq_x= train['English words/sentences'].str.split(expand=True).stack().value_counts().reset_index()
freq_y= train['French words/sentences'].str.split(expand=True).stack().value_counts().reset_index()
freq_x.to_csv('/kaggle/working/English_freq.csv',index=False) 
freq_y.to_csv('/kaggle/working/French_freq.csv',index=False) 

In [253]:
freq_x = pd.read_csv('/kaggle/working/English_freq.csv')
freq_y = pd.read_csv('/kaggle/working/French_freq.csv')

In [254]:
freq_x

Unnamed: 0,index,0
0,I,5994
1,you,3393
2,a,3328
3,is,2943
4,the,2292
...,...,...
10421,Kissing,1
10422,forceful,1
10423,ruins,1
10424,Diet,1


In [255]:
freq_y

Unnamed: 0,index,0
0,?,5204
1,Je,4625
2,est,2320
3,Il,2249
4,pas,2197
...,...,...
16322,draguer,1
16323,retrouver,1
16324,rentres,1
16325,Pensez,1


## **Data Prepration**

In [256]:
def get_data(raw_lines) : 
    text = [] 
    for raw_line in raw_lines:
        text.append('<start> '+ raw_line +' <end>')
    return text 

In [257]:
#english_train
#French_train
english_train = get_data(list(train['English words/sentences']))
french_train = get_data(list(train['French words/sentences']))

In [258]:
english_valid = get_data(list(valid['English words/sentences']))
french_valid = get_data(list(valid['French words/sentences']))

# **Tokenization**

In [259]:
#tokenisation and padding Input language 
fre_token = Tokenizer(filters='',lower=False)
fre_token.fit_on_texts(french_train)
fre_tokenized = fre_token.texts_to_sequences(french_train)
fre_padded = pad_sequences(fre_tokenized,padding='post')

#tokenisation and padding language language 
eng_token = Tokenizer(filters='',lower=False)
eng_token.fit_on_texts(english_train)
eng_tokenized = fre_token.texts_to_sequences(english_train)
eng_padded = pad_sequences(eng_tokenized,padding='post')

In [260]:
#number of input tokens in the input & output language
num_op_tokens = len(fre_token.word_index)
num_ip_tokens = len(eng_token.word_index)

In [261]:
print(num_op_tokens)
print(num_ip_tokens)

17221
10428


In [262]:
#maximum length of sentence in both sentences 
max_len_op = fre_padded.shape[1]
max_len_ip = eng_padded.shape[1]


In [263]:
print(max_len_op)
print(max_len_ip)

7
6


# **KText**

In [264]:
#from tensorflow.keras.preprocessing.sequence import pad_sequences
#from ktext.preprocess import processor

In [265]:
#english_pp = processor(keep_n=10429, padding_maxlen=6)
#english_train_vecs english_pp.fit_transform(english_train)

In [266]:
# # Title
# french_pp = processor (append_indicators=True, keep_n=17226, padding_maxlen=7, padding = 'post')
# # process the title data french_train
# import dill as_dpickle
# import numpy as np
# # Save the preprocessor
# with open('english_pp.dpkl', 'wb') as f: 
#     dpickle.dump(english_pp, f)
# with open('french_pp.dpk1', 'wb') as f: 
#     dpickle.dump(french_pp, f)
# # Save the processed data
# np.save('french_train_vecs.npy', french_train_vecs) 
#np.save('english_train_vecs.npy', english_train_vecs)_vecs french_pp.fit_transform(french_train)

In [267]:
# def load_decoder_inputs(decoder_np_vecs= 'french_train_vecs.npy'):
#     vectorized_title = np.load(decoder_np_vecs)
#     # For Decoder Input, you don't need the last word as that is only for prediction # when we are training using Teacher Forcing. 
#     decoder_input_data = vectorized_title[:, -1]
#     # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing) 
#     decoder_target_data = vectorized_title[:, 1:]
#     print(f'Shape of decoder input: {decoder_input_data.shape}') 
#     print(f'Shape of decoder target: {decoder_target_data.shape}') 
#     return decoder_input_data, decoder_target_data

In [268]:
# def load_encoder_inputs(encoder_np_vecs= 'english_train_vecs.npy'):
#     vectorized_body = np.load(encoder_np_vecs)
#     # Encoder input is simply the body of the issue text encoder_input_data = vectorized_body
#     doc_length = encoder_input_data.shape[1] 
#     print (f'Shape of encoder input: {encoder_input_data.shape}') 
# return encoder_input_data, doc_length
   

In [269]:
# def load_text_processor (fname='english_pp.dpk1'):
# # Load files from disk 
# with open(fname, 'rb') as f:
#     pp dpickle.load(f)
# num_tokens = max(pp.id2token.keys()) + 1 
# print (f'Size of vocabulary for {fname}: {num_tokens:,}') 
# return num_tokens, pp


In [270]:
# num_encoder_tokens, english_pp = load_text_processor('english_pp.dpk1') 
# num_decoder_tokens, french_pp = load_text_processor('french_pp.dpk1')

# Machine Translation Model

In [271]:
latent_dim = 256

# Encoder model 
encoder_inputs = Input(shape=(max_len_ip,), name='Encoder-Input')

# Word embedding for encoder 
x = Embedding(num_ip_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x) 

# Intermediate GRU layer(optional)
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just
# encode without decoding if we want to 
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs) 

In [272]:
# Decoder Model 
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # For teacher forcing 

# Word Embedding for the Decoder 
dec_emb = Embedding(num_op_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `decoder_state_inputs` as initial state
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction 
decoder_dense = Dense(num_op_tokens, activation='softmax',name="Final-Output-Dense")
decoder_outputs = decoder_dense (x)

In [273]:
#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out]) 
seq2seq_Model = Model ([encoder_inputs, decoder_inputs], decoder_outputs)

In [274]:
seq2seq_Model.summary()

Model: "model_27"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Decoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Decoder-Word-Embedding (Embedd  (None, None, 256)   4408576     ['Decoder-Input[0][0]']          
 ing)                                                                                             
                                                                                                  
 Encoder-Input (InputLayer)     [(None, 6)]          0           []                               
                                                                                                  
 Decoder-Batchnorm-1 (BatchNorm  (None, None, 256)   1024        ['Decoder-Word-Embedding[0

In [275]:
# Choose the data for training
encoder_input_train = eng_padded
decoder_input_train = fre_padded[:, :-1]
decoder_target_train = fre_padded[:, 1:]

# Train the model
batch_size = 64
epochs = 30
seq2seq_Model.compile(optimizer=optimizers.Adam(lr=0.001), loss='sparse_categorical_crossentropy')
seq2seq_Model.fit([encoder_input_train, decoder_input_train], decoder_target_train,
                  batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/30
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x78711d5572e0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: lineno is out of bounds
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


<keras.callbacks.History at 0x786fdf7e4af0>

In [276]:
del seq2seq_Model

In [278]:
#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
seq2seq_Model = Model ([encoder_inputs, decoder_inputs], decoder_outputs) 
seq2seq_Model.compile(optimizer=optimizers.Adam(lr=0.001), loss='sparse_categorical_crossentropy')

In [279]:
seq2seq_Model.summary()

Model: "model_28"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Decoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Decoder-Word-Embedding (Embedd  (None, None, 256)   4408576     ['Decoder-Input[0][0]']          
 ing)                                                                                             
                                                                                                  
 Encoder-Input (InputLayer)     [(None, 6)]          0           []                               
                                                                                                  
 Decoder-Batchnorm-1 (BatchNorm  (None, None, 256)   1024        ['Decoder-Word-Embedding[0

In [281]:
#seq2seq_Model.load_weights ('tutorial_seq2seq.epoch23-val1.78877.hdf5')

In [292]:
def extract_encoder_model(model):
    encoder_model = model.get_layer('Encoder-Model')
    return encoder_model

def extract_decoder_model(model):
    # the Latent dimension is the same throughout the architecture so we are going to
    # cheat and grab the latent dimension of the embedding because that is the same as who
    #output from the decoder
    latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]
    # Reconstruct the input into the decoder 
    decoder_inputs = model.get_layer ('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding') (decoder_inputs) 
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)
   
    gru_inference_state_input = Input(shape=latent_dim,name='hidden_state_input')
    gru_out, gru_state_out = model.get_layer('Decoder-GRU') ([dec_bn, gru_inference_state_input])
    
    # Reconstruct dense Layers
    dec_bn2 = model.get_layer ('Decoder-Batchnorm-2')(gru_out) 
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2) 
    decoder_model = Model ([decoder_inputs, gru_inference_state_input], [dense_out, gru_state_out])
    return decoder_model

### **Sequence2Sequence Inference**

In [381]:
class Seq2Seq_Inference(object):
    def __init__(self, encoder_preprocessor, decoder_preprocessor, seq2seq_model):
        self.pp_english = encoder_preprocessor
        self.pp_french = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_french = len(self.pp_french.word_index)+1
        self.nn = None
        self.rec_df = None

    def generate_french(self, raw_input_text, max_len_french=None):
        if max_len_french is None:
            max_len_french = self.default_max_len_french
        raw_tokenized = self.pp_english.texts_to_sequences(raw_input_text)
        temp = pad_sequences(raw_tokenized,padding='post',maxlen=6)
        body_encoding = self.encoder_model.predict(temp)
    
        body_encoding = body_encoding[0]
        
        print(body_encoding.shape)
        state_value = np.array(self.pp_french.texts_to_sequences(['<start>'])).reshape(1, 1)
        print(state_value.shape)
        decoded_sentence = []
        stop_condition = False

        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, body_encoding])
            pred_idx = np.argmax(preds[:, :, 2:]) + 2
            pred_word_str = self.pp_french.sequences_to_texts[pred_idx]
            if pred_word_str == '<end>' or len(decoded_sentence) >= max_len_french:
                stop_condition = True
                break

            decoded_sentence.append(pred_word_str)
            # Update the decoder for the next word
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)

    def print_example(self, i, english_text, french_text=None, threshold=None):
        if i:
            print('\n\n====================================')
            print(f'==============Example # {i}==============')

        print(f"English Text:\n{english_text} \n")

        if french_text:
            print(f"Original French Text:\n{french_text}")

        emb, gen_french = self.generate_french(english_text)
        print(f"\n**********Generated French Text***********:\n{gen_french}")
        
        
        if self.nn:
            # return neighbors and distances
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4, include_distances=True)
            neighbors =n[1:]
            dist = d[1:]
            if min(dist) <= threshold:
                cols = ['French', 'English']
                dfcopy = self.rec_df.iloc[neighbors ] [cols].copy(deep=True)
                dfcopy ['dist'] = dist
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')
                display(similar_issues_df)
                
    def demo_model_predictions(self,n,df,threshold=None):
        # Extract body and title from DF
        english_text_list = df['English words/sentences'].tolist() 
        french_text_list = df['French words/sentences'].tolist()
        demo_list = np.random.randint(low=1, high=len(english_text_list), size=n)
        for i in demo_list:
            self.print_example(i,english_text=english_text_list[i], french_text=french_text_list[i], threshold=threshold)

In [382]:
seq2seq_inf = Seq2Seq_Inference (encoder_preprocessor=eng_token, decoder_preprocessor=fre_token, seq2seq_model=seq2seq_Model)
# this method displays the predictions on random rows of the holdout set

In [383]:
# this method displays the predictions on random rows of the holdout set
seq2seq_inf.demo_model_predictions(n=50, df=test, threshold=0.3)




English Text:
I know what they are. 

Original French Text:
Je sais ce qu'elles sont.
(256,)
(1, 1)


ValueError: Data cardinality is ambiguous:
  x sizes: 1, 256
Make sure all arrays contain the same number of samples.