In [2]:
import pandas as pd
import numpy as np

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential,Model
from keras.layers import GRU,LSTM, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Embedding,Dropout
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split


In [4]:
# Load the parallel corpus from a CSV in the working directory. The preview
# below shows an index column plus one English and one French text column.
df = pd.read_csv('eng_-french.csv')
# Last expression of the cell: renders the first five rows as a sanity check.
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [5]:
# Pull the source (English) and target (French) text columns out of the frame
# and confirm that both hold the same number of rows.
eng, frn = df['English words/sentences'], df['French words/sentences']
print(eng.shape, frn.shape, sep='\n')

(175621,)
(175621,)


In [6]:
def tokenize(data):
    """Fit a Keras ``Tokenizer`` on ``data`` and encode every sentence.

    Args:
        data: Iterable of sentence strings (a list or a pandas Series).
            It is NOT modified; cleaning happens on a private copy.

    Returns:
        tuple: ``(tokenned_data, tokenizer)`` where ``tokenned_data`` is a
        list with one list of integer word ids per sentence and
        ``tokenizer`` is the fitted ``Tokenizer``.
    """
    tokenizer = Tokenizer()

    # Replace commas with spaces in each sentence.
    # Build a new list instead of assigning back into ``data``: writing
    # ``data[i] = ...`` mutated the caller's DataFrame column, performed one
    # slow pandas label assignment per row, and only worked when the labels
    # were exactly 0..n-1.
    # NOTE(review): Tokenizer's default ``filters`` are documented to strip
    # commas as well; the explicit replacement is kept so the result does not
    # hinge on that default.
    cleaned = [sentence.replace(',', ' ') for sentence in data]

    tokenizer.fit_on_texts(cleaned)
    tokenned_data = tokenizer.texts_to_sequences(cleaned)  # (numbers assigned to each unique word)/tokens

    return tokenned_data, tokenizer


In [7]:
def pad(tokenned_data, maxlen=55):
  """Post-pad tokenised sentences with zeros to a common length.

  Prints the length of the longest sentence in ``tokenned_data`` (``-1`` when
  it is empty) so the chosen ``maxlen`` can be checked against the data.

  Args:
    tokenned_data: List of integer-id lists, one per sentence.
    maxlen: Target length of every padded row. Defaults to 55, the value
      that used to be hard-coded here. Pass ``None`` to let ``pad_sequences``
      pad to the longest sentence instead.

  Returns:
    The 2-D array produced by ``pad_sequences``, one row per sentence.
  """
  import warnings  # local import keeps this cell self-contained

  longest = max((len(sentence) for sentence in tokenned_data), default=-1)
  print(longest)

  # pad_sequences silently cuts sentences longer than ``maxlen``; make that
  # loss of tokens visible instead of letting it go unnoticed.
  if maxlen is not None and longest > maxlen:
    warnings.warn(
        "Longest sentence has %d tokens but maxlen=%d; longer sentences "
        "will be truncated." % (longest, maxlen))

  pad_data = pad_sequences(tokenned_data, maxlen=maxlen, padding='post', value=0.0)

  return pad_data

In [8]:
def preprocess(eng, frn):
  """Tokenise and pad both sides of the parallel corpus.

  Args:
    eng: English sentences (model input).
    frn: French sentences (model target).

  Returns:
    tuple: ``(pad_x, pad_y, x_tk, y_tk)`` - the padded English ids, the
    padded French ids with a trailing axis of size 1, and the fitted English
    and French tokenizers.
  """
  # English side: words -> ids -> fixed-length rows
  english_ids, x_tk = tokenize(eng)
  pad_x = pad(english_ids)

  # French side, same pipeline
  french_ids, y_tk = tokenize(frn)
  labels = pad(french_ids)

  # Keras's sparse_categorical_crossentropy function requires the labels to
  # be in 3 dimensions, so append a trailing axis of size 1.
  pad_y = labels.reshape(labels.shape + (1,))

  return pad_x, pad_y, x_tk, y_tk

In [9]:
# Run the full preprocessing pipeline: returns the padded English inputs, the
# padded French targets and the two fitted tokenizers. The two numbers printed
# below come from pad() and are the longest tokenised English and French
# sentence lengths, in that order.
pad_x,pad_y,tokenizer_eng,tokenizer_frn = preprocess(eng,frn)

44
55


In [10]:
# Padded sequence length is axis 1 of each padded array.
max_eng_sequence_length, max_frn_sequence_length = pad_x.shape[1], pad_y.shape[1]

# +1 because Keras' Tokenizer numbers words from 1, leaving id 0 free for padding.
english_vocab_size = 1 + len(tokenizer_eng.word_index)
french_vocab_size = 1 + len(tokenizer_frn.word_index)

# Report the figures that size the model's layers.
for label, value in (
    ("Max English sentence length:", max_eng_sequence_length),
    ("Max French sentence length:", max_frn_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(label, value)

Max English sentence length: 55
Max French sentence length: 55
English vocabulary size: 14532
French vocabulary size: 30661


Model

In [11]:
# Define input shape and output sequence length
input_shape = pad_x.shape[1:]
output_sequence_length = pad_y.shape[1]

# This architecture emits exactly one French token per input time step
# (return_sequences=True followed by TimeDistributed), so the padded English
# and French lengths must be identical. Fail here with a clear message rather
# than with a shape error in the middle of training.
assert input_shape[0] == output_sequence_length, (
    "Input length %d and target length %d must match for this model"
    % (input_shape[0], output_sequence_length))

# Define the model
model = Sequential()
# The sequence length is taken from the data instead of a hard-coded 55 so the
# model keeps matching the arrays if the padding length ever changes.
model.add(Embedding(input_dim=english_vocab_size, output_dim=256, input_length=input_shape[0]))
# Bidirectional GRU returning its output at every time step.
model.add(Bidirectional(GRU(256, return_sequences=True)))
# Per-time-step softmax over the French vocabulary.
model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

# Compile the model
learning_rate = 0.003
model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(learning_rate=learning_rate),
              metrics=['accuracy'])

# Print the model summary
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 55, 256)           3720192   
                                                                 
 bidirectional (Bidirection  (None, 55, 512)           789504    
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 55, 30661)         15729093  
 ributed)                                                        
                                                                 
Total params: 20238789 (77.20 MB)
Trainable params: 20238789 (77.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Final check of the training arrays: inputs are 2-D, targets carry the extra
# trailing axis added in preprocess().
print(pad_x.shape, pad_y.shape, sep='\n')

(175621, 55)
(175621, 55, 1)


In [None]:
# Train the model
batch_size = 64
epochs = 1

# Hold out 20% for validation with an explicit shuffled split.
# `validation_split=0.2` would instead take the LAST 20% of the rows without
# shuffling; the preview of the CSV starts with the shortest sentences, so the
# file looks ordered by length (TODO confirm) and the tail would not be a
# representative validation set. A fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    pad_x, pad_y, test_size=0.2, random_state=42)

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_val, y_val))



In [None]:
import pickle

# Save the tokenizers
# Both fitted tokenizers are pickled next to the notebook so that inference
# can reuse exactly the word <-> id mappings the model was trained with.
for pkl_path, fitted_tokenizer in (
    ('tokenizer_eng.pkl', tokenizer_eng),
    ('tokenizer_frn.pkl', tokenizer_frn),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_tokenizer, f)

In [None]:
# Save the model in native Keras format
# Writes the trained model to 'my_model.keras' in the working directory; the
# `.keras` extension is what selects the native format.
model.save('my_model.keras')

In [None]:
# Import from the same `keras` package the model was built and saved with;
# mixing `keras` and `tensorflow.keras` can fail when the two resolve to
# different Keras versions.
from keras.models import load_model

# Load the file this notebook actually writes with model.save('my_model.keras').
# 'translation_model.h5' is never created here, so loading it breaks a fresh
# Restart & Run All.
# NOTE(review): this cell duplicates the model load in the following cell and
# can be removed.
loaded_model = load_model('my_model.keras')

In [None]:
# Import from the same `keras` package the model was built and saved with;
# mixing `keras` and `tensorflow.keras` can fail when the two resolve to
# different Keras versions.
from keras.models import load_model
import pickle

# Load the saved model
# Use the file this notebook writes with model.save('my_model.keras');
# 'translation_model.h5' is never created here, so loading it breaks a fresh
# Restart & Run All.
loaded_model = load_model('my_model.keras')

# Load the tokenizers (replace 'tokenizer_eng.pkl' and 'tokenizer_frn.pkl'
# if you saved them under different filenames).
# SECURITY: pickle.load can execute arbitrary code - only open pickle files
# that you created yourself.
with open('tokenizer_eng.pkl', 'rb') as f:
    tokenizer_eng = pickle.load(f)
with open('tokenizer_frn.pkl', 'rb') as f:
    tokenizer_frn = pickle.load(f)


In [None]:
def preprocess_input(text, tokenizer):
    """Encode one sentence the same way the training inputs were encoded.

    Args:
        text: A single sentence string.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.

    Returns:
        The ``pad_sequences`` result for this one sentence, post-padded to
        the notebook-level ``max_eng_sequence_length``.
    """
    # texts_to_sequences expects a batch, so wrap the sentence in a list.
    token_ids = tokenizer.texts_to_sequences([text])
    return pad_sequences(token_ids, maxlen=max_eng_sequence_length, padding='post')


In [None]:
# Define your input text (replace this with your desired input)
custom_input = "my name is sakya"

# Preprocess the input text for translation
preprocessed_input = preprocess_input(custom_input, tokenizer_eng)

# Make predictions using the loaded model: one probability distribution over
# the French vocabulary per time step of each input sentence.
translated_sequence = loaded_model.predict(preprocessed_input)

# Get the index of the word with the highest probability for each step in the
# sequence (a single vectorised argmax over the vocabulary axis).
translated_sequence_indices = np.argmax(translated_sequence, axis=-1)

# Convert the indices back to text using the French tokenizer.
# Ids without a word (notably the padding id 0) map to '' and are dropped;
# otherwise every padded time step adds a stray space to the joined output.
translated_text_list = [
    word
    for word in (tokenizer_frn.index_word.get(int(idx), '')
                 for idx in translated_sequence_indices[0])
    if word
]

# Join the individual text segments to form the translated text
translated_text = ' '.join(translated_text_list)

# Print the input and translated text
print("Input:", custom_input)
print("Translated:", translated_text)