In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import string
import json
import re

In [2]:
df = pd.read_csv('data/english_french.csv',index_col = 0)

In [3]:
df.head()

Unnamed: 0,English,French
0,We do not have any choice.,Nous n'avons pas le choix.
1,I like that it is soft.,J'aime que ce soit doux.
2,Was there an earthquake?,Y a-t-il eu un tremblement de terre ?
3,They say he is sick.,Elles disent qu'il est malade.
4,You should always wear a seat belt when you ar...,On devrait toujours mettre une ceinture lorsqu...


In [4]:
df.isnull().any()

English    False
French     False
dtype: bool

In [5]:
text_pairs =list(zip(df['English'],df['French']))
random.seed(42)
random.shuffle(text_pairs)
print(text_pairs[:5])

[('I did not want to give up.', 'Je ne voulais pas abandonner.'), ('What is happening?', "Qu'est-ce qui se passe\u202f?"), ('Protesters tried to disrupt the meeting.', 'Les protestataires tentèrent de perturber la réunion.'), ('You could not have picked a better spot.', "Vous n'auriez pas pu choisir un meilleur endroit."), ('That looks like it hurts.', 'On dirait que ça fait mal.')]


In [6]:
num_val_samples = int(0.1 * len(text_pairs))
num_train_samples = len(text_pairs) -  2*num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples+num_val_samples]
test_pairs = text_pairs[num_train_samples+num_val_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f'{len(test_pairs)} test pairs')

108742 total pairs
86994 training pairs
10874 validation pairs
10874 test pairs


In [7]:
list1 = list(df['English'])
list1 = [word.strip().strip('!').strip('.').strip(',').strip('?').lower() for sentence in list1 for word in sentence.split(' ') if len(word)]

print('approx english vocab:',len(set(list1)))

list2 = list(df['French'])
list2 = [word.strip('!').strip('.').strip(',').strip('?') for sentence in list2 for word in sentence.split(' ')]
print('approx french vocab:',len(set(list2)))

approx english vocab: 16443
approx french vocab: 29092


In [8]:
for i in range(10,20):
    print(list1[i*10:i*10+10])

['a', 'relatively', 'safe', 'neighborhood', 'tom', 'and', 'mary', 'were', 'on', 'the']
['same', 'wavelength', 'kripananda', 'this', 'place', 'is', 'boring', 'what', 'is', 'your']
['favorite', 'toothpaste', 'qualitised', 'postulated', 'complement', 'why', 'do', 'not', 'you', 'tell']
['me', 'what', 'you', 'want', 'to', 'hear', 'someone', 'cleaned', 'my', 'room']
['while', 'i', 'was', 'gone', 'he', 'lived', 'alone', 'in', 'the', 'forest']
['i', 'feel', 'bad', 'that', 'i', 'have', 'not', 'paid', 'you', 'yet']
['are', 'you', 'already', 'married', 'i', 'am', 'going', 'to', 'drive', 'myself']
['it', 'has', 'a', 'leak', 'whether', 'shakespeare', 'wrote', 'this', 'poem', 'or']
['not', 'will', 'probably', 'remain', 'a', 'mystery', 'you', 'are', 'safe', 'here']
['i', 'prefer', 'to', 'travel', 'alone', 'classifies', 'we', 'appreciate', 'your', 'kind']


In [9]:
for i in range(10,20):
    print(list2[i*10:i*10+10])

['vous', 'demander', 'une', 'dernière', 'faveur', 'Je', 'vis', 'dans', 'un', 'quartier']
['relativement', 'sûr', 'Tom', 'et', 'Marie', 'étaient', 'sur', 'la', 'même', 'longueur']
['d’onde', 'Kripananda', 'Cet', 'endroit', 'est', 'ennuyeux', 'Quel', 'est', 'votre', 'dentifrice']
['préféré\u202f', 'qualifié', 'postulé', 'complément', 'Pourquoi', 'ne', 'me', 'dites-vous', 'pas', 'ce']
['que', 'vous', 'voulez', 'entendre', '', "Quelqu'un", 'nettoya', 'ma', 'chambre', 'pendant']
['que', "j'étais", 'parti', 'Il', 'a', 'vécu', 'seul', 'dans', 'la', 'forêt']
['Je', 'me', 'sens', 'mal', 'de', 'ne', 'pas', 'encore', "t'avoir", 'payé']
['Êtes-vous', 'déjà', 'mariée', '', 'Je', 'vais', 'conduire', 'par', 'mes', 'propres']
['moyens', 'Elle', 'fuit', 'Si', 'Shakespeare', 'a', 'écrit', 'ce', 'poème', 'ou']
['pas', 'restera', 'probablement', 'une', 'énigme', 'Vous', 'êtes', 'en', 'sécurité', 'ici']


In [10]:
def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[^ a-z.?!,¿]", "")
    text = tf.strings.regex_replace(text, "[.?!,¿]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text

In [11]:
#some extra characters in french that are not in english
my_extra_list = ['À', 'Â', 'Ç', 'É', 'Ê', 'Ô', 'à', 'á', 'â', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ù', 'û', 'ü', 'ō', 'œ']
my_extra_list = [i.lower() for i in my_extra_list]
my_extra_list = list(set(my_extra_list))

my_extra_string = ''.join(my_extra_list)
print(my_extra_string.lower())

üáîōœôëéçïùàûâèê


In [12]:
def tf_lower_and_split_punct_2(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[^ a-zçèœêâôïûùüàáōëîé.?!,¿]", "")
    text = tf.strings.regex_replace(text, "[.?!,¿]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text

In [13]:
vocab_size = 15000
sequence_length = 20
batch_size = 64

In [14]:
# vectorization
english_vectorization = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length,
    standardize=tf_lower_and_split_punct
)

french_vectorization = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length+1,
    standardize=tf_lower_and_split_punct_2
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_fre_texts = [pair[1] for pair in train_pairs]

english_vectorization.adapt(train_eng_texts)
french_vectorization.adapt(train_fre_texts)

In [15]:
#save the vectorization layers

english_vocab = english_vectorization.get_vocabulary()
    
with open('text_vectorization_files/english_vocab_for_eng_fre.json', 'w', encoding='utf-8') as f:
    json.dump(english_vocab, f)
    
french_vocab = french_vectorization.get_vocabulary()
    
with open('text_vectorization_files/french_vocab_for_eng_fre.json', 'w', encoding='utf-8') as f:
    json.dump(french_vocab, f)

In [16]:
print(len(english_vocab))
print(english_vocab[:20])
print(french_vocab[:20])

15000
['', '[UNK]', '[SOS]', '[EOS]', '.', 'i', 'you', 'to', 'the', 'is', '?', 'not', 'a', 'do', 'that', 'are', 'it', 'have', 'tom', 'he']
['', '[UNK]', '[SOS]', '[EOS]', '.', 'je', 'de', 'pas', '?', 'que', 'ne', 'à', 'la', 'le', 'vous', 'il', 'tom', 'est', ',', 'a']


In [17]:
list1 = [x for x in english_vocab if len(x)==10]
len(list1)

5606

In [18]:
def format_dataset(eng, spa):
    eng = english_vectorization(eng)
    fre = french_vectorization(spa)
    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": fre[:, :-1],
        },
        fre[:, 1:],
    )
    
def make_dataset(pairs):
    eng_texts, fre_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    fre_texts = list(fre_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, fre_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)
test_ds = make_dataset(test_pairs)

In [20]:
for inp,tar in train_ds.take(1):
    print(inp['encoder_inputs'][:2])
    print(inp['decoder_inputs'][:2])
    print(tar[:2])

tf.Tensor(
[[   2   35  277   12 3328 1189    4    3    0    0    0    0    0    0
     0    0    0    0    0    0]
 [   2   69   54  552 1218    7  342   25    5   75   34  123  121  165
   187  159    4    3    0    0]], shape=(2, 20), dtype=int64)
tf.Tensor(
[[   2   34   19  314   26 4356 4333    4    3    0    0    0    0    0
     0    0    0    0    0    0]
 [   2   49   31   42  338  793   13  517   18    5  201   66    6  649
    11   12  103  185  210    4]], shape=(2, 20), dtype=int64)
tf.Tensor(
[[  34   19  314   26 4356 4333    4    3    0    0    0    0    0    0
     0    0    0    0    0    0]
 [  49   31   42  338  793   13  517   18    5  201   66    6  649   11
    12  103  185  210    4    3]], shape=(2, 20), dtype=int64)


In [21]:
vocab_size = 15000
units_1 = 128

In [22]:
class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self,embed_dim,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim

        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim = embed_dim
        )
        self.dense_proj = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(dense_dim,activation = 'relu'),
                tf.keras.layers.Dense(embed_dim)
            ]
        )
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self,inputs,mask = None):
        if mask is not None:
            padding_mask = tf.cast(mask[:,None,:],dtype = tf.int32)
        else:
            padding_mask = None
        attention_output = self.mha(
            query = inputs,
            value = inputs,
            key = inputs,
            attention_mask = padding_mask
        )
        proj_input = self.layernorm_1(attention_output+inputs)
        proj_output = self.dense_proj(proj_input)
        output = self.layernorm_2(proj_input + proj_output)
        return output
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim':self.embed_dim,
            'dense_dim':self.dense_dim,
            'num_heads': self.num_heads,
        })
        return config

In [23]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,sequence_length,vocab_size,embed_dim,**kwargs):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        
        self.token_embeddings = tf.keras.layers.Embedding(
            input_dim = vocab_size,
            output_dim  = embed_dim
        )

        self.position_embeddings = tf.keras.layers.Embedding(
            input_dim = sequence_length,
            output_dim = embed_dim
        )

    def call(self,inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start = 0,limit = length,delta =1)
        embeded_tokens = self.token_embeddings(inputs)
        embeded_position = self.position_embeddings(positions)
        return embeded_tokens + embeded_position
    
    def compute_mask(self,inputs,mask =None):
        if mask is not None:
            return tf.not_equal(inputs,0)
        else:
            return None
    def get_config(self):
        config = super().get_config()
        config.update({
            'vocab_size': self.vocab_size,
            'sequence_length': self.sequence_length,
            'embed_dim': self.embed_dim,
        })
        return config

In [24]:
class TransformerDecoder(tf.keras.layers.Layer):
    def __init__(self,embed_dim,latent_dim,num_heads,**kwargs):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads

        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads = num_heads,key_dim=embed_dim)

        self.dense_proj = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(latent_dim,activation = 'relu'),
                tf.keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.layernorm_3 = tf.keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self,inputs,encoder_outputs,mask=None):
        casual_mask = self.get_casual_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:,None,:],dtype = tf.int32)
            padding_mask = tf.minimum(padding_mask,casual_mask)
        else:
            padding_mask = None

        attention_output1 = self.mha1(
            query = inputs,
            value = inputs,
            key = inputs,
            attention_mask = casual_mask
        )

        out_1 = self.layernorm_1(inputs + attention_output1)

        attention_output2 = self.mha2(
            query = out_1,
            value = encoder_outputs,
            key = encoder_outputs,
            attention_mask = padding_mask,
        )

        out_2 = self.layernorm_2(out_1 + attention_output2)
        proj_output = self.dense_proj(out_2)
        output = self.layernorm_3(proj_output + out_2)

        return output
    
    def get_casual_attention_mask(self,inputs):
        input_shape = tf.shape(inputs)
        batch_size,sequence_length = input_shape[0],input_shape[1]
        i = tf.range(sequence_length)[:,None]
        j = tf.range(sequence_length)
        mask = tf.cast(i>=j,tf.int32)
        mask = tf.reshape(mask,(1,input_shape[1],input_shape[1]))
        mult = tf.concat(
            [
                tf.expand_dims(batch_size,-1),
                tf.convert_to_tensor([1,1]),
            ],
            axis =0,
        )
        return tf.tile(mask,mult)
    

    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'latent_dim': self.latent_dim,
            'num_heads': self.num_heads
        })
        return config


In [29]:
# define emmbedding dimensions, latent dimensions, and number of heads
embed_dim = 100
latent_dim = 256
num_heads = 2

#Encoder
encoder_inputs = tf.keras.Input(shape = (None,), dtype = "int64", name = "encoder_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)

encoder = tf.keras.Model(encoder_inputs, encoder_outputs, name = "encoder")

#Decoder
decoder_inputs = tf.keras.Input(shape = (None,), dtype = "int64", name = "decoder_inputs")
encoder_seq_inputs = tf.keras.Input(shape = (None, embed_dim), name = "encoder_seq_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)

x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoder_seq_inputs)

x = tf.keras.layers.Dropout(0.5)(x)

decoder_outputs = tf.keras.layers.Dense(vocab_size, activation = tf.nn.log_softmax)(x)

decoder = tf.keras.Model([decoder_inputs, encoder_seq_inputs], decoder_outputs, name = "decoder")

# Define the final model
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = tf.keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name = "transformer"
)


In [30]:
logits = transformer((inp['encoder_inputs'],inp['decoder_inputs']))
logits.shape

TensorShape([64, 20, 15000])

In [31]:
epochs = 1
losses = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
transformer.summary()

transformer.compile(
    "adam", loss = losses, metrics = ["accuracy"]
)
# transformer.fit(train_ds, epochs = epochs, validation_data = val_ds)


In [32]:
transformer.load_weights('model_weights/english_french_model.weights.h5')

  saveable.load_own_variables(weights_store.get(inner_path))


In [41]:
my_df = pd.read_csv('data/english_french_10_letter.csv',index_col=0)
my_df.head()

Unnamed: 0,English,French
0,receptions,réceptions
1,recessions,récessions
2,vindicated unoccupied,justifié inoccupé
3,diagonally experionce,en diagonale expérience
4,airrelated,lié à l'air


In [42]:
test_pairs_1= list(zip(my_df['English'],my_df['French']))
test_pairs_1[:5]

[('receptions', 'réceptions'),
 ('recessions', 'récessions'),
 ('vindicated unoccupied', 'justifié inoccupé'),
 ('diagonally experionce', 'en diagonale expérience'),
 ('airrelated', "lié à l'air")]

In [43]:
french_vocab = french_vectorization.get_vocabulary()
french_index_lookup = dict(zip(range(len(french_vocab)), french_vocab))
max_decoded_sentence_length = sequence_length

def decode_sentence(input_sentence):
    tokenized_input_sentence = english_vectorization([input_sentence])
    decoded_sentence = "[SOS]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = french_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = tf.argmax(predictions[0, i, :]).numpy().item(0)
        sampled_token = french_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[EOS]":
            break
    return decoded_sentence

my_list1 = []
for _ in range(5):
    input_sentence,output_sentence = random.choice(test_pairs_1)
    my_list1.append(input_sentence)
    input_sentence = input_sentence.lower()
    translated = decode_sentence(input_sentence)
    print(f"input: {input_sentence}")
    print(f'correct_translation: {output_sentence}')
    print(f"translated: {translated}")
    print()

input: autonomous
correct_translation: autonome
translated: [SOS] autonome [EOS]

input: arunachala
correct_translation: arunachala
translated: [SOS] arunachala [EOS]

input: authorship
correct_translation: paternité
translated: [SOS] paternité [EOS]

input: aspiration
correct_translation: aspiration
translated: [SOS] aspiration [EOS]

input: scavengers
correct_translation: charognards
translated: [SOS] charognards [EOS]



In [38]:
#some elembents
my_list = ['collection deflection remissions articulate terrrorist catwalking celebrated',
           'bharatendu mujahideen mobilizing interrupts transpires classmates jahawarlal',
            'evangelist habituated ascendancy currencies enactments contracted', 
            'switchover vaisheshik assistance aparbrahma undetected applicants', 
            'mistresses suchindram recuperate prosecuted deforested glorifying', 
            'inoculated pleasantly thereafter revengeful humiliated vibheeshan parushuram kurukhatra',
            'portuguese electicity shrotyagya',]

In [2]:
' '.join(my_list)

'collection deflection remissions articulate terrrorist catwalking celebrated bharatendu mujahideen mobilizing interrupts transpires classmates jahawarlal evangelist habituated ascendancy currencies enactments contracted switchover vaisheshik assistance aparbrahma undetected applicants mistresses suchindram recuperate prosecuted deforested glorifying inoculated pleasantly thereafter revengeful humiliated vibheeshan parushuram kurukhatra dharmshala pakisthaan swapurusha portuguese electicity shrotyagya'