In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import re
import random

In [2]:
path ="/kaggle/input/french-tamil-google-translations/french_tamil_0.csv"

In [3]:
df = pd.read_csv(path,index_col = 'Unnamed: 0')

print('length of df: ',len(df))

length of df:  125548


In [4]:
my_list1 = list(df['French'])
my_list1 = [x for y in my_list1 for x in y.split(' ')]
my_list1 = list(set(my_list1))

my_list2 = list(df['Tamil'])
my_list2 = [x for y in my_list2 for x in y.split(' ')]
my_list2 = list(set(my_list2))

print('approx length of french vocabulary: ',len(my_list1))
print('approx size of five_letter_words in fre_vocabulary',len([x for x in my_list1 if len(x)==5]))
print('approx length of Tamil vocabulary: ',len(my_list1))


approx length of french vocabulary:  31609
approx size of five_letter_words in fre_vocabulary 2977
approx length of Tamil vocabulary:  31609


In [5]:
df.head()

Unnamed: 0,French,Tamil
0,Tout le monde n’est pas comme toi.,எல்லோரும் உங்களைப் போல் இல்லை.
1,L'avion volait au-dessus des nuages.,விமானம் மேகங்களுக்கு மேல் பறந்து கொண்டிருந்தது.
2,Reste loin de moi.,என்னை விட்டு விலகி இரு.
3,Où as-tu eu ça ?,இது எங்கிருந்து கிடைத்தது?
4,Je n'avais pas le choix.,எனக்கு வேறு வழியில்லை.


In [6]:
my_list = list(df['Tamil'].apply(lambda x: re.findall('[a-zA-Z]+',x)))
print('rows that contain english alphabets in tamil: ',len([x for x in my_list if len(x)]))
indexes = []
for i,j in enumerate(my_list):
    if len(j):
        # print(j)
        indexes.append(i)
df.drop(df.index[indexes],inplace =True,axis = 0)

#cross checking once again
my_list = list(df['Tamil'].apply(lambda x: re.findall('[a-zA-Z]+',x)))
print('rows that contain english alphabets in tamil after elimination: ',len([x for x in my_list if len(x)]))


rows that contain english alphabets in tamil:  388
rows that contain english alphabets in tamil after elimination:  0


In [7]:
french_list = df['French'].tolist()
tamil_list = df['Tamil'].tolist()
print(french_list[:5])
print(tamil_list[:5])

['Tout le monde n’est pas comme toi.', "L'avion volait au-dessus des nuages.", 'Reste loin de moi.', 'Où as-tu eu ça ?', "Je n'avais pas le choix."]
['எல்லோரும் உங்களைப் போல் இல்லை.', 'விமானம் மேகங்களுக்கு மேல் பறந்து கொண்டிருந்தது.', 'என்னை விட்டு விலகி இரு.', 'இது எங்கிருந்து கிடைத்தது?', 'எனக்கு வேறு வழியில்லை.']


In [8]:
text_pairs =list(zip(french_list,tamil_list))
random.seed(42)
random.shuffle(text_pairs)
text_pairs = text_pairs[:100000]
print(text_pairs[:5])

[('Appelez la police!', 'காவல்துறையை அழைக்கவும்!'), ('Être conscient de ce que nous mangeons et de quelle quantité est essentiel à une bonne santé.', 'நாம் என்ன சாப்பிடுகிறோம், எவ்வளவு சாப்பிடுகிறோம் என்பதை அறிந்திருப்பது நல்ல ஆரோக்கியத்திற்கு அவசியம்.'), ('Elle a enseigné la musique pendant trente ans.', 'முப்பது வருடங்கள் இசை கற்பித்தார்.'), ('Tom a perdu contact avec Mary.', 'டாம் மேரி உடனான தொடர்பை இழந்தார்.'), ('Tom veut que son père soit enterré à côté de sa mère.', 'டாம் தனது தந்தையை தனது தாயின் அருகில் அடக்கம் செய்ய விரும்புகிறார்.')]


In [9]:

num_val_samples = int(0.1 * len(text_pairs))
num_train_samples = len(text_pairs) -  2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples+num_val_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f'{len(test_pairs)} test pairs')

100000 total pairs
80000 training pairs
10000 validation pairs
10000 test pairs


In [10]:
vocab_size_1 = 10000
vocab_size_2 = 12000
batch_size = 64

In [11]:
def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[^ a-z.?!,¿]", "")
    text = tf.strings.regex_replace(text, "[.?!,¿]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text

In [12]:
def tf_lower_and_split_punct_1(text):
    text = tf.strings.lower(text)
    text = tf.strings.strip(text)
    text = tf.strings.regex_replace(text, "[.?!,¿]", r" \0 ")
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text

In [13]:
# vectorization
fre_vectorization = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size_1,
    output_mode = "int",
    ragged = True,
    standardize=tf_lower_and_split_punct
)

tam_vectorization = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size_2,
    output_mode = "int",
    ragged = True,
    standardize=tf_lower_and_split_punct_1
)

train_fre_texts = [pair[0] for pair in train_pairs]
train_tam_texts = [pair[1] for pair in train_pairs]

fre_vectorization.adapt(train_fre_texts)
tam_vectorization.adapt(train_tam_texts)

#

In [14]:
# #save the vectorization layers
# fre_vectorization_config = fre_vectorization.get_config()
# fre_vectorization_config.pop('standardize', None)
fre_vocab = fre_vectorization.get_vocabulary()
# with open('fre_vectorization_config.json', 'w', encoding='utf-8') as f:
#     json.dump(fre_vectorization_config, f)
    
# with open('fre_vocab.json', 'w', encoding='utf-8') as f:
#     json.dump(fre_vocab, f)
    
# tam_vectorization_config = tam_vectorization.get_config()
# tam_vectorization_config.pop('standardize', None)
tam_vocab = tam_vectorization.get_vocabulary()
# with open('tam_vectorization_config.json', 'w', encoding='utf-8') as f:
#     json.dump(tam_vectorization_config, f)
    
# with open('tam_vocab.json', 'w', encoding='utf-8') as f:
#     json.dump(tam_vocab, f)
    

In [15]:
print(len(fre_vocab))
print(len(tam_vocab))
print(fre_vocab[:20])
print(tam_vocab[:20])
five_letter_words = [x for x in fre_vocab if len(x)==5]
print('size of five letter words in french_vocab: ',len(five_letter_words))

10000
12000
['', '[UNK]', '[SOS]', '[EOS]', '.', 'de', 'je', 'tom', 'pas', 'la', '?', 'le', 'a', 'que', 'ne', 'il', 'est', 'vous', 'un', ',']
['', '[UNK]', '[SOS]', '[EOS]', '.', 'நான்', 'டாம்', '?', 'நீங்கள்', 'ஒரு', 'என்று', 'வேண்டும்', ',', 'அவர்', 'எனக்கு', 'இந்த', 'உங்கள்', 'என்', 'மிகவும்', 'அவள்']
size of five letter words in french_vocab:  1655


In [16]:
Buffer_size = 10000
batch_size = 64
def generate_dataset(pairs):
    fre_data = [x[0] for x in pairs]
    tam_data = [x[1] for x in pairs]
    dataset = tf.data.Dataset.from_tensor_slices((fre_data,tam_data)).shuffle(Buffer_size).batch(batch_size)
    return dataset

In [17]:
train_dataset = generate_dataset(train_pairs)
val_dataset = generate_dataset(val_pairs)
test_dataset = generate_dataset(test_pairs)
print(len(test_dataset))

157


In [18]:
def process(fre_data,tam_data):
    fre_ids = fre_vectorization(fre_data).to_tensor()
    tam_ids = tam_vectorization(tam_data)
    target_in = tam_ids[:,:-1].to_tensor()
    target_out = tam_ids[:,1:].to_tensor()
    return (fre_ids,target_in),target_out

In [19]:
train_dataset = train_dataset.map(process,tf.data.AUTOTUNE)
val_dataset = val_dataset.map(process,tf.data.AUTOTUNE)
test_dataset = test_dataset.map(process,tf.data.AUTOTUNE)

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

1250
157
157


In [20]:
for (x,y),z in train_dataset.take(1):
    print(x[:2])
    print(y[:2])
    print(z[:2])

tf.Tensor(
[[   2   99  175    3    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   2    9 4684   33   11 6382 1281 1305  184  844    4    3    0    0
     0    0]], shape=(2, 16), dtype=int64)
tf.Tensor(
[[   2 1043  364    0    0    0    0    0    0    0    0]
 [   2    1  934    1    1    1    4    0    0    0    0]], shape=(2, 11), dtype=int64)
tf.Tensor(
[[1043  364    3    0    0    0    0    0    0    0    0]
 [   1  934    1    1    1    4    3    0    0    0    0]], shape=(2, 11), dtype=int64)


In [21]:
word_to_id = tf.keras.layers.StringLookup(
    vocabulary = tam_vocab,
    mask_token = "",
    oov_token = '[UNK]'
)

id_to_word = tf.keras.layers.StringLookup(
    vocabulary = tam_vocab,
    mask_token = '',
    oov_token = '[UNK]',
    invert = True
)

In [22]:
def decode_string(ints):
  strs = [chr(i) for i in ints]
  joined = ''.join(strs)
  return joined

def tokens_to_text(tokens, id_to_word):
    words = id_to_word(tokens)

    try:
       result = tf.strings.reduce_join(words, axis=-1, separator=" ").numpy()
    except:
      result = words.numpy()

    decoded = tf.strings.unicode_decode(result,'utf-8').numpy()
    decoded_sentence = decode_string(decoded)
    return decoded_sentence

In [23]:
sos_id = word_to_id('[SOS]')
eos_id = word_to_id('[EOS]')
print(sos_id)
print(eos_id)



tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)


In [24]:
vocab_size_1 = 10000
vocab_size_2 = 12000
units_1 = 128

In [25]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self,fre_vocab_size = vocab_size_1,units = units_1):
        super(Encoder,self).__init__()
        
        self.vocab_size = fre_vocab_size
        self.units =units

        self.embedding = tf.keras.layers.Embedding(input_dim = fre_vocab_size,output_dim = units,mask_zero=True)
        self.lstm = tf.keras.layers.Bidirectional(merge_mode='sum',layer = tf.keras.layers.LSTM(units,return_sequences= True))

    def call(self,encoder_inputs):

        embedded_output = self.embedding(encoder_inputs)
        output = self.lstm(embedded_output)
        return output
    
    


In [26]:
encoder = Encoder(vocab_size_1,units_1)
output1 = encoder(x)
output1.shape

TensorShape([64, 16, 128])

In [27]:
class CrossAttention(tf.keras.layers.Layer):
    def __init__(self,units=units_1):
        super().__init__()

        self.units =units

        self.mha = (tf.keras.layers.MultiHeadAttention(key_dim= units,num_heads=1))
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        self.support_masking = True
    def call(self,context,target):

        attn_output = self.mha(query = target,value = context)
        x = self.add([target,attn_output])
        x = self.layernorm(x)
        return x
    

In [28]:
attention = CrossAttention(units_1)
embed_output = tf.keras.layers.Embedding(vocab_size_2,units_1)(y)
output2  = attention(output1,embed_output)
output2.shape



TensorShape([64, 11, 128])

In [29]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,fre_vocab_size = vocab_size_1,units = units_1,tam_vocab_size = vocab_size_2):
        super(Decoder,self).__init__()

        self.fre_vocab_size = fre_vocab_size
        self.tam_vocab_size = tam_vocab_size
        self.units = units

        self.embedding = tf.keras.layers.Embedding(input_dim = tam_vocab_size,output_dim = units,mask_zero=True)
        self.pre_attention_rnn = tf.keras.layers.LSTM(units,return_sequences = True,return_state = True)
        self.attention = CrossAttention(units)
        self.post_attention_rnn = tf.keras.layers.LSTM(units = units,return_sequences=True)
        self.dense = tf.keras.layers.Dense(tam_vocab_size,activation = tf.nn.log_softmax)
        self.support_masking = True
    def call(self,context,target,state = None,return_state = False):

        embedding_output = self.embedding(target)
        x,state_h,state_c = self.pre_attention_rnn(embedding_output,initial_state=state)
        x = self.attention(context,x)
        x = self.post_attention_rnn(x)
        logits = self.dense(x)

        if return_state:
            return logits,[state_h,state_c]

        return logits
    

In [30]:
decoder = Decoder(vocab_size_1,units_1)
output3 = decoder(output1,y)
output3.shape



TensorShape([64, 11, 12000])

In [31]:
class Translator(tf.keras.Model):
    def __init__(self,fre_vocab_size =vocab_size_1,units = units_1,tam_vocab_size = vocab_size_2):
        super().__init__()
        self.encoder = Encoder(fre_vocab_size,units)
        self.decoder = Decoder(fre_vocab_size,units,tam_vocab_size)

    def call(self,inputs):
        context,target = inputs
        encoder_output = self.encoder(context)
        logits = self.decoder(encoder_output,target)

        return logits


In [32]:
translator = Translator(vocab_size_1,units_1,vocab_size_2)

In [33]:
def compile_and_train(model,epochs =10,steps_per_epoch = 500):
    model.compile(optimizer = 'adam',loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction = 'none'),
                  metrics = ['accuracy'])

    history  = model.fit(
        train_dataset.repeat(),
        epochs = epochs,
        steps_per_epoch = steps_per_epoch,
        validation_data = val_dataset,
        validation_steps = 50,
        callbacks = [tf.keras.callbacks.EarlyStopping(patience=3)]
    )
    return model, history

In [38]:
#repeat this step 3-4 times i.e for 30-40 epochs
trained_translator,history = compile_and_train(translator,epochs=20)

Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.9303 - loss: 0.3032 - val_accuracy: 0.8540 - val_loss: 0.8719
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 34ms/step - accuracy: 0.9325 - loss: 0.3020 - val_accuracy: 0.8510 - val_loss: 0.8869
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 34ms/step - accuracy: 0.9353 - loss: 0.2896 - val_accuracy: 0.8466 - val_loss: 0.9329
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 33ms/step - accuracy: 0.9417 - loss: 0.2575 - val_accuracy: 0.8503 - val_loss: 0.8922


In [39]:
trained_translator.save_weights('french_to_tamil.weights.h5')

In [40]:
def generate_next_token_0(decoder,context,next_token,done,state):
    
    logits,state = decoder(context,next_token,state,return_state = True)

    logits = logits[:,-1,:]

    next_token = tf.argmax(logits,axis = -1)

    logits = tf.squeeze(logits)

    next_token = tf.squeeze(next_token)

    logit = logits[next_token].numpy()
    
    next_token = tf.reshape(next_token,shape  = (1,1))
    
    if next_token == eos_id :
        done = True
    return next_token,state,done,logit

In [73]:
def translate_0(model,text,max_length = 30,):
    tokens,logits = [],[]
    #condition to convert only five letter words
    text = text.split(' ')
    text = [x.strip().strip(',').strip('?').strip('!').strip('"').strip('.') for x in text]
    text = [x for x in text if len(x)==5]
    # print('five _letter_words in input_text: ' ,len(text))
    if len(text) == 0:
        return 'The Text has no five letter words! Please try again'
    text = ' '.join(text)
    text = tf.convert_to_tensor(text)[tf.newaxis]
    context = fre_vectorization(text).to_tensor()
    context = model.encoder(context)
    state = [tf.zeros((1,units_1)),tf.zeros((1,units_1))]
    next_token = tf.fill((1,1),sos_id)
    done  = False

    for i in range(max_length):
        try:
            next_token,state,done,logit = generate_next_token_0(decoder = model.decoder,
                                                                context = context,
                                                                next_token = next_token,
                                                                done = done,
                                                                state = state,
                                                              )
        except:
            raise Exception('generate next token code issue')
        if done:
            break
        tokens.append(next_token)
        logits.append(logit)
    tokens = tf.concat(tokens,axis = -1)
    tokens = tf.squeeze(tokens)
    
    translation =tokens_to_text(tokens,id_to_word)

    return translation,

In [74]:
my_list = ["S'il vous plaît, allez-y.",
           "êtres",
           "Je pense que je deviens fou.",
           "Vous devriez arrêter de fumer car c'est malsain.",
           "autre bûche",
           " Je jouais de la flûte quand j'étais au lycée",
           ' cible quels huilé',
           "belle fille noirs était"]

In [75]:
for i in my_list[:]:
    translation = translate_0(trained_translator,i,)
    print(translation)

('தயவுசெய்து',)
('உயிரினங்கள்',)
('நினைக்கிறார்கள்',)
('இதை புகை',)
('மற்றொரு பதிவு',)
('உயர்நிலைப் பள்ளி [UNK] வந்த உடைந்த இடம்',)
('எண்ணெய் தடவிய இலக்கு யாரென்று தொடர்',)
('அழகான கருப்பு பெண்',)


In [76]:
fre_data = [x[0] for x in train_pairs]
tam_data = [x[1] for x in train_pairs]
print(len(fre_data))


80000


In [77]:
p = 67890
print(fre_data[p])
print(tam_data[p])

Surtout, soyez patient.
எல்லாவற்றிற்கும் மேலாக, பொறுமையாக இருங்கள்.


In [72]:
translate_0(trained_translator,fre_data[p])

('[UNK] சிறந்தது',)

In [49]:
fre_data[p]

"Tom est assez souvent en retard à l'école."

output_shape = 10 * 26 
output = [[0.1, 0.01, 0.03, ... ... ... ... ... (len-26)],
          [0.1, 0.002, 0.6, ... ... ... ... ... (len-26)],
            ''         ''         ''      ''       ''    
            ''         ''         ''      ''       '' 
          [0.9, 0.01, 0.01, ... ... ... ... ... (len-26)]]

m

import numpy as np
def greedy_search_decoder(predictions):
  
    #select token with the maximum probability for each prediction
    output_sequence = [np.argmax(prediction) for prediction in predictions]
    
    #storing token probabilities
    token_probabilities = [np.max(prediction) for prediction in predictions]
    
    #multiply individaul token-level probabilities to get overall sequence probability
    sequence_probability = np.product(token_probabilities)
    
    return output_sequence, sequence_probability
    
model_prediction = [[0.1, 0.7, 0.1, 0.1],
                    [0.7, 0.1, 0.1, 0.1],
                    [0.1, 0.1, 0.6, 0.2],
                    [0.1, 0.1, 0.1, 0.7],
                    [0.4, 0.3, 0.2, 0.1]]
                    
greedy_search_decoder(model_prediction)



import numpy as np
import math

def beam_search_decoder(predictions, top_k = 3):
    #start with an empty sequence with zero score
    output_sequences = [([], 0)]
    
    #looping through all the predictions
    for token_probs in predictions:
        new_sequences = []
        
        #append new tokens to old sequences and re-score
        for old_seq, old_score in output_sequences:
            for char_index in range(len(token_probs)):
                new_seq = old_seq + [char_index]
                #considering log-likelihood for scoring
                new_score = old_score + math.log(token_probs[char_index])
                new_sequences.append((new_seq, new_score))
                
        #sort all new sequences in the de-creasing order of their score
        output_sequences = sorted(new_sequences, key = lambda val: val[1], reverse = True)
        
        #select top-k based on score 
        # *Note- best sequence is with the highest score
        output_sequences = output_sequences[:top_k]
        
    return output_sequences
    

model_prediction = [[0.1, 0.7, 0.1, 0.1],
                    [0.7, 0.1, 0.1, 0.1],
                    [0.1, 0.1, 0.6, 0.2],
                    [0.1, 0.1, 0.1, 0.7],
                    [0.4, 0.3, 0.2, 0.1]]
                    
beam_search_decoder(model_prediction, top_k = 5)



[Out] : [([1, 0, 2, 3, 0], -2.497141187456343),
         ([1, 0, 2, 3, 1], -2.784823259908124),
         ([1, 0, 2, 3, 2], -3.1902883680162883),
         ([1, 0, 3, 3, 0], -3.595753476124453),
         ([1, 0, 2, 3, 3], -3.8834355485762337)]


[Out]: ([1, 0, 2, 3, 0], 0.08231999999999998)

data = [[0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1]]
data = array(data)

def greedy_decoder(data):
 # index for largest probability each row
 return [argmax(s) for s in data]

from numpy import argmax
 
# greedy decoder
def greedy_decoder(data):
 # index for largest probability each row
 return [argmax(s) for s in data]
 
# define a sequence of 10 words over a vocab of 5 words
data = [[0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1]]
data = array(data)
# decode sequence
result = greedy_decoder(data)

[4, 0, 4, 0, 4, 0, 4, 0, 4, 0]

# beam search
def beam_search_decoder(data, k):
 sequences = [[list(), 0.0]]
 # walk over each step in sequence
 for row in data:
 all_candidates = list()
 # expand each current candidate
 for i in range(len(sequences)):
 seq, score = sequences[i]
 for j in range(len(row)):
 candidate = [seq + [j], score - log(row[j])]
 all_candidates.append(candidate)
 # order all candidates by score
 ordered = sorted(all_candidates, key=lambda tup:tup[1])
 # select k best
 sequences = ordered[:k]
 return sequences

from math import log
from numpy import array
from numpy import argmax
 
# beam search
def beam_search_decoder(data, k):
 sequences = [[list(), 0.0]]
 # walk over each step in sequence
 for row in data:
 all_candidates = list()
 # expand each current candidate
 for i in range(len(sequences)):
 seq, score = sequences[i]
 for j in range(len(row)):
 candidate = [seq + [j], score - log(row[j])]
 all_candidates.append(candidate)
 # order all candidates by score
 ordered = sorted(all_candidates, key=lambda tup:tup[1])
 # select k best
 sequences = ordered[:k]
 return sequences


# define a sequence of 10 words over a vocab of 5 words
data = [[0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1],
 [0.1, 0.2, 0.3, 0.4, 0.5],
 [0.5, 0.4, 0.3, 0.2, 0.1]]
data = array(data)
# decode sequence
result = beam_search_decoder(data, 3)
# print result
for seq in result:
 print(seq)


[[4, 0, 4, 0, 4, 0, 4, 0, 4, 0], 6.931471805599453]
[[4, 0, 4, 0, 4, 0, 4, 0, 4, 1], 7.154615356913663]
[[4, 0, 4, 0, 4, 0, 4, 0, 3, 0], 7.154615356913663]