<a href="https://colab.research.google.com/github/Priya9112/Null-Class-Data-Science-Internship-/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#TASK-2
###Implement beam search decoding for an NMT model to improve translation quality.

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

model = load_model('/content/drive/MyDrive/english_to_french_model')

with open('/content/drive/MyDrive/Json/english_tokenizer.json') as f:
    english_tokenizer_json = json.load(f)
english_tokenizer = tokenizer_from_json(english_tokenizer_json)

with open('/content/drive/MyDrive/Json/french_tokenizer.json') as f:
    french_tokenizer_json = json.load(f)
french_tokenizer = tokenizer_from_json(french_tokenizer_json)

with open('/content/drive/MyDrive/Json/sequence_length.json') as f:
    max_french_sequence_length = json.load(f)

print(f"French tokenizer vocabulary size: {len(french_tokenizer.word_index)}")

def logits_to_text(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'  # Add padding token
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

def translate_greedy(input_sentence):
    input_sequence = english_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=max_french_sequence_length)
    translation = model.predict(input_sequence)
    translation_text = logits_to_text(translation[0], french_tokenizer)
    translation_text = ' '.join([word for word in translation_text.split() if word != '<PAD>'])
    return translation_text

def beam_search_decode(model, input_sequence, beam_width=10, max_seq_length=20, early_stopping_threshold=-10.0):
    start_token = french_tokenizer.word_index.get('<start>', 0)
    end_token = french_tokenizer.word_index.get('<end>', 0)
    sequences = [[[start_token], 0.0]]

    for step in range(max_seq_length):
        all_candidates = list()
        for seq, score in sequences:
            if seq[-1] == end_token:
                all_candidates.append((seq, score))
            else:
                target_seq = pad_sequences([seq], maxlen=max_seq_length, padding='post')
                predictions = model.predict([input_sequence, target_seq])
                top_k_indices = np.argsort(predictions[0, -1, :])[-beam_width:]
                for idx in top_k_indices:
                    candidate = [seq + [idx], score + np.log(predictions[0, -1, idx])]
                    all_candidates.append(candidate)
                print(f"Step: {step}, Seq: {seq}, Top indices: {top_k_indices}")
        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        sequences = ordered[:beam_width]
        if sequences[0][1] < early_stopping_threshold:
            break
    return sequences[0][0]


def sequence_to_text(sequence, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    return ' '.join([index_to_words.get(idx, '<unk>') for idx in sequence if idx != 0])  # Exclude padding tokens

# acc to data we trained our rnn model on
input_sentences = [
    'new jersey is sometimes quiet during autumn, and it is snowy in april.',
    'the united states is usually chilly during july, and it is usually freezing in november.',
    'california is usually quiet during march, and it is usually hot in june.',
    'the united states is sometimes mild during june, and it is cold in september.'
]

for input_sentence in input_sentences:
    print(f"Input Sentence: {input_sentence}")
    input_sequence = english_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=max_french_sequence_length)

    # I'm Translate using greedy decoding
    translation_greedy = translate_greedy(input_sentence)
    print('Greedy Translation:', translation_greedy)

    # I'm Translate using beam search decoding
    beam_width = 10
    decoded_sequence = beam_search_decode(model, input_sequence, beam_width, max_french_sequence_length)
    translation_beam_search = sequence_to_text(decoded_sequence, french_tokenizer)
    print('Beam Search Translation:', translation_beam_search)
    print()


French tokenizer vocabulary size: 344
Input Sentence: new jersey is sometimes quiet during autumn, and it is snowy in april.
Greedy Translation: new jersey est parfois calme pendant l' automne et il est neigeux en
Beam Search Translation: new l' est neigeux pendant et jersey parfois il est automne calme en

Input Sentence: the united states is usually chilly during july, and it is usually freezing in november.
Greedy Translation: les états unis est généralement froid en juillet et il gèle habituellement en novembre
Beam Search Translation: habituellement en et gèle généralement en les est unis novembre juillet états froid il

Input Sentence: california is usually quiet during march, and it is usually hot in june.
Greedy Translation: est généralement calme en mars et il est généralement chaud en juin
Beam Search Translation: juin généralement chaud est il mars généralement en calme en est et

Input Sentence: the united states is sometimes mild during june, and it is cold in september.
G