In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

In [2]:
# Restore the previously saved Keras model that this notebook continues training
pretrained_model_path = 'fine_tuned_sanskrit_hindi_model3.keras'
model = tf.keras.models.load_model(pretrained_model_path)

In [3]:
def _load_tokenizer(json_path):
    """Read a tokenizer's JSON dump and rebuild the tokenizer from it.

    Returns a ``(json_text, tokenizer)`` pair so the raw JSON string stays
    available next to the reconstructed tokenizer object.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        json_text = handle.read()  # tokenizer_from_json expects the raw string
        tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_text)
    return json_text, tokenizer


# Source-side (Sanskrit) tokenizer
sanskrit_tokenizer_json, sanskrit_tokenizer = _load_tokenizer('fine_tuned_sanskrit_tokenizer3.json')

# Target-side (Hindi) tokenizer
hindi_tokenizer_json, hindi_tokenizer = _load_tokenizer('fine_tuned_hindi_tokenizer3.json')

In [4]:
# Function to read sentences from a text file
def read_sentences(file_path):
    """Return the lines of a UTF-8 text file as a list of stripped strings.

    Args:
        file_path: Path of the text file; one sentence per line.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading/trailing whitespace removed. Blank lines are kept as empty
        strings, so line positions are preserved.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' drops a leading byte-order mark when one is present and is
    # otherwise identical to 'utf-8'. With plain 'utf-8' the BOM stays glued to
    # the first word (str.strip() does not treat U+FEFF as whitespace).
    with open(file_path, 'r', encoding='utf-8-sig') as source:
        return [raw_line.strip() for raw_line in source]

In [5]:
# Text files holding the new sentences used for this fine-tuning round
hindi_file_path = 'hindi3.txt'
sanskrit_file_path = 'sanskrit3.txt'

In [6]:
# Read sentences from files
new_sanskrit_sentences = read_sentences(sanskrit_file_path)
new_hindi_sentences = read_sentences(hindi_file_path)

# The two lists are later fed to model.fit row by row, so sentence i of one
# file is paired with sentence i of the other. Fail here with a clear message
# instead of much later inside Keras (or inside max() on an empty list).
if not new_sanskrit_sentences or not new_hindi_sentences:
    raise ValueError(
        f"No sentences to fine-tune on: {sanskrit_file_path!r} has "
        f"{len(new_sanskrit_sentences)} line(s), {hindi_file_path!r} has "
        f"{len(new_hindi_sentences)} line(s)."
    )
if len(new_sanskrit_sentences) != len(new_hindi_sentences):
    raise ValueError(
        f"Line count mismatch: {sanskrit_file_path!r} has "
        f"{len(new_sanskrit_sentences)} line(s) but {hindi_file_path!r} has "
        f"{len(new_hindi_sentences)} line(s)."
    )

In [7]:
# Map every sentence to its list of integer token ids, one tokenizer per language
encode_sanskrit = sanskrit_tokenizer.texts_to_sequences
encode_hindi = hindi_tokenizer.texts_to_sequences

new_sanskrit_sequences = encode_sanskrit(new_sanskrit_sentences)
new_hindi_sequences = encode_hindi(new_hindi_sentences)

In [8]:
# Longest token sequence on each side; used as the common padded length
max_len_sanskrit = max(map(len, new_sanskrit_sequences))
max_len_hindi = max(map(len, new_hindi_sequences))

In [9]:
def _pad_post(sequences, length):
    """Pad each sequence at the end ('post') so all rows have `length` entries."""
    return pad_sequences(sequences, maxlen=length, padding='post')


new_sanskrit_padded = _pad_post(new_sanskrit_sequences, max_len_sanskrit)
new_hindi_padded = _pad_post(new_hindi_sequences, max_len_hindi)

In [10]:
# Teacher forcing: the decoder is fed columns 0..T-2 and is trained to predict
# columns 1..T-1, i.e. the same rows shifted left by one position.
decoder_steps = new_hindi_padded.shape[1] - 1
new_hindi_target = new_hindi_padded[:, 1:]              # drop the first column
new_hindi_input = new_hindi_padded[:, :decoder_steps]   # drop the last column

In [11]:
# Hyper-parameters for the fine-tuning run
batch_size = 64
epochs = 500
learning_rate = 1e-5  # kept small on purpose: this run continues from trained weights

In [12]:
# Compile again so the loaded model trains with a fresh Adam optimizer at the
# reduced fine-tuning learning rate.
fine_tune_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(
    optimizer=fine_tune_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Fine-tune the model on the new dataset.
#
# `epochs` is only an upper bound: training stops once val_loss has not
# improved for `patience` consecutive epochs, and the weights from the best
# validation epoch are restored. Without this the run has no stopping
# criterion other than interrupting the kernel by hand.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Inputs: [encoder token ids, decoder input ids]; targets get a trailing axis
# of size 1 via expand_dims.
# NOTE(review): per the Keras docs, validation_split takes the *last* 20% of
# the rows before shuffling — confirm the corpus files are not ordered in a
# way that makes that tail unrepresentative.
history_finetune = model.fit(
    [new_sanskrit_padded, new_hindi_input],
    np.expand_dims(new_hindi_target, -1),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 659ms/step - accuracy: 0.8600 - loss: 3.5911 - val_accuracy: 0.8280 - val_loss: 3.5491
Epoch 2/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 581ms/step - accuracy: 0.8697 - loss: 3.6079 - val_accuracy: 0.8278 - val_loss: 3.5367
Epoch 3/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 582ms/step - accuracy: 0.8646 - loss: 3.5510 - val_accuracy: 0.8276 - val_loss: 3.5254
Epoch 4/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 604ms/step - accuracy: 0.8598 - loss: 3.5495 - val_accuracy: 0.8281 - val_loss: 3.5144
Epoch 5/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 577ms/step - accuracy: 0.8657 - loss: 3.5325 - val_accuracy: 0.8284 - val_loss: 3.5028
Epoch 6/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 565ms/step - accuracy: 0.8685 - loss: 3.4995 - val_accuracy: 0.8284 - val_loss: 3.4922
Epoch 7/500
[1m9/9[0m [32m━━━━

KeyboardInterrupt: 

In [14]:
# Persist the fine-tuned weights.
# NOTE(review): this is the same path the model was loaded from at the top of
# the notebook, so the starting checkpoint is overwritten — confirm intended.
fine_tuned_model_path = 'fine_tuned_sanskrit_hindi_model3.keras'
model.save(fine_tuned_model_path)

In [15]:
# Serialize both tokenizers back to JSON.
# NOTE(review): no cell shown here refits the tokenizers, so the JSON written
# is presumably equivalent to what was loaded — confirm.
sanskrit_tokenizer_json = sanskrit_tokenizer.to_json()
hindi_tokenizer_json = hindi_tokenizer.to_json()

# Write each JSON string to its file
for tokenizer_path, tokenizer_payload in (
    ('fine_tuned_sanskrit_tokenizer3.json', sanskrit_tokenizer_json),
    ('fine_tuned_hindi_tokenizer3.json', hindi_tokenizer_json),
):
    with open(tokenizer_path, 'w', encoding='utf-8') as out_file:
        out_file.write(tokenizer_payload)