In [14]:
# Cell 3 - train small SentencePiece and tiny seq2seq (fast) - Retry
from pathlib import Path
BASE = Path("/content/bolly_chatbot")
DATA = BASE / "data"
MODELS = BASE / "models"
CORPUS = DATA / "corpus.txt"

# import helpers
import sys
# Ensure the path is only appended if not already present
if str(BASE) not in sys.path:
    sys.path.append(str(BASE))
from utils import parse_srt_to_lines, build_pairs_from_lines

# collect lines and pairs
lines=[]
for p in DATA.glob("*.srt"):
    lines += parse_srt_to_lines(p)
print("Lines found:", len(lines))
pairs = build_pairs_from_lines(lines)
print("Pairs:", len(pairs))
if len(pairs) == 0:
    raise SystemExit("No pairs found. Upload .srt to /content/bolly_chatbot/data/ and rerun.")

# write corpus
with open(CORPUS, "w", encoding="utf-8") as f:
    for a,b in pairs:
        f.write(a + "\n")
        f.write(b + "\n")
print("Wrote corpus:", CORPUS)

# train SentencePiece (small vocab)
import sentencepiece as spm
sp_prefix = str(MODELS / "spm_bolly")
# Ensure SentencePiece model is trained only if it doesn't exist
if not (Path(sp_prefix + ".model")).exists():
    spm.SentencePieceTrainer.Train(f"--input={CORPUS} --model_prefix={sp_prefix} --vocab_size=128 --character_coverage=0.9995 --model_type=bpe --user_defined_symbols=<s>,</s>")
    print("Trained SPM:", sp_prefix + ".model")
else:
    print("SentencePiece model already exists:", sp_prefix + ".model")

# prepare encoded data
sp = spm.SentencePieceProcessor(); sp.Load(sp_prefix + ".model")
from tensorflow.keras.preprocessing.sequence import pad_sequences
def encode(s, maxlen=16):
    txt = "<s> " + s + " </s>"
    # Ensure truncation happens if needed
    encoded_ids = sp.EncodeAsIds(txt)
    return encoded_ids[:maxlen]

max_enc=16; max_dec=15
encs=[]; decins=[]; decouts=[]
for a,b in pairs:
    e = encode(a,max_enc)
    d = encode(b,max_dec+1)
    if len(d) < 2: continue
    encs.append(e)
    decins.append(d[:-1])
    decouts.append(d[1:])

import numpy as np
encs = pad_sequences(encs, maxlen=max_enc, padding='post')
decins = pad_sequences(decins, maxlen=max_dec, padding='post')
decouts = pad_sequences(decouts, maxlen=max_dec, padding='post')
decouts = np.expand_dims(decouts, -1)

# define tiny seq2seq
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model
V = sp.GetPieceSize()
emb_dim=48; units=48

# Encoder
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = LSTM(units, return_state=True, name='encoder_lstm')
_, sh, sc = enc_lstm(emb_e)

# Decoder
dec_input = Input(shape=(max_dec,), name='dec_input')
dec_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d = dec_emb_layer(dec_input)
# Add Reshape layer to explicitly set shape to (max_dec, emb_dim)
reshaped_emb_d = Reshape((max_dec, emb_dim))(emb_d)
dec_lstm = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs, _, _ = dec_lstm(reshaped_emb_d, initial_state=[sh, sc])
dec_dense = Dense(V, activation='softmax', name='decoder_dense')
logits = dec_dense(dec_outs)


model = Model([enc_input, dec_input], logits)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

print("Training tiny model...")
model.fit([encs, decins], decouts, batch_size=4, epochs=4, verbose=2)

# save artifacts
model.save_weights(str(MODELS / "seq2seq_weights.weights.h5"))
# Correct method to save SentencePiece model is not needed as it's saved during training
# The SentencePieceProcessor object does not have a 'Save' method.
# The model is saved during the SentencePieceTrainer.Train call.
print("Saved weights to", MODELS / "seq2seq_weights.weights.h5")
print("SPM vocab size:", sp.GetPieceSize())

Lines found: 3
Pairs: 2
Wrote corpus: /content/bolly_chatbot/data/corpus.txt
SentencePiece model already exists: /content/bolly_chatbot/models/spm_bolly.model
Training tiny model...
Epoch 1/4




1/1 - 4s - 4s/step - loss: 4.8527
Epoch 2/4
1/1 - 0s - 64ms/step - loss: 4.8460
Epoch 3/4
1/1 - 0s - 61ms/step - loss: 4.8392
Epoch 4/4
1/1 - 0s - 62ms/step - loss: 4.8323
Saved weights to /content/bolly_chatbot/models/seq2seq_weights.weights.h5
SPM vocab size: 128


In [24]:
# Cell 4 - Inference
from pathlib import Path
import numpy as np
import sentencepiece as spm
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape # Import Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"

# Load SentencePiece model
sp_prefix = str(MODELS / "spm_bolly")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize()
print("Loaded SPM model from:", sp_prefix + ".model")
print("SPM vocab size:", V)

# Model parameters (should match training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during training to load weights
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer as used in training
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model
full_model.load_weights(str(MODELS / "seq2seq_weights.weights.h5"))
print("Loaded model weights into full model from:", MODELS / "seq2seq_weights.weights.h5")


# Now define the encoder model for inference
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")


# Define the inference function
def generate_response(input_sentence, max_length=max_dec):
    # Preprocess the input sentence
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function
print("\nTesting inference:")
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो"]
for sample_input in sample_inputs:
    response = generate_response(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference:




Input: क्या तुम मेरे साथ चलोगी?
Response: ▁रही▁रही▁रहीसा<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: ▁<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>



In [25]:
# Cell 4 - Inference - Retry
from pathlib import Path
import numpy as np
import sentencepiece as spm
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"

# Load SentencePiece model
sp_prefix = str(MODELS / "spm_bolly")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize()
print("Loaded SPM model from:", sp_prefix + ".model")
print("SPM vocab size:", V)

# Model parameters (should match training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during training to load weights
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer as used in training
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model
full_model.load_weights(str(MODELS / "seq2seq_weights.weights.h5"))
print("Loaded model weights into full model from:", MODELS / "seq2seq_weights.weights.h5")


# Now define the encoder model for inference
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")


# Define the inference function
def generate_response(input_sentence, max_length=max_dec):
    # Preprocess the input sentence
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function
print("\nTesting inference:")
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो"]
for sample_input in sample_inputs:
    response = generate_response(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference:




Input: क्या तुम मेरे साथ चलोगी?
Response: ▁रही▁रही▁रहीसा<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: ▁<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>



In [17]:
# Cell 4 - Inference - Retry 2: Address TypeError in sp.IdToPiece
from pathlib import Path
import numpy as np
import sentencepiece as spm
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"

# Load SentencePiece model
sp_prefix = str(MODELS / "spm_bolly")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize()
print("Loaded SPM model from:", sp_prefix + ".model")
print("SPM vocab size:", V)

# Model parameters (should match training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during training to load weights
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer as used in training
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model
full_model.load_weights(str(MODELS / "seq2seq_weights.weights.h5"))
print("Loaded model weights into full model from:", MODELS / "seq2seq_weights.weights.h5")


# Now define the encoder model for inference
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")


# Define the inference function
def generate_response(input_sentence, max_length=max_dec):
    # Preprocess the input sentence
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function
print("\nTesting inference:")
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो"]
for sample_input in sample_inputs:
    response = generate_response(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference:




Input: क्या तुम मेरे साथ चलोगी?
Response: ▁रही▁रही▁रहीसा<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: ▁<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>



In [18]:
# Cell 5 - Flask App
from flask import Flask, request, jsonify, render_template_string

# Assuming generate_response and sp are available from the previous cell
# from __main__ import generate_response, sp # This is not reliable in all notebook environments

# Define the Flask application instance
app = Flask(__name__)

# HTML template for the home page
HTML_TEMPLATE = """
<!doctype html>
<html>
<head><title>Bolly Chatbot</title></head>
<body>
    <h1>Bolly Chatbot</h1>
    <form id="chat-form">
        <input type="text" id="user-input" placeholder="Enter your message">
        <button type="submit">Send</button>
    </form>
    <div id="chat-output"></div>

    <script>
        document.getElementById('chat-form').onsubmit = async function(event) {
            event.preventDefault();
            const userInput = document.getElementById('user-input').value;
            const chatOutput = document.getElementById('chat-output');

            // Display user input
            chatOutput.innerHTML += `<p><strong>You:</strong> ${userInput}</p>`;

            // Clear input field
            document.getElementById('user-input').value = '';

            // Send message to backend
            const response = await fetch('/chat', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify({ message: userInput })
            });

            const data = await response.json();

            // Display bot response
            chatOutput.innerHTML += `<p><strong>Bot:</strong> ${data.response}</p>`;
        };
    </script>
</body>
</html>
"""

@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route('/chat', methods=['POST'])
def chat():
    user_message = request.json.get('message')
    if not user_message:
        return jsonify({"response": "Error: No message received."}), 400

    # Call the generate_response function
    # This function needs to be accessible in this scope.
    # In a real app, you might pass it or import it properly.
    # For this notebook context, we assume it's in the global scope.
    try:
        bot_response = generate_response(user_message)
    except Exception as e:
        # Log the error for debugging
        print(f"Error generating response: {e}")
        bot_response = "Sorry, I am unable to respond at the moment."

    return jsonify({"response": bot_response})

# To run the app in a notebook, you typically use ngrok or similar.
# We'll add the ngrok part in a separate cell to make it easier to stop and restart.

print("Flask app defined. Use ngrok in the next cell to expose it.")


Flask app defined. Use ngrok in the next cell to expose it.


In [19]:
# Cell 6 - Expose Flask app with ngrok
from pyngrok import ngrok
import threading
import time

# Terminate any previous ngrok tunnels
ngrok.kill()

# Define a function to run the Flask app
def run_flask_app():
    # Use app.run() with debug=False for ngrok
    app.run(host='0.0.0.0', port=5000, debug=False)

# Run the Flask app in a separate thread
thread = threading.Thread(target=run_flask_app)
thread.start()
print("Flask app is running in a separate thread.")

# Give the Flask app a moment to start
time.sleep(2)

# Set up ngrok tunnel
try:
    # Connect to the Flask port
    public_url = ngrok.connect(5000).public_url
    print(f" * ngrok tunnel is live at: {public_url}")
    print(" * You can access the chatbot via this URL.")
except Exception as e:
    print(f"Error starting ngrok: {e}")
    print("Could not establish ngrok tunnel. The Flask app might not be running or port 5000 is in use.")


Flask app is running in a separate thread.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m




ERROR:pyngrok.process.ngrok:t=2025-08-14T09:03:58+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-14T09:03:58+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-14T09:03:58+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut

Error starting ngrok: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.
Could not establish ngrok tunnel. The Flask app might not be running or port 5000 is in use.


In [20]:
# Cell 7 - Configure ngrok authtoken and expose Flask app
from pyngrok import ngrok
import threading
import time
import os

# Terminate any previous ngrok tunnels
ngrok.kill()

# Get ngrok authtoken from environment variables or configure it directly
# It's recommended to set this as an environment variable in a real application.
# For Colab, you can add it in a secrets manager or directly here for demonstration.
# Replace 'YOUR_NGROK_AUTHTOKEN' with your actual authtoken if not using env var
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_AUTHTOKEN")

if NGROK_AUTH_TOKEN == "YOUR_NGROK_AUTHTOKEN":
    print("WARNING: ngrok authtoken is not set. Please replace 'YOUR_NGROK_AUTHTOKEN' with your actual token or set the NGROK_AUTH_TOKEN environment variable.")
    # Attempt to configure even if it's the placeholder, ngrok will likely fail but show the error again.
    try:
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)
        print("Attempted to set ngrok authtoken.")
    except Exception as e:
        print(f"Error setting ngrok authtoken: {e}")
else:
    try:
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)
        print("ngrok authtoken configured.")
    except Exception as e:
        print(f"Error setting ngrok authtoken: {e}")


# Define a function to run the Flask app
def run_flask_app():
    # Use app.run() with debug=False for ngrok
    # Ensure 'app' is accessible, assuming it's defined in a previous cell's global scope
    global app
    if 'app' not in globals():
        print("Error: Flask app 'app' not found. Please ensure the cell defining the Flask app is run first.")
        return
    try:
        app.run(host='0.0.0.0', port=5000, debug=False)
    except Exception as e:
        print(f"Error running Flask app: {e}")


# Run the Flask app in a separate thread only if it's not already running
# Check if the thread exists and is alive to prevent starting multiple times
if 'thread' not in globals() or not thread.is_alive():
    thread = threading.Thread(target=run_flask_app)
    thread.daemon = True # Allow the main program to exit even if the thread is running
    thread.start()
    print("Flask app is running in a separate thread.")
else:
    print("Flask app thread is already running.")

# Give the Flask app a moment to start
time.sleep(2)

# Set up ngrok tunnel
try:
    # Connect to the Flask port
    public_url = ngrok.connect(5000).public_url
    print(f" * ngrok tunnel is live at: {public_url}")
    print(" * You can access the chatbot via this URL.")
    print("\nNOTE: Keep this cell running to keep the ngrok tunnel active. To stop, interrupt the kernel.")
except Exception as e:
    print(f"Error starting ngrok tunnel: {e}")
    print("Could not establish ngrok tunnel. Ensure your authtoken is correct and the Flask app is running.")


Attempted to set ngrok authtoken.
Flask app thread is already running.


ERROR:pyngrok.process.ngrok:t=2025-08-14T09:04:24+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified is an ngrok v1 authtoken, but you're using ngrok v2.\nYour authtoken: YOUR_NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_106\r\n"


Error starting ngrok tunnel: The ngrok process errored on start: authentication failed: The authtoken you specified is an ngrok v1 authtoken, but you're using ngrok v2.\nYour authtoken: YOUR_NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_106\r\n.
Could not establish ngrok tunnel. Ensure your authtoken is correct and the Flask app is running.


In [29]:
# Cell 7 - Configure ngrok authtoken and expose Flask app - Retry with Authtoken guidance
from pyngrok import ngrok
import threading
import time
import os

# Terminate any previous ngrok tunnels
ngrok.kill()

# --- IMPORTANT: Replace 'YOUR_NGROK_AUTHTOKEN' with your actual ngrok authtoken ---
# 1. Go to ngrok.com and sign up or log in.
# 2. Go to the 'Your Authtoken' page (usually under Setup & Installation).
# 3. Copy your authtoken.
# 4. Paste your authtoken below, replacing 'YOUR_NGROK_AUTHTOKEN'.
NGROK_AUTH_TOKEN = "31FKMpD0BYx5HaH142dOEAZOGuN_75D6PanqqWVcxqzergDGU" # <--- PASTE YOUR AUTHTOKEN HERE

# Alternatively, you can set the NGROK_AUTH_TOKEN environment variable
# NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_AUTHTOKEN")
# print(f"Using NGROK_AUTH_TOKEN: {NGROK_AUTH_TOKEN}")

if NGROK_AUTH_TOKEN == "YOUR_NGROK_AUTHTOKEN":
    print("ERROR: ngrok authtoken is NOT set. Please replace 'YOUR_NGROK_AUTHTOKEN' with your actual token from ngrok.com.")
    # Do not attempt to configure or connect if the placeholder is still there.
else:
    try:
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)
        print("ngrok authtoken configured.")

        # Define a function to run the Flask app
        def run_flask_app():
            # Use app.run() with debug=False for ngrok
            # Ensure 'app' is accessible, assuming it's defined in a previous cell's global scope
            global app
            if 'app' not in globals():
                print("Error: Flask app 'app' not found. Please ensure the cell defining the Flask app is run first.")
                return
            try:
                # Suppress Flask's default output when running with debug=False and ngrok
                import logging
                log = logging.getLogger('werkzeug')
                log.setLevel(logging.ERROR)
                app.run(host='0.0.0.0', port=5000, debug=False)
            except Exception as e:
                print(f"Error running Flask app: {e}")


        # Run the Flask app in a separate thread only if it's not already running
        # Check if the thread exists and is alive to prevent starting multiple times
        if 'thread' not in globals() or not thread.is_alive():
            print("Starting Flask app in a separate thread...")
            thread = threading.Thread(target=run_flask_app)
            thread.daemon = True # Allow the main program to exit even if the thread is running
            thread.start()
            print("Flask app thread started.")

            # Give the Flask app a moment to start
            time.sleep(3) # Increased sleep time

        else:
            print("Flask app thread is already running.")


        # Set up ngrok tunnel
        print("Attempting to establish ngrok tunnel...")
        try:
            # Connect to the Flask port
            public_url = ngrok.connect(5000).public_url
            print(f" * ngrok tunnel is live at: {public_url}")
            print(" * You can access the chatbot via this URL.")
            print("\nNOTE: Keep this cell running to keep the ngrok tunnel active. To stop, interrupt the kernel.")
        except Exception as e:
            print(f"Error starting ngrok tunnel: {e}")
            print("Could not establish ngrok tunnel. Ensure your authtoken is correct and the Flask app is running.")

    except Exception as e:
        print(f"An error occurred during ngrok setup: {e}")


ngrok authtoken configured.
Starting Flask app in a separate thread...
Flask app thread started.
 * Serving Flask app '__main__'
 * Debug mode: off


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


Attempting to establish ngrok tunnel...
 * ngrok tunnel is live at: https://5fb0b932526e.ngrok-free.app
 * You can access the chatbot via this URL.

NOTE: Keep this cell running to keep the ngrok tunnel active. To stop, interrupt the kernel.


In [33]:
# Cell 7 - Configure ngrok authtoken and expose Flask app - Retry with Authtoken guidance
from pyngrok import ngrok
import threading
import time
import os

# Terminate any previous ngrok tunnels
ngrok.kill()

# --- IMPORTANT: Replace 'YOUR_NGROK_AUTHTOKEN' with your actual ngrok authtoken ---
# 1. Go to ngrok.com and sign up or log in.
# 2. Go to the 'Your Authtoken' page (usually under Setup & Installation).
# 3. Copy your authtoken.
# 4. Paste your authtoken below, replacing 'YOUR_NGROK_AUTHTOKEN'.
NGROK_AUTH_TOKEN = "31FKMpD0BYx5HaH142dOEAZOGuN_75D6PanqqWVcxqzergDGU" # <--- PASTE YOUR AUTHTOKEN HERE

# Alternatively, you can set the NGROK_AUTH_TOKEN environment variable
# NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_AUTHTOKEN")
# print(f"Using NGROK_AUTH_TOKEN: {NGROK_AUTH_TOKEN}")

if NGROK_AUTH_TOKEN == "YOUR_NGROK_AUTHTOKEN":
    print("ERROR: ngrok authtoken is NOT set. Please replace 'YOUR_NGROK_AUTHTOKEN' with your actual token from ngrok.com.")
    # Do not attempt to configure or connect if the placeholder is still there.
else:
    try:
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)
        print("ngrok authtoken configured.")

        # Define a function to run the Flask app
        def run_flask_app():
            # Use app.run() with debug=False for ngrok
            # Ensure 'app' is accessible, assuming it's defined in a previous cell's global scope
            global app
            if 'app' not in globals():
                print("Error: Flask app 'app' not found. Please ensure the cell defining the Flask app is run first.")
                return
            try:
                # Suppress Flask's default output when running with debug=False and ngrok
                import logging
                log = logging.getLogger('werkzeug')
                log.setLevel(logging.ERROR)
                app.run(host='0.0.0.0', port=5000, debug=False)
            except Exception as e:
                print(f"Error running Flask app: {e}")


        # Run the Flask app in a separate thread only if it's not already running
        # Check if the thread exists and is alive to prevent starting multiple times
        if 'thread' not in globals() or not thread.is_alive():
            print("Starting Flask app in a separate thread...")
            thread = threading.Thread(target=run_flask_app)
            thread.daemon = True # Allow the main program to exit even if the thread is running
            thread.start()
            print("Flask app thread started.")

            # Give the Flask app a moment to start
            time.sleep(3) # Increased sleep time

        else:
            print("Flask app thread is already running.")


        # Set up ngrok tunnel
        print("Attempting to establish ngrok tunnel...")
        try:
            # Connect to the Flask port
            public_url = ngrok.connect(5000).public_url
            print(f" * ngrok tunnel is live at: {public_url}")
            print(" * You can access the chatbot via this URL.")
            print("\nNOTE: Keep this cell running to keep the ngrok tunnel active. To stop, interrupt the kernel.")
        except Exception as e:
            print(f"Error starting ngrok tunnel: {e}")
            print("Could not establish ngrok tunnel. Ensure your authtoken is correct and the Flask app is running.")

    except Exception as e:
        print(f"An error occurred during ngrok setup: {e}")

ngrok authtoken configured.
Starting Flask app in a separate thread...
Flask app thread started.
 * Serving Flask app '__main__'
 * Debug mode: off


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


Attempting to establish ngrok tunnel...
 * ngrok tunnel is live at: https://70438a06a5f4.ngrok-free.app
 * You can access the chatbot via this URL.

NOTE: Keep this cell running to keep the ngrok tunnel active. To stop, interrupt the kernel.


In [26]:
# Cell 5 - Flask App
from flask import Flask, request, jsonify, render_template_string

# Assuming generate_response and sp are available from the previous cell
# from __main__ import generate_response, sp # This is not reliable in all notebook environments

# Define the Flask application instance
app = Flask(__name__)

# HTML template for the home page
HTML_TEMPLATE = """
<!doctype html>
<html>
<head><title>Bolly Chatbot</title></head>
<body>
    <h1>Bolly Chatbot</h1>
    <form id="chat-form">
        <input type="text" id="user-input" placeholder="Enter your message">
        <button type="submit">Send</button>
    </form>
    <div id="chat-output"></div>

    <script>
        document.getElementById('chat-form').onsubmit = async function(event) {
            event.preventDefault();
            const userInput = document.getElementById('user-input').value;
            const chatOutput = document.getElementById('chat-output');

            // Display user input
            chatOutput.innerHTML += `<p><strong>You:</strong> ${userInput}</p>`;

            // Clear input field
            document.getElementById('user-input').value = '';

            // Send message to backend
            const response = await fetch('/chat', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify({ message: userInput })
            });

            const data = await response.json();

            // Display bot response
            chatOutput.innerHTML += `<p><strong>Bot:</strong> ${data.response}</p>`;
        };
    </script>
</body>
</html>
"""

@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route('/chat', methods=['POST'])
def chat():
    user_message = request.json.get('message')
    if not user_message:
        return jsonify({"response": "Error: No message received."}), 400

    # Call the generate_response function
    # This function needs to be accessible in this scope.
    # In a real app, you might pass it or import it properly.
    # For this notebook context, we assume it's in the global scope.
    try:
        bot_response = generate_response(user_message)
    except Exception as e:
        # Log the error for debugging
        print(f"Error generating response: {e}")
        bot_response = "Sorry, I am unable to respond at the moment."

    return jsonify({"response": bot_response})

# To run the app in a notebook, you typically use ngrok or similar.
# We'll add the ngrok part in a separate cell to make it easier to stop and restart.

print("Flask app defined. Use ngrok in the next cell to expose it.")

Flask app defined. Use ngrok in the next cell to expose it.


In [27]:
# Cell 6 - Expose Flask app with ngrok
from pyngrok import ngrok
import threading
import time

# Terminate any previous ngrok tunnels
ngrok.kill()

# Define a function to run the Flask app
def run_flask_app():
    # Use app.run() with debug=False for ngrok
    app.run(host='0.0.0.0', port=5000, debug=False)

# Run the Flask app in a separate thread
thread = threading.Thread(target=run_flask_app)
thread.start()
print("Flask app is running in a separate thread.")

# Give the Flask app a moment to start
time.sleep(2)

# Set up ngrok tunnel
try:
    # Connect to the Flask port
    public_url = ngrok.connect(5000).public_url
    print(f" * ngrok tunnel is live at: {public_url}")
    print(" * You can access the chatbot via this URL.")
except Exception as e:
    print(f"Error starting ngrok: {e}")
    print("Could not establish ngrok tunnel. The Flask app might not be running or port 5000 is in use.")

 * Serving Flask app '__main__'
 * Debug mode: off
Flask app is running in a separate thread.


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.
ERROR:pyngrok.process.ngrok:t=2025-08-14T09:08:45+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified is an ngrok v1 authtoken, but you're using ngrok v2.\nYour authtoken: YOUR_NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_106\r\n"
ERROR:pyngrok.process.ngrok:t=2025-08-14T09:08:45+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: The authtoken you specified is an ngrok v1 authtoken, but you're using ngrok v2.\nYour authtoken: YOUR_NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_106\r\n"
ERROR:pyngrok.process.ngrok:t=2025-

Error starting ngrok: The ngrok process errored on start: authentication failed: The authtoken you specified is an ngrok v1 authtoken, but you're using ngrok v2.\nYour authtoken: YOUR_NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_106\r\n.
Could not establish ngrok tunnel. The Flask app might not be running or port 5000 is in use.


In [34]:
# Cell 2 - Collect and prepare data - Update for new dataset

# Assuming the new dataset is in the same /content/bolly_chatbot/data/ directory
# and might include multiple .srt files or similar text files.
# Modify the glob pattern if the new files have a different extension.

lines = []
# Updated glob pattern to potentially include more files
for p in DATA.glob("*.srt"): # Adjust pattern if necessary, e.g., "*.txt" or "*.*"
    print(f"Parsing file: {p}")
    lines += parse_srt_to_lines(p)

print("Lines found:", len(lines))

# Build pairs from the collected lines
pairs = build_pairs_from_lines(lines)
print("Pairs:", len(pairs))

if len(pairs) == 0:
    raise SystemExit("No pairs found. Ensure the new dataset files are in /content/bolly_chatbot/data/ and have the correct file extension.")

# Write the updated corpus
with open(CORPUS, "w", encoding="utf-8") as f:
    for a, b in pairs:
        f.write(a + "\n")
        f.write(b + "\n")
print("Wrote updated corpus:", CORPUS)

# Display the first few pairs to verify
print("\nSample pairs from the new dataset:")
for i, pair in enumerate(pairs[:5]):
    print(f"Pair {i+1}: {pair}")


Parsing file: /content/bolly_chatbot/data/sample_subs.srt
Lines found: 3
Pairs: 2
Wrote updated corpus: /content/bolly_chatbot/data/corpus.txt

Sample pairs from the new dataset:
Pair 1: ('क्या तुम मेरे साथ चलोगी?', 'क्यों नहीं, मैं आ रही हूँ।')
Pair 2: ('क्यों नहीं, मैं आ रही हूँ।', 'मुझे माफ कर दो, मेरी गलती थी।')


In [35]:
# Cell 3 - Train small SentencePiece and tiny seq2seq (fast) - Retry with updated data

# Ensure SentencePiece model is trained only if it doesn't exist or force retraining
# For retraining with new data, we should overwrite the existing model
sp_prefix = str(MODELS / "spm_bolly")
spm.SentencePieceTrainer.Train(f"--input={CORPUS} --model_prefix={sp_prefix} --vocab_size=128 --character_coverage=0.9995 --model_type=bpe --user_defined_symbols=<s>,</s>")
print("Trained SPM:", sp_prefix + ".model")


# prepare encoded data
sp = spm.SentencePieceProcessor(); sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize() # Update vocabulary size based on the newly trained model
print("SPM vocab size:", V)

def encode(s, maxlen=16):
    txt = "<s> " + s + " </s>"
    encoded_ids = sp.EncodeAsIds(txt)
    # Ensure truncation happens if needed
    return encoded_ids[:maxlen]

max_enc=16; max_dec=15
encs=[]; decins=[]; decouts=[]
for a,b in pairs:
    e = encode(a,max_enc)
    d = encode(b,max_dec+1)
    if len(d) < 2: continue # Skip empty or single-token decoder sequences
    encs.append(e)
    decins.append(d[:-1])
    decouts.append(d[1:])

# Convert to numpy arrays and pad
# Ensure there are sequences to process before padding
if not encs:
    raise SystemExit("No valid sequences generated from pairs. Check data and encoding.")

encs = pad_sequences(encs, maxlen=max_enc, padding='post')
decins = pad_sequences(decins, maxlen=max_dec, padding='post')
decouts = pad_sequences(decouts, maxlen=max_dec, padding='post')
decouts = np.expand_dims(decouts, -1) # Add a dimension for sparse_categorical_crossentropy

print(f"Prepared {len(encs)} encoded sequences.")
print("Encoder shape:", encs.shape)
print("Decoder input shape:", decins.shape)
print("Decoder output shape:", decouts.shape)


# define tiny seq2seq model
# Reuse model definition from previous attempts, ensuring layer names match
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model

emb_dim=48; units=48 # Use the same dimensions as before

# Encoder
enc_input = Input(shape=(max_enc,), name='enc_input')
# Use the updated vocabulary size V
enc_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = LSTM(units, return_state=True, name='encoder_lstm')
_, sh, sc = enc_lstm(emb_e) # Don't need the full sequence output from encoder LSTM

# Decoder
dec_input = Input(shape=(max_dec,), name='dec_input')
# Use the updated vocabulary size V
dec_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d = dec_emb_layer(dec_input)
# Add Reshape layer to explicitly set shape if needed (matching training structure)
# Check if this Reshape was truly necessary or caused warnings/errors before.
# Based on previous runs, the warning about mask loss suggests it might be
# better to remove if possible, but let's keep it for now to match the structure
# that successfully loaded weights previously.
# If the model doesn't train or predict correctly, this might be a place to revisit.
# Let's add the Reshape back as it was in the successful inference attempts.
reshaped_emb_d = Reshape((max_dec, emb_dim))(emb_d)

dec_lstm = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
# Initial state should be from the encoder
dec_outs, _, _ = dec_lstm(reshaped_emb_d, initial_state=[sh, sc])
# Use the updated vocabulary size V
dec_dense = Dense(V, activation='softmax', name='decoder_dense')
logits = dec_dense(dec_outs)

# Define the model
model = Model([enc_input, dec_input], logits)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print model summary to verify structure
model.summary()


print("Training tiny model...")
# Train the model on the new data
# Adjust epochs and batch size if needed based on dataset size and performance
model.fit([encs, decins], decouts, batch_size=4, epochs=10, verbose=2) # Increased epochs slightly


# save artifacts
# Save weights after successful training
model.save_weights(str(MODELS / "seq2seq_weights.weights.h5"))
# SentencePiece model is already saved during the SentencePieceTrainer.Train call
print("Saved weights to", MODELS / "seq2seq_weights.weights.h5")
print("SPM vocab size:", sp.GetPieceSize())

Trained SPM: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Prepared 2 encoded sequences.
Encoder shape: (2, 16)
Decoder input shape: (2, 15)
Decoder output shape: (2, 15, 1)




Training tiny model...
Epoch 1/10
1/1 - 5s - 5s/step - loss: 4.8486
Epoch 2/10
1/1 - 0s - 60ms/step - loss: 4.8419
Epoch 3/10
1/1 - 0s - 61ms/step - loss: 4.8350
Epoch 4/10
1/1 - 0s - 61ms/step - loss: 4.8279
Epoch 5/10
1/1 - 0s - 59ms/step - loss: 4.8204
Epoch 6/10
1/1 - 0s - 73ms/step - loss: 4.8122
Epoch 7/10
1/1 - 0s - 67ms/step - loss: 4.8033
Epoch 8/10
1/1 - 0s - 59ms/step - loss: 4.7935
Epoch 9/10
1/1 - 0s - 59ms/step - loss: 4.7825
Epoch 10/10
1/1 - 0s - 59ms/step - loss: 4.7700
Saved weights to /content/bolly_chatbot/models/seq2seq_weights.weights.h5
SPM vocab size: 128


In [36]:
# Cell 4 - Inference - Test with retrained model
# Reusing the inference code from previous attempts

# Load SentencePiece model
sp_prefix = str(MODELS / "spm_bolly")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize()
print("Loaded SPM model from:", sp_prefix + ".model")
print("SPM vocab size:", V)

# Model parameters (should match training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during training to load weights
# Ensure layer names match exactly
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
# Use the updated vocabulary size V
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
# Use the updated vocabulary size V
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer as used in training
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model
full_model.load_weights(str(MODELS / "seq2seq_weights.weights.h5"))
print("Loaded model weights into full model from:", MODELS / "seq2seq_weights.weights.h5")


# Now define the encoder model for inference
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")


# Define the inference function
def generate_response(input_sentence, max_length=max_dec):
    # Preprocess the input sentence
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function
print("\nTesting inference:")
# Use sample inputs relevant to the (potentially new) dataset
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो", "hello", "how are you?"]
for sample_input in sample_inputs:
    response = generate_response(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference:




Input: क्या तुम मेरे साथ चलोगी?
Response: ▁मैं<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: ▁मैं▁मैं<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: hello
Response: ▁मैं▁मैं<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: how are you?
Response: ▁मैं▁मैं<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>



In [37]:
# Cell 8 - Train new SentencePiece model for combined Hinglish/English corpus
from pathlib import Path
import sentencepiece as spm
import os

# Define paths - assuming corpus.txt now contains combined data from previous step
BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"
CORPUS = BASE / "data" / "corpus.txt" # Ensure this points to the combined corpus file

# Ensure the models directory exists
MODELS.mkdir(exist_ok=True)

# Define output path and prefix for the new SentencePiece model
sp_prefix = str(MODELS / "spm_bolly_combined") # New prefix for the larger model

# Define a larger vocabulary size
new_vocab_size = 2000 # Or 4000, or more, depending on dataset size

# Train SentencePiece (larger vocab for combined data)
print(f"Training SentencePiece model with vocab size {new_vocab_size} on {CORPUS}...")

# Ensure SentencePiece model is trained only if it doesn't exist
if not (Path(sp_prefix + ".model")).exists():
    try:
        spm.SentencePieceTrainer.Train(
            f"--input={CORPUS} "
            f"--model_prefix={sp_prefix} "
            f"--vocab_size={new_vocab_size} "
            f"--character_coverage=0.9995 "
            f"--model_type=bpe "
            f"--user_defined_symbols=<s>,</s>"
        )
        print("Trained SPM:", sp_prefix + ".model")
        print("SPM vocabulary size:", spm.SentencePieceProcessor().Load(sp_prefix + ".model").GetPieceSize())
    except Exception as e:
        print(f"Error during SentencePiece training: {e}")
else:
    print("SentencePiece model already exists:", sp_prefix + ".model")


Training SentencePiece model with vocab size 2000 on /content/bolly_chatbot/data/corpus.txt...
Error during SentencePiece training: Internal: src/trainer_interface.cc(662) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (2000). Please set it to a value <= 129.


In [38]:
# Cell 8 - Train new SentencePiece model for combined Hinglish/English corpus - Retry with smaller vocab size
from pathlib import Path
import sentencepiece as spm
import os

# Define paths - assuming corpus.txt now contains combined data from previous step
BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"
CORPUS = BASE / "data" / "corpus.txt" # Ensure this points to the combined corpus file

# Ensure the models directory exists
MODELS.mkdir(exist_ok=True)

# Define output path and prefix for the new SentencePiece model
sp_prefix = str(MODELS / "spm_bolly_combined") # New prefix for the larger model

# Define a smaller, acceptable vocabulary size based on the error message
# The error suggested <= 129. Let's use 128 as it was used previously and worked.
new_vocab_size = 128

# Train SentencePiece (smaller vocab based on error)
print(f"Retrying SentencePiece model training with vocab size {new_vocab_size} on {CORPUS}...")

# Ensure SentencePiece model is trained only if it doesn't exist
# Remove the existing model files if they are from a failed attempt with the same prefix
model_file = Path(sp_prefix + ".model")
vocab_file = Path(sp_prefix + ".vocab")
if model_file.exists():
    print(f"Removing existing model file: {model_file}")
    model_file.unlink()
if vocab_file.exists():
    print(f"Removing existing vocab file: {vocab_file}")
    vocab_file.unlink()


try:
    spm.SentencePieceTrainer.Train(
        f"--input={CORPUS} "
        f"--model_prefix={sp_prefix} "
        f"--vocab_size={new_vocab_size} "
        f"--character_coverage=0.9995 "
        f"--model_type=bpe "
        f"--user_defined_symbols=<s>,</s>"
    )
    print("Trained SPM:", sp_prefix + ".model")
    # Load the trained model to get the actual vocabulary size
    sp = spm.SentencePieceProcessor()
    sp.Load(sp_prefix + ".model")
    print("SPM vocabulary size:", sp.GetPieceSize())
except Exception as e:
    print(f"Error during SentencePiece training: {e}")


Retrying SentencePiece model training with vocab size 128 on /content/bolly_chatbot/data/corpus.txt...
Trained SPM: /content/bolly_chatbot/models/spm_bolly_combined.model
SPM vocabulary size: 128


In [39]:
# Cell 9 - Train seq2seq model on new combined data

# Define paths - assuming corpus.txt is updated and spm_bolly_combined.* exist
BASE = Path("/content/bolly_chatbot")
DATA = BASE / "data"
MODELS = BASE / "models"
CORPUS = DATA / "corpus.txt" # Should point to the combined corpus

# Load the new SentencePiece model
sp_prefix_combined = str(MODELS / "spm_bolly_combined")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix_combined + ".model")
V = sp.GetPieceSize() # Get vocabulary size from the new model
print("Loaded SPM model from:", sp_prefix_combined + ".model")
print("SPM vocabulary size:", V)


# Model parameters (can keep same as tiny model for now, adjust if needed)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

print(f"Model parameters: max_enc={max_enc}, max_dec={max_dec}, emb_dim={emb_dim}, units={units}, vocab_size={V}")

# Load the conversation pairs from the combined corpus
lines = []
# Re-read the corpus to get lines
with open(CORPUS, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Rebuild pairs from lines - assuming corpus.txt is line-by-line
pairs = []
for i in range(0, len(lines), 2):
    if i + 1 < len(lines):
        pairs.append((lines[i], lines[i+1]))

print("Pairs loaded from corpus:", len(pairs))
if len(pairs) == 0:
    raise SystemExit("No pairs found in corpus.txt. Ensure the file is correctly formatted.")


# Encode the conversation pairs
def encode(s, maxlen):
    txt = "<s> " + s + " </s>"
    encoded_ids = sp.EncodeAsIds(txt)
    # Truncate if longer than maxlen
    return encoded_ids[:maxlen]

encs = []
decins = []
decouts = []

for a, b in pairs:
    e = encode(a, max_enc)
    # Decoder input needs one less token than decoder output for shifted sequence
    d_in = encode(b, max_dec)
    d_out = encode(b, max_dec) # Max length for output sequence

    # Ensure sequence is not empty after encoding/truncation
    if len(e) == 0 or len(d_in) < 1 or len(d_out) < 1:
         continue

    # Shift decoder output by one token
    # Decoder input sequence should not contain the </s> token
    # Decoder output sequence should not contain the <s> token
    # Pad d_in and d_out to max_dec
    encs.append(e)
    decins.append(d_in)
    decouts.append(d_out)


# Pad sequences
encs = pad_sequences(encs, maxlen=max_enc, padding='post')
decins = pad_sequences(decins, maxlen=max_dec, padding='post')
decouts = pad_sequences(decouts, maxlen=max_dec, padding='post')

# Reshape decouts for sparse_categorical_crossentropy
decouts = np.expand_dims(decouts, -1)

print(f"Encoded samples: encs shape={encs.shape}, decins shape={decins.shape}, decouts shape={decouts.shape}")


# Define the seq2seq model architecture (ensure layer names match previous training)
# Encoder
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = LSTM(units, return_state=True, name='encoder_lstm')
_, sh, sc = enc_lstm(emb_e)

# Decoder
dec_input = Input(shape=(max_dec,), name='dec_input')
dec_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d = dec_emb_layer(dec_input)
# Add Reshape layer to explicitly set shape to (max_dec, emb_dim)
reshaped_emb_d = Reshape((max_dec, emb_dim))(emb_d)
dec_lstm = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs, _, _ = dec_lstm(reshaped_emb_d, initial_state=[sh, sc])
dec_dense = Dense(V, activation='softmax', name='decoder_dense')
logits = dec_dense(dec_outs)

model_combined = Model([enc_input, dec_input], logits)

# Compile the model
model_combined.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

print("Training seq2seq model on combined data...")
# Train the model
# Use a larger number of epochs and batch size if the dataset is significantly larger
history = model_combined.fit([encs, decins], decouts, batch_size=32, epochs=10, verbose=1)

# Save the trained model's weights
weights_path_combined = MODELS / "seq2seq_combined_weights.weights.h5"
model_combined.save_weights(str(weights_path_combined))
print("Saved trained model weights to:", weights_path_combined)

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly_combined.model
SPM vocabulary size: 128
Model parameters: max_enc=16, max_dec=15, emb_dim=48, units=48, vocab_size=128
Pairs loaded from corpus: 2
Encoded samples: encs shape=(2, 16), decins shape=(2, 15), decouts shape=(2, 15, 1)
Training seq2seq model on combined data...
Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 4.8536
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 4.8481
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 4.8424
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 4.8365
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 4.8303
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 4.8237
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 4.8166
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 4.8088
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - loss: 4.8001
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 4.7904
Saved trained model weights to: 

In [41]:
# Cell 10 - Inference with the new combined model
from pathlib import Path
import numpy as np
import sentencepiece as spm
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define paths
BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"

# Load the newly trained SentencePiece model (step 1)
sp_prefix_combined = str(MODELS / "spm_bolly_combined")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix_combined + ".model")
V = sp.GetPieceSize() # Get vocabulary size from the new model
print("Loaded SPM model from:", sp_prefix_combined + ".model")
print("SPM vocabulary size:", V)

# Model parameters (should match combined model training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during combined model training to load weights (step 2)
# Ensure layer names match exactly
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
# Use the updated vocabulary size V
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
# Use the updated vocabulary size V
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer to explicitly set shape to (max_dec, emb_dim)
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model_combined = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model (step 2)
weights_path_combined = MODELS / "seq2seq_combined_weights.weights.h5"
full_model_combined.load_weights(str(weights_path_combined))
print("Loaded model weights into full model from:", weights_path_combined)


# Define the encoder model for inference (step 3)
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model_combined.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model_combined.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model_combined = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference (step 3)
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model_combined.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model_combined.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model_combined.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model_combined = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")

# Define the inference function (step 4)
def generate_response_combined(input_sentence, max_length=max_dec):
    # Preprocess the input sentence using the combined SentencePiece model
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder (using the combined encoder model)
    states_value = encoder_model_combined.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        # Predict the next token (using the combined decoder model)
        output_tokens, h, c = decoder_model_combined.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case and convert ID to piece
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function with relevant inputs
print("\nTesting inference with combined model:")
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो", "hello", "how are you?"] # Include English/Hinglish examples
for sample_input in sample_inputs:
    response = generate_response_combined(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")


Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly_combined.model
SPM vocabulary size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_combined_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference with combined model:




Input: क्या तुम मेरे साथ चलोगी?
Response: ▁▁▁<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: ▁मुझे▁मुझे▁मुझे▁मुझे▁मुझे<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: hello
Response: ▁मुझे▁मुझे▁मुझे▁मुझे▁मुझे<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: how are you?
Response: ▁▁▁<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>



In [42]:
# Cell 2 - Collect and prepare data - Update for new dataset

# Assuming the new dataset is in the same /content/bolly_chatbot/data/ directory
# and might include multiple .srt files or similar text files.
# Modify the glob pattern if the new files have a different extension.
from pathlib import Path
BASE = Path("/content/bolly_chatbot")
DATA = BASE / "data"
MODELS = BASE / "models"
CORPUS = DATA / "corpus.txt" # Changed to DATA / "corpus.txt"

# import helpers
import sys
# Ensure the path is only appended if not already present
if str(BASE) not in sys.path:
    sys.path.append(str(BASE))
from utils import parse_srt_to_lines, build_pairs_from_lines


lines = []
# Updated glob pattern to potentially include more files
for p in DATA.glob("*.srt"): # Adjust pattern if necessary, e.g., "*.txt" or "*.*"
    print(f"Parsing file: {p}")
    lines += parse_srt_to_lines(p)

print("Lines found:", len(lines))

# Build pairs from the collected lines
pairs = build_pairs_from_lines(lines)
print("Pairs:", len(pairs))

if len(pairs) == 0:
    raise SystemExit("No pairs found. Ensure the new dataset files are in /content/bolly_chatbot/data/ and have the correct file extension.")

# Write the updated corpus
with open(CORPUS, "w", encoding="utf-8") as f:
    for a, b in pairs:
        f.write(a + "\n")
        f.write(b + "\n")
print("Wrote updated corpus:", CORPUS)

# Display the first few pairs to verify
print("\nSample pairs from the new dataset:")
for i, pair in enumerate(pairs[:5]):
    print(f"Pair {i+1}: {pair}")

Parsing file: /content/bolly_chatbot/data/sample_subs.srt
Lines found: 3
Pairs: 2
Wrote updated corpus: /content/bolly_chatbot/data/corpus.txt

Sample pairs from the new dataset:
Pair 1: ('क्या तुम मेरे साथ चलोगी?', 'क्यों नहीं, मैं आ रही हूँ।')
Pair 2: ('क्यों नहीं, मैं आ रही हूँ।', 'मुझे माफ कर दो, मेरी गलती थी।')


In [43]:
# Cell 3 - Train small SentencePiece and tiny seq2seq (fast) - Retry with updated data

# Ensure SentencePiece model is trained only if it doesn't exist or force retraining
# For retraining with new data, we should overwrite the existing model
sp_prefix = str(MODELS / "spm_bolly")
spm.SentencePieceTrainer.Train(f"--input={CORPUS} --model_prefix={sp_prefix} --vocab_size=128 --character_coverage=0.9995 --model_type=bpe --user_defined_symbols=<s>,</s>")
print("Trained SPM:", sp_prefix + ".model")


# prepare encoded data
sp = spm.SentencePieceProcessor(); sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize() # Update vocabulary size based on the newly trained model
print("SPM vocab size:", V)

def encode(s, maxlen=16):
    txt = "<s> " + s + " </s>"
    encoded_ids = sp.EncodeAsIds(txt)
    # Ensure truncation happens if needed
    return encoded_ids[:maxlen]

max_enc=16; max_dec=15
encs=[]; decins=[]; decouts=[]
for a,b in pairs:
    e = encode(a,max_enc)
    d = encode(b,max_dec+1)
    if len(d) < 2: continue # Skip empty or single-token decoder sequences
    encs.append(e)
    decins.append(d[:-1])
    decouts.append(d[1:])

# Convert to numpy arrays and pad
# Ensure there are sequences to process before padding
if not encs:
    raise SystemExit("No valid sequences generated from pairs. Check data and encoding.")

encs = pad_sequences(encs, maxlen=max_enc, padding='post')
decins = pad_sequences(decins, maxlen=max_dec, padding='post')
decouts = pad_sequences(decouts, maxlen=max_dec, padding='post')
decouts = np.expand_dims(decouts, -1) # Add a dimension for sparse_categorical_crossentropy

print(f"Prepared {len(encs)} encoded sequences.")
print("Encoder shape:", encs.shape)
print("Decoder input shape:", decins.shape)
print("Decoder output shape:", decouts.shape)


# define tiny seq2seq model
# Reuse model definition from previous attempts, ensuring layer names match
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model

emb_dim=48; units=48 # Use the same dimensions as before

# Encoder
enc_input = Input(shape=(max_enc,), name='enc_input')
# Use the updated vocabulary size V
enc_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = LSTM(units, return_state=True, name='encoder_lstm')
_, sh, sc = enc_lstm(emb_e) # Don't need the full sequence output from encoder LSTM

# Decoder
dec_input = Input(shape=(max_dec,), name='dec_input')
# Use the updated vocabulary size V
dec_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d = dec_emb_layer(dec_input)
# Add Reshape layer to explicitly set shape if needed (matching training structure)
# Check if this Reshape was truly necessary or caused warnings/errors before.
# Based on previous runs, the warning about mask loss suggests it might be
# better to remove if possible, but let's keep it for now to match the structure
# that successfully loaded weights previously.
# If the model doesn't train or predict correctly, this might be a place to revisit.
# Let's add the Reshape back as it was in the successful inference attempts.
reshaped_emb_d = Reshape((max_dec, emb_dim))(emb_d)

dec_lstm = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
# Initial state should be from the encoder
dec_outs, _, _ = dec_lstm(reshaped_emb_d, initial_state=[sh, sc])
# Use the updated vocabulary size V
dec_dense = Dense(V, activation='softmax', name='decoder_dense')
logits = dec_dense(dec_outs)

# Define the model
model = Model([enc_input, dec_input], logits)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print model summary to verify structure
model.summary()


print("Training tiny model...")
# Train the model on the new data
# Adjust epochs and batch size if needed based on dataset size and performance
model.fit([encs, decins], decouts, batch_size=4, epochs=10, verbose=2) # Increased epochs slightly


# save artifacts
# Save weights after successful training
model.save_weights(str(MODELS / "seq2seq_weights.weights.h5"))
# SentencePiece model is already saved during the SentencePieceTrainer.Train call
print("Saved weights to", MODELS / "seq2seq_weights.weights.h5")
print("SPM vocab size:", sp.GetPieceSize())

Trained SPM: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Prepared 2 encoded sequences.
Encoder shape: (2, 16)
Decoder input shape: (2, 15)
Decoder output shape: (2, 15, 1)




Training tiny model...
Epoch 1/10
1/1 - 4s - 4s/step - loss: 4.8507
Epoch 2/10
1/1 - 0s - 57ms/step - loss: 4.8441
Epoch 3/10
1/1 - 0s - 56ms/step - loss: 4.8374
Epoch 4/10
1/1 - 0s - 63ms/step - loss: 4.8304
Epoch 5/10
1/1 - 0s - 72ms/step - loss: 4.8230
Epoch 6/10
1/1 - 0s - 59ms/step - loss: 4.8150
Epoch 7/10
1/1 - 0s - 60ms/step - loss: 4.8062
Epoch 8/10
1/1 - 0s - 56ms/step - loss: 4.7966
Epoch 9/10
1/1 - 0s - 62ms/step - loss: 4.7858
Epoch 10/10
1/1 - 0s - 59ms/step - loss: 4.7737
Saved weights to /content/bolly_chatbot/models/seq2seq_weights.weights.h5
SPM vocab size: 128


In [44]:
# Cell 4 - Inference - Test with retrained model
# Reusing the inference code from previous attempts

# Load SentencePiece model
sp_prefix = str(MODELS / "spm_bolly")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix + ".model")
V = sp.GetPieceSize()
print("Loaded SPM model from:", sp_prefix + ".model")
print("SPM vocab size:", V)

# Model parameters (should match training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during training to load weights
# Ensure layer names match exactly
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
# Use the updated vocabulary size V
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
# Use the updated vocabulary size V
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer as used in training
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model
full_model.load_weights(str(MODELS / "seq2seq_weights.weights.h5"))
print("Loaded model weights into full model from:", MODELS / "seq2seq_weights.weights.h5")


# Now define the encoder model for inference
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")


# Define the inference function
def generate_response(input_sentence, max_length=max_dec):
    # Preprocess the input sentence
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function
print("\nTesting inference:")
# Use sample inputs relevant to the (potentially new) dataset
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो", "hello", "how are you?"]
for sample_input in sample_inputs:
    response = generate_response(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly.model
SPM vocab size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference:




Input: क्या तुम मेरे साथ चलोगी?
Response: <s><s><s><s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: <s><s><s><s>▁मैं▁हूँ<unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: hello
Response: <s><s><s><s>▁मैं▁हूँ<unk><unk><unk><unk><unk><unk><unk><unk><unk>

Input: how are you?
Response: <s><s><s><s><s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>



In [45]:
# Cell 8 - Train new SentencePiece model for combined Hinglish/English corpus
from pathlib import Path
import sentencepiece as spm
import os

# Define paths - assuming corpus.txt now contains combined data from previous step
BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"
CORPUS = BASE / "data" / "corpus.txt" # Ensure this points to the combined corpus file

# Ensure the models directory exists
MODELS.mkdir(exist_ok=True)

# Define output path and prefix for the new SentencePiece model
sp_prefix = str(MODELS / "spm_bolly_combined") # New prefix for the larger model

# Define a larger vocabulary size
new_vocab_size = 2000 # Or 4000, or more, depending on dataset size

# Train SentencePiece (larger vocab for combined data)
print(f"Training SentencePiece model with vocab size {new_vocab_size} on {CORPUS}...")

# Ensure SentencePiece model is trained only if it doesn't exist
if not (Path(sp_prefix + ".model")).exists():
    try:
        spm.SentencePieceTrainer.Train(
            f"--input={CORPUS} "
            f"--model_prefix={sp_prefix} "
            f"--vocab_size={new_vocab_size} "
            f"--character_coverage=0.9995 "
            f"--model_type=bpe "
            f"--user_defined_symbols=<s>,</s>"
        )
        print("Trained SPM:", sp_prefix + ".model")
        print("SPM vocabulary size:", spm.SentencePieceProcessor().Load(sp_prefix + ".model").GetPieceSize())
    except Exception as e:
        print(f"Error during SentencePiece training: {e}")
else:
    print("SentencePiece model already exists:", sp_prefix + ".model")

Training SentencePiece model with vocab size 2000 on /content/bolly_chatbot/data/corpus.txt...
SentencePiece model already exists: /content/bolly_chatbot/models/spm_bolly_combined.model


In [46]:
# Cell 9 - Train seq2seq model on new combined data

# Define paths - assuming corpus.txt is updated and spm_bolly_combined.* exist
BASE = Path("/content/bolly_chatbot")
DATA = BASE / "data"
MODELS = BASE / "models"
CORPUS = DATA / "corpus.txt" # Should point to the combined corpus

# Load the new SentencePiece model
sp_prefix_combined = str(MODELS / "spm_bolly_combined")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix_combined + ".model")
V = sp.GetPieceSize() # Get vocabulary size from the new model
print("Loaded SPM model from:", sp_prefix_combined + ".model")
print("SPM vocabulary size:", V)


# Model parameters (can keep same as tiny model for now, adjust if needed)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

print(f"Model parameters: max_enc={max_enc}, max_dec={max_dec}, emb_dim={emb_dim}, units={units}, vocab_size={V}")

# Load the conversation pairs from the combined corpus
lines = []
# Re-read the corpus to get lines
with open(CORPUS, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Rebuild pairs from lines - assuming corpus.txt is line-by-line
pairs = []
for i in range(0, len(lines), 2):
    if i + 1 < len(lines):
        pairs.append((lines[i], lines[i+1]))

print("Pairs loaded from corpus:", len(pairs))
if len(pairs) == 0:
    raise SystemExit("No pairs found in corpus.txt. Ensure the file is correctly formatted.")


# Encode the conversation pairs
def encode(s, maxlen):
    txt = "<s> " + s + " </s>"
    encoded_ids = sp.EncodeAsIds(txt)
    # Truncate if longer than maxlen
    return encoded_ids[:maxlen]

encs = []
decins = []
decouts = []

for a, b in pairs:
    e = encode(a, max_enc)
    # Decoder input needs one less token than decoder output for shifted sequence
    d_in = encode(b, max_dec)
    d_out = encode(b, max_dec) # Max length for output sequence

    # Ensure sequence is not empty after encoding/truncation
    if len(e) == 0 or len(d_in) < 1 or len(d_out) < 1:
         continue

    # Shift decoder output by one token
    # Decoder input sequence should not contain the </s> token
    # Decoder output sequence should not contain the <s> token
    # Pad d_in and d_out to max_dec
    encs.append(e)
    decins.append(d_in)
    decouts.append(d_out)


# Pad sequences
encs = pad_sequences(encs, maxlen=max_enc, padding='post')
decins = pad_sequences(decins, maxlen=max_dec, padding='post')
decouts = pad_sequences(decouts, maxlen=max_dec, padding='post')

# Reshape decouts for sparse_categorical_crossentropy
decouts = np.expand_dims(decouts, -1)

print(f"Encoded samples: encs shape={encs.shape}, decins shape={decins.shape}, decouts shape={decouts.shape}")


# Define the seq2seq model architecture (ensure layer names match previous training)
# Encoder
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = LSTM(units, return_state=True, name='encoder_lstm')
_, sh, sc = enc_lstm(emb_e)

# Decoder
dec_input = Input(shape=(max_dec,), name='dec_input')
dec_emb_layer = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d = dec_emb_layer(dec_input)
# Add Reshape layer to explicitly set shape to (max_dec, emb_dim)
reshaped_emb_d = Reshape((max_dec, emb_dim))(emb_d)
dec_lstm = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs, _, _ = dec_lstm(reshaped_emb_d, initial_state=[sh, sc])
dec_dense = Dense(V, activation='softmax', name='decoder_dense')
logits = dec_dense(dec_outs)

model_combined = Model([enc_input, dec_input], logits)

# Compile the model
model_combined.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

print("Training seq2seq model on combined data...")
# Train the model
# Use a larger number of epochs and batch size if the dataset is significantly larger
history = model_combined.fit([encs, decins], decouts, batch_size=32, epochs=10, verbose=1)

# Save the trained model's weights
weights_path_combined = MODELS / "seq2seq_combined_weights.weights.h5"
model_combined.save_weights(str(weights_path_combined))
print("Saved trained model weights to:", weights_path_combined)

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly_combined.model
SPM vocabulary size: 128
Model parameters: max_enc=16, max_dec=15, emb_dim=48, units=48, vocab_size=128
Pairs loaded from corpus: 2
Encoded samples: encs shape=(2, 16), decins shape=(2, 15), decouts shape=(2, 15, 1)
Training seq2seq model on combined data...
Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 4.8510
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 4.8446
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 4.8380
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 4.8311
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 4.8236
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 4.8155
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 4.8066
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 4.7966
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 4.7852
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 4.7723
Saved trained model weights to: 

In [47]:
# Cell 10 - Inference with the new combined model
from pathlib import Path
import numpy as np
import sentencepiece as spm
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define paths
BASE = Path("/content/bolly_chatbot")
MODELS = BASE / "models"

# Load the newly trained SentencePiece model (step 1)
sp_prefix_combined = str(MODELS / "spm_bolly_combined")
sp = spm.SentencePieceProcessor()
sp.Load(sp_prefix_combined + ".model")
V = sp.GetPieceSize() # Get vocabulary size from the new model
print("Loaded SPM model from:", sp_prefix_combined + ".model")
print("SPM vocabulary size:", V)

# Model parameters (should match combined model training)
max_enc = 16
max_dec = 15
emb_dim = 48
units = 48

# Recreate the full model structure used during combined model training to load weights (step 2)
# Ensure layer names match exactly
# Encoder
enc_input_full = Input(shape=(max_enc,), name='enc_input')
# Use the updated vocabulary size V
enc_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='enc_embedding')
emb_e_full = enc_emb_layer_full(enc_input_full)
enc_lstm_full = LSTM(units, return_state=True, name='encoder_lstm')
enc_outputs_full, sh_full, sc_full = enc_lstm_full(emb_e_full)

# Decoder
dec_input_full = Input(shape=(max_dec,), name='dec_input')
# Use the updated vocabulary size V
dec_emb_layer_full = Embedding(V, emb_dim, mask_zero=True, name='dec_embedding')
emb_d_full = dec_emb_layer_full(dec_input_full)
# Add Reshape layer to explicitly set shape to (max_dec, emb_dim)
reshaped_emb_d_full = Reshape((max_dec, emb_dim))(emb_d_full)
dec_lstm_full = LSTM(units, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outs_full, _, _ = dec_lstm_full(reshaped_emb_d_full, initial_state=[sh_full, sc_full])
dec_dense_full = Dense(V, activation='softmax', name='decoder_dense')
logits_full = dec_dense_full(dec_outs_full)

# Full model for loading weights
full_model_combined = Model([enc_input_full, dec_input_full], logits_full)

# Load trained weights into the full model (step 2)
weights_path_combined = MODELS / "seq2seq_combined_weights.weights.h5"
full_model_combined.load_weights(str(weights_path_combined))
print("Loaded model weights into full model from:", weights_path_combined)


# Define the encoder model for inference (step 3)
enc_input = Input(shape=(max_enc,), name='enc_input')
enc_emb_layer = full_model_combined.get_layer('enc_embedding')
emb_e = enc_emb_layer(enc_input)
enc_lstm = full_model_combined.get_layer('encoder_lstm')
enc_outputs, state_h, state_c = enc_lstm(emb_e)
encoder_model_combined = Model(enc_input, [state_h, state_c])
print("Defined encoder model for inference")

# Define the decoder model for inference (step 3)
dec_input = Input(shape=(1,), name='dec_input') # Decoder input is one token at a time
dec_state_h = Input(shape=(units,), name='dec_state_h_input')
dec_state_c = Input(shape=(units,), name='dec_state_c_input')
dec_states_inputs = [dec_state_h, dec_state_c]

dec_emb_layer = full_model_combined.get_layer('dec_embedding') # Reuse embedding layer
emb_d = dec_emb_layer(dec_input)

dec_lstm = full_model_combined.get_layer('decoder_lstm') # Reuse LSTM layer
dec_outputs, state_h_out, state_c_out = dec_lstm(emb_d, initial_state=dec_states_inputs)
dec_states_outputs = [state_h_out, state_c_out]

dec_dense = full_model_combined.get_layer('decoder_dense') # Reuse dense layer
output_tokens = dec_dense(dec_outputs)

decoder_model_combined = Model([dec_input] + dec_states_inputs, [output_tokens] + dec_states_outputs)
print("Defined decoder model for inference")

# Define the inference function (step 4)
def generate_response_combined(input_sentence, max_length=max_dec):
    # Preprocess the input sentence using the combined SentencePiece model
    input_seq = sp.EncodeAsIds("<s> " + input_sentence + " </s>")
    input_seq = pad_sequences([input_seq], maxlen=max_enc, padding='post')

    # Get the initial states from the encoder (using the combined encoder model)
    states_value = encoder_model_combined.predict(input_seq, verbose=0)

    # Start the decoder with the start token
    start_token = sp.PieceToId("<s>")
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    # Sampling loop for a batch of sequences
    decoded_sentence = []
    for _ in range(max_length):
        # Predict the next token (using the combined decoder model)
        output_tokens, h, c = decoder_model_combined.predict([target_seq] + states_value, verbose=0)

        # Sample a token (greedy approach for simplicity)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Add a check to ensure the sampled index is within the valid range
        if sampled_token_index < 0 or sampled_token_index >= V:
            print(f"Warning: Sampled token index {sampled_token_index} is out of vocabulary range [0, {V-1}]. Stopping generation.")
            break

        # Explicitly cast to int just in case and convert ID to piece
        sampled_token = sp.IdToPiece(int(sampled_token_index))

        # Exit condition: hitting stop character or max length
        if sampled_token == "</s>":
            break

        # Append token
        decoded_sentence.append(sampled_token)

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Join the tokens to form the sentence, handling SentencePiece subwords
    return "".join(decoded_sentence).replace(" ", " ").strip()

# Test the inference function with relevant inputs
print("\nTesting inference with combined model:")
sample_inputs = ["क्या तुम मेरे साथ चलोगी?", "मुझे माफ कर दो", "hello", "how are you?"] # Include English/Hinglish examples
for sample_input in sample_inputs:
    response = generate_response_combined(sample_input)
    print(f"Input: {sample_input}\nResponse: {response}\n")

Loaded SPM model from: /content/bolly_chatbot/models/spm_bolly_combined.model
SPM vocabulary size: 128
Loaded model weights into full model from: /content/bolly_chatbot/models/seq2seq_combined_weights.weights.h5
Defined encoder model for inference
Defined decoder model for inference

Testing inference with combined model:




Input: क्या तुम मेरे साथ चलोगी?
Response: ▁▁▁▁▁▁▁▁<unk><unk><unk><unk><unk><unk><unk>

Input: मुझे माफ कर दो
Response: ▁▁▁▁▁▁▁▁<unk><unk><unk><unk><unk><unk><unk>

Input: hello
Response: ▁▁▁▁▁▁▁▁<unk><unk><unk><unk><unk><unk><unk>

Input: how are you?
Response: ▁▁▁▁▁▁▁▁<unk><unk><unk><unk><unk><unk><unk>

