In [6]:
# -*- coding: utf-8 -*-
"""
Advanced Chatbot with LSTM (Google Colab Version)
"""
# Install required packages
!pip install gradio
!wget https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip cornell_movie_dialogs_corpus.zip

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import re
import gradio as gr

# Configuration
MAX_SAMPLES = 100000  # Reduce if memory issues
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 256
LSTM_UNITS = 512
BATCH_SIZE = 64
EPOCHS = 30
MAX_LENGTH = 20

# Load and process dataset
def load_conversations():
    lines = open('cornell movie-dialogs corpus/movie_lines.txt',
                encoding='utf-8', errors='ignore').read().split('\n')
    conv_lines = open('cornell movie-dialogs corpus/movie_conversations.txt',
                     encoding='utf-8', errors='ignore').read().split('\n')

    # Create dictionary of line IDs to text
    id2line = {}
    for line in lines:
        parts = line.split(' +++$+++ ')
        if len(parts) == 5:
            id2line[parts[0]] = parts[4]

    # Create list of conversations
    conversations = []
    for conv in conv_lines[:-1]:
        parts = conv.split(' +++$+++ ')[-1][1:-1].replace("'", "").split(', ')
        conversations.append(parts)

    # Create question-answer pairs
    questions = []
    answers = []
    for conv in conversations:
        for i in range(len(conv)-1):
            questions.append(id2line[conv[i]])
            answers.append(id2line[conv[i+1]])
            if len(questions) >= MAX_SAMPLES:
                return questions, answers
    return questions, answers

questions, answers = load_conversations()

# Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

questions = [preprocess_text(q) for q in questions]
answers = [preprocess_text(a) for a in answers]

# Add start and end tokens to answers
answers = ['<start> ' + a + ' <end>' for a in answers]

# Tokenization
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='', oov_token='<OOV>')
tokenizer.fit_on_texts(questions + answers)

# Sequence conversion and padding
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

question_sequences = pad_sequences(question_sequences, maxlen=MAX_LENGTH, padding='post')
answer_sequences = pad_sequences(answer_sequences, maxlen=MAX_LENGTH+2, padding='post')  # +2 for start/end tokens

# Prepare decoder input and output
decoder_input_data = answer_sequences[:, :-1]
decoder_target_data = answer_sequences[:, 1:]

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM)(encoder_inputs)
enc_lstm = Bidirectional(LSTM(LSTM_UNITS, return_state=True))
_, forward_h, forward_c, backward_h, backward_c = enc_lstm(enc_emb)
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM)
dec_emb_output = dec_emb(decoder_inputs)

decoder_lstm = LSTM(LSTM_UNITS*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb_output, initial_state=encoder_states)

decoder_dense = Dense(MAX_VOCAB_SIZE, activation='softmax')
output = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [7]:
EPOCHS=20

In [8]:
# Training
history = model.fit(
    [question_sequences, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)


Epoch 1/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 234ms/step - accuracy: 0.5815 - loss: 2.9697 - val_accuracy: 0.6216 - val_loss: 2.4174
Epoch 2/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 237ms/step - accuracy: 0.6221 - loss: 2.3189 - val_accuracy: 0.6291 - val_loss: 2.3404
Epoch 3/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 252ms/step - accuracy: 0.6300 - loss: 2.1759 - val_accuracy: 0.6324 - val_loss: 2.3193
Epoch 4/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 237ms/step - accuracy: 0.6380 - loss: 2.0404 - val_accuracy: 0.6337 - val_loss: 2.3275
Epoch 5/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 253ms/step - accuracy: 0.6431 - loss: 1.9191 - val_accuracy: 0.6335 - val_loss: 2.3554
Epoch 6/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 253ms/step - accuracy: 0.6530 - loss: 1.7870 - val_accuracy: 0.6321 - val_loss:

In [9]:
# Inference setup
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(LSTM_UNITS*2,))
decoder_state_input_c = Input(shape=(LSTM_UNITS*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Chat functions
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '<OOV>')

        if sampled_word != '<end>' and len(decoded_sentence.split()) < MAX_LENGTH:
            decoded_sentence += ' ' + sampled_word

        if sampled_word == '<end>' or len(decoded_sentence.split()) >= MAX_LENGTH:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

def respond(message):
    message = preprocess_text(message)
    seq = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post')
    response = decode_sequence(padded)
    return response


In [11]:
import pickle
import os
from tensorflow.keras.models import save_model

# Create a directory to save all model files
model_dir = "saved_chatbot_model"
os.makedirs(model_dir, exist_ok=True)

# 1. Save the main seq2seq model
save_model(model, os.path.join(model_dir, 'chatbot_model.h5'))

# 2. Save the encoder model separately
save_model(encoder_model, os.path.join(model_dir, 'encoder_model.h5'))

# 3. Save the decoder model separately
save_model(decoder_model, os.path.join(model_dir, 'decoder_model.h5'))

# 4. Save the tokenizer
with open(os.path.join(model_dir, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 5. Save the configuration
config = {
    'MAX_VOCAB_SIZE': MAX_VOCAB_SIZE,
    'EMBEDDING_DIM': EMBEDDING_DIM,
    'LSTM_UNITS': LSTM_UNITS,
    'MAX_LENGTH': MAX_LENGTH
}

with open(os.path.join(model_dir, 'config.pickle'), 'wb') as handle:
    pickle.dump(config, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"All model components saved successfully in '{model_dir}' directory")



All model components saved successfully in 'saved_chatbot_model' directory


In [10]:
# Create Gradio interface
iface = gr.Interface(
    fn=respond,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="Advanced Chatbot",
    description="An LSTM-based chatbot trained on movie dialogues"
)

iface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://713842ce73b1252fe2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

