# Chatbot Project with Seq2Seq Model

## Step 1: Install and Import Libraries

In [7]:
#%pip install tensorflow

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

print("Libraries imported successfully!")

Libraries imported successfully!


## Step 2: Load and Preprocess Data

In [3]:
# Read the processed conversation pairs and replace missing values in both
# text columns with empty strings.
conversation_pairs = pd.read_csv('data/processed_conversation_pairs.csv').assign(
    input=lambda frame: frame['input'].fillna(''),
    output=lambda frame: frame['output'].fillna(''),
)

#### Load your conversation pairs

In [3]:
# NOTE(review): this reads the same CSV that is loaded into `conversation_pairs`;
# the preprocessing call further down uses `conversation_pairs`, not this frame.
conversation_pairs_df = pd.read_csv('data/processed_conversation_pairs.csv')

# Replace missing values in both text columns with empty strings.
text_columns = conversation_pairs_df[['input', 'output']].fillna('')
conversation_pairs_df['input'] = text_columns['input']
conversation_pairs_df['output'] = text_columns['output']
del text_columns

In [4]:
# Hyperparameters shared by the preprocessing and model-building cells
VOCAB_SIZE = 5000     # tokenizer `num_words`; also the embedding input size and softmax width
MAX_LEN = 30          # `maxlen` used when padding sequences and the model's input length
EMBEDDING_DIM = 300   # output dimension of both embedding layers
LATENT_DIM = 256      # number of units in the encoder and decoder LSTM layers

### Data Preprocessing Module

In [5]:
def preprocess_text(conversation_pairs, vocab_size, max_len):
    """Tokenize and pad the 'input' and 'output' text columns.

    Args:
        conversation_pairs: Table with 'input' and 'output' columns of text
            (for example a pandas DataFrame).
        vocab_size: Passed to ``Tokenizer`` as ``num_words``.
        max_len: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(input_sequences, output_sequences, tokenizer)``: the two
        post-padded integer sequence arrays and the tokenizer fitted on the
        texts of both columns.
    """
    input_texts = list(conversation_pairs['input'])
    output_texts = list(conversation_pairs['output'])

    # Fit one shared vocabulary on every text from both columns. The texts are
    # joined as one longer list of documents: adding the two columns
    # element-wise would concatenate each input with its output without a
    # separator and fuse the boundary words into a single bogus token.
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(input_texts + output_texts)

    # Convert text to integer sequences
    input_sequences = tokenizer.texts_to_sequences(input_texts)
    output_sequences = tokenizer.texts_to_sequences(output_texts)

    # Pad at the end of each sequence up to max_len
    input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='post')
    output_sequences = pad_sequences(output_sequences, maxlen=max_len, padding='post')

    return input_sequences, output_sequences, tokenizer

#### Preprocess the data

In [8]:
# Tokenize and pad both text columns; the fitted tokenizer is kept for later use
input_sequences, output_sequences, tokenizer = preprocess_text(
    conversation_pairs,
    vocab_size=VOCAB_SIZE,
    max_len=MAX_LEN,
)

#### Split into training and validation sets

In [9]:
# Hold out 20% of the pairs for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    input_sequences,
    output_sequences,
    test_size=0.2,
    random_state=42,
)

print("Data preprocessing complete!")

Data preprocessing complete!


## Step 3: Build the Seq2Seq Model

In [10]:
def build_seq2seq_model(vocab_size, embedding_dim, latent_dim, max_len):
    """Build and compile an LSTM encoder-decoder model.

    Args:
        vocab_size: Input dimension of both embedding layers and the number of
            units in the softmax output layer.
        embedding_dim: Output dimension of both embedding layers.
        latent_dim: Number of units in the encoder and decoder LSTM layers.
        max_len: Length of each of the two integer input sequences.

    Returns:
        A compiled model taking ``[encoder_inputs, decoder_inputs]`` and
        producing a softmax over ``vocab_size`` for every decoder time step.
    """
    # Encoder: embed the tokens and keep only the LSTM's final states.
    enc_tokens = Input(shape=(max_len,))
    enc_embedded = Embedding(vocab_size, embedding_dim, mask_zero=True)(enc_tokens)
    enc_recurrent = LSTM(latent_dim, return_state=True)
    _, hidden_state, cell_state = enc_recurrent(enc_embedded)
    encoder_states = [hidden_state, cell_state]

    # Decoder: embed its own tokens and start the LSTM from the encoder states.
    dec_tokens = Input(shape=(max_len,))
    dec_embedded = Embedding(vocab_size, embedding_dim, mask_zero=True)(dec_tokens)
    dec_recurrent = LSTM(latent_dim, return_sequences=True, return_state=True)
    dec_hidden_seq, _, _ = dec_recurrent(dec_embedded, initial_state=encoder_states)

    # Per-time-step distribution over the vocabulary.
    token_probs = Dense(vocab_size, activation='softmax')(dec_hidden_seq)

    seq2seq = Model([enc_tokens, dec_tokens], token_probs)
    seq2seq.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return seq2seq

### Build the model

In [11]:
# Instantiate the network with the notebook-level hyperparameters and list its layers
model = build_seq2seq_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    latent_dim=LATENT_DIM,
    max_len=MAX_LEN,
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 30, 300)      1500000     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 30, 300)      1500000     ['input_2[0][0]']                
                                                                                              

## Step 4: Train the Model

In [13]:
def train_model(model, X_train, y_train, X_val, y_val, batch_size=64, epochs=10):
    """Fit ``model`` on the training split and validate on the held-out split.

    ``model.fit`` is called with a two-element input list in which the same
    array appears twice: ``[X_train, X_train]`` for training and
    ``[X_val, X_val]`` for validation.

    NOTE(review): the second model input therefore receives the source
    sequences rather than anything derived from the targets - confirm this is
    intended.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train: Training inputs, used for both model inputs.
        y_train: Training targets.
        X_val: Validation inputs, used for both model inputs.
        y_val: Validation targets.
        batch_size: Forwarded to ``model.fit``.
        epochs: Forwarded to ``model.fit``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    train_inputs = [X_train, X_train]
    val_inputs = [X_val, X_val]
    fit_options = {
        'validation_data': (val_inputs, y_val),
        'batch_size': batch_size,
        'epochs': epochs,
    }
    return model.fit(train_inputs, y_train, **fit_options)

### Train the model

In [None]:
# Train with train_model's default batch size and epoch count
history = train_model(
    model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

print("Model training complete!")

Epoch 1/10

## Step 5: Save the Model

In [None]:
def save_model(model, model_path):
    """Save ``model`` to ``model_path`` and print a confirmation line.

    Args:
        model: Object exposing a ``save(path)`` method.
        model_path: Destination handed to ``model.save`` unchanged.
    """
    model.save(model_path)
    confirmation = f"Model saved to {model_path}"
    print(confirmation)

### Save the trained model

In [None]:
# Save the trained model under this file name in the working directory
save_model(model, model_path='chatbot_seq2seq_model.h5')

## Step 6: Interact with the Chatbot

In [None]:
def generate_response(input_text, model, tokenizer, max_len):
    """Produce a reply string for ``input_text`` from a single model prediction.

    Args:
        input_text: The user's message.
        model: Object whose ``predict`` accepts a two-element input list and
            returns per-position scores over token ids.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` id-to-word mapping.
        max_len: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The predicted words joined by single spaces. Positions whose predicted
        id is 0 or has no entry in ``tokenizer.index_word`` are omitted.
    """
    encoded = tokenizer.texts_to_sequences([input_text])
    encoded = pad_sequences(encoded, maxlen=max_len, padding='post')

    # The same padded sequence is supplied to both model inputs.
    scores = model.predict([encoded, encoded])
    predicted_ids = np.argmax(scores[0], axis=-1)

    # Convert ids back to words. The argmax can land on an id that the
    # tokenizer never assigned to a word; such ids are skipped instead of
    # raising KeyError on the index_word lookup.
    words = []
    for token_id in predicted_ids:
        if token_id > 0:
            word = tokenizer.index_word.get(int(token_id))
            if word is not None:
                words.append(word)
    return ' '.join(words)

### Query

In [None]:
# Send one sample message through the model and print the reply
user_input = "Hello! How are you?"
response = generate_response(
    user_input,
    model=model,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
print(f"Bot: {response}")