In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

A. Basic Rule -based ChatBot using Python NLTK 

In [1]:

# Step 2: Import Libraries
import nltk
from nltk.chat.util import Chat, reflections

# Step 3: Define Rules (Predefined pairs)
pairs = [
    (r"my name is (.*)", ["Hello %1, How are you today?"]),
    (r"hi|hey|hello", ["Hello", "Hey there"]),
    (r"what is your name?", ["I am a bot created by [Your Name]."]),
    (r"how are you?", ["I'm doing good. How about you?"]),
    (r"sorry (.*)", ["No problem", "It's okay", "You don't need to be sorry"]),
    (r"quit", ["Bye! Take care."])
]

# Step 4: Create the Chatbot
def chatbot():
    print("Hi, I'm the chatbot you created. Type 'quit' to exit.") 
    chat = Chat(pairs, reflections)
    chat.converse()
    
# Step 5: Run the Chatbot
if __name__ == "__main__":
    chatbot()


Hi, I'm the chatbot you created. Type 'quit' to exit.


> My name is ramya


Hello ramya, How are you today?


> I'm doing good


None


> sorry


None


> quit


Bye! Take care.


B. Building a Chatbot using seq2Seq models 

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_data(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"Error reading the file: {e}")
        return []

    conversations = []
    for line in lines:
        line_parts = line.strip().split(' +++$+++ ')
        if len(line_parts) == 5:
            conversations.append(line_parts[4])  # Store only the dialogue part

    print(f"Loaded {len(conversations)} conversations.")  # Debug info
    return conversations

def create_pairs(conversations):
    input_texts = []
    target_texts = []

    for i in range(len(conversations) - 1):
        input_text = conversations[i]
        target_text = conversations[i + 1]
        target_text = '\t' + target_text + '\n'  # Add start and end tokens
        input_texts.append(input_text)
        target_texts.append(target_text)

    print(f"Created {len(input_texts)} input-target pairs.")  # Debug info
    return input_texts, target_texts

conversations = load_data('/kaggle/input/movie-dialogs/movie_lines.txt')  # Make sure this file exists
input_texts, target_texts = create_pairs(conversations)

if not input_texts or not target_texts:
    raise ValueError("No input or target texts were created. Please check the dataset.")

input_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

max_encoder_seq_length = max(len(seq) for seq in input_sequences) if input_sequences else 0
max_decoder_seq_length = max(len(seq) for seq in target_sequences) if target_sequences else 0

encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

decoder_output_data = np.zeros((len(target_sequences), max_decoder_seq_length, len(target_tokenizer.word_index) + 1), dtype='float32')

for i, seq in enumerate(target_sequences):
    for t, word_idx in enumerate(seq):
        if t > 0:
            decoder_output_data[i, t - 1, word_idx] = 1.0

num_encoder_tokens = len(input_tokenizer.word_index) + 1
num_decoder_tokens = len(target_tokenizer.word_index) + 1

encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=256)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=64, epochs=100)

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

def decode_sequence(input_seq):
    
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index['\t']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample the next token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_tokenizer.index_word.get(sampled_token_index, '')
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence and states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()  # Trim any extra whitespace

def chat():
    print("Chatbot is ready! Type 'quit' to exit.")
    while True:
        input_text = input("You: ")
        if input_text.lower() == 'quit':
            print("Exiting the chat. Goodbye!")
            break

        input_sequence = input_tokenizer.texts_to_sequences([input_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_encoder_seq_length, padding='post')
        response = decode_sequence(input_sequence)
        print(f"Bot: {response}")

if __name__ == "__main__":
    chat()


Loaded 304446 conversations.
Created 304445 input-target pairs.


C. Conversational AI with Transformer Based models 

In [3]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load your dataset (example with a CSV file)
data = pd.read_csv('/kaggle/input/opensubtitles-6-languages-for-jigsaw-2020/opensub/opensub_fr.csv')  # Replace with your dataset path
conversations = data[['en', 'lang_text']]  # Adjust column names based on your dataset

# Initialize the tokenizer and model (using GPT-2 for this example)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Data Preprocessing
def preprocess_data(conversations):
    inputs = ["Question: " + q for q in conversations['en'].tolist()]
    outputs = ["Answer: " + a for a in conversations['lang_text'].tolist()]
    return inputs, outputs

inputs, outputs = preprocess_data(conversations)

# Create input-output pairs for training
train_data = list(zip(inputs, outputs))

# Tokenization
train_encodings = tokenizer(inputs, truncation=True, padding=True, max_length=50)
train_labels = tokenizer(outputs, truncation=True, padding=True, max_length=50)

# Prepare dataset for Trainer
class ChatDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

# Create dataset object
train_dataset = ChatDataset(train_encodings, train_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Generating responses
def generate_response(question):
    input_ids = tokenizer.encode("Question: " + question, return_tensors='pt')
    output = model.generate(input_ids, max_length=50, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.replace("Question: ", "").replace("Answer: ", "").strip()

# Sample Input and Output
sample_input = "What's the weather like today?"
expected_output = generate_response(sample_input)

print(f"Input: {sample_input}")
print(f"Expected Output: {expected_output}")



ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.