# Import Libraries

In [1]:
import numpy as py
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dense, LayerNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Masking

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

from tensorflow.keras.models import load_model 
from tensorflow.keras.preprocessing.sequence import pad_sequences

import random
import os
import pickle
import re

E0000 00:00:1731944664.037317    1153 common_lib.cc:798] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:479
D1118 15:44:24.046089849    1153 config.cc:196]                        gRPC EXPERIMENT call_status_override_on_cancellation   OFF (default:OFF)
D1118 15:44:24.046133545    1153 config.cc:196]                        gRPC EXPERIMENT call_v3                                OFF (default:OFF)
D1118 15:44:24.046141095    1153 config.cc:196]                        gRPC EXPERIMENT canary_client_privacy                  ON  (default:ON)
D1118 15:44:24.046145327    1153 config.cc:196]                        gRPC EXPERIMENT capture_base_context                   ON  (default:ON)
D1118 15:44:24.046147971    1153 config.cc:196]                        gRPC EXPERIMENT client_idleness                        ON  (defa

# Cleaning Data

In [2]:
# Load the three question/answer source datasets.
df_medquad = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')
df_data = pd.read_csv('/kaggle/input/medquad-dataset/ProcessedData.csv')
df_care = pd.read_csv('/kaggle/input/healthcare/train.csv')

# Harmonise the column names to 'question' / 'answer'.
# Non-mutating calls keep every statement safe to re-run on the freshly read frames.
df_data = df_data.rename(columns={'Questions': 'question', 'Answers': 'answer'})
df_care = df_care.rename(columns={'Question': 'question', 'Answer': 'answer'})

# Drop the columns that are not used downstream.
df_medquad = df_medquad.drop(columns=['source', 'focus_area'])
df_data = df_data.drop(columns=['Focus'])
df_care = df_care.drop(columns=['qtype'])

df = pd.concat([df_medquad, df_data, df_care], ignore_index=True)

# Sample of the combined data
print("Data Sample")
print(df.head())

# Null values per column (before cleaning)
print("Null Value Data")
print(df.isnull().sum())

# Count, then remove, fully duplicated rows
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
df = df.drop_duplicates().reset_index(drop=True)

# Table info. DataFrame.info() prints by itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
print("Table Info")
df.info()

# Drop rows with null values and re-number the index so it has no gaps.
df = df.dropna().reset_index(drop=True)

# Null values per column (after cleaning)
print("Null Value Data")
print(df.isnull().sum())

Data Sample
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer  
0  Glaucoma is a group of diseases that can damag...  
1  Nearly 2.7 million people have glaucoma, a lea...  
2  Symptoms of Glaucoma  Glaucoma can develop in ...  
3  Although open-angle glaucoma cannot be cured, ...  
4  Glaucoma is a group of diseases that can damag...  
Null Value Data
question    0
answer      5
dtype: int64
Number of duplicate rows: 20418
Table Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28803 entries, 0 to 28802
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  28803 non-null  object
 1   answer    28798 non-null  object
dtypes: object(2)
memory us

# Data Augmentation

In [3]:
def synonym_replacement(text, n=2):
    """Replace up to ``n`` distinct words of ``text`` with a random synonym.

    Args:
        text: Input string; it is split on whitespace.
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word is replaced by the same synonym. ``n <= 0`` replaces
            nothing.

    Returns:
        The words re-joined with single spaces.
    """
    words = text.split()
    new_words = words.copy()

    # Honour the budget *before* touching anything: previously the check ran
    # only after a replacement, so n=0 still replaced one word.
    if n <= 0:
        return ' '.join(new_words)

    # dict.fromkeys de-duplicates while keeping a stable order, so the shuffle
    # below is reproducible under random.seed() (set() order is not).
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

def get_synonyms(word, lexicon=None):
    """Collect the synonyms of ``word`` from a WordNet-style lexicon.

    Args:
        word: The word to look up.
        lexicon: Optional object exposing ``synsets(word)``, whose items expose
            ``lemmas()``, whose items expose ``name()``. Defaults to the
            module-level ``wordnet``.

    Returns:
        A sorted list of distinct synonym strings, excluding ``word`` itself
        (compared case-insensitively). Underscores in lemma names are replaced
        by spaces so multi-word lemmas read as normal text.
    """
    # NOTE(review): `wordnet` is not imported anywhere in the visible notebook;
    # presumably it is nltk.corpus.wordnet -- confirm and import it up front.
    source = wordnet if lexicon is None else lexicon

    target = word.lower()
    synonyms = set()
    for syn in source.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != target:
                synonyms.add(candidate)
    # Sorted so random.choice() over the result is reproducible under a seed.
    return sorted(synonyms)

def random_deletion(text, p=0.2):
    """Randomly drop words from ``text``, each with probability ``p``.

    Args:
        text: Input string; it is split on whitespace.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined by single spaces. Text with zero or one
        word is returned unchanged. If every word was deleted, one randomly
        chosen original word is returned instead.
    """
    words = text.split()
    # Nothing can be deleted from empty or single-word input. The `<= 1` guard
    # also stops empty text from reaching random.choice([]) below.
    if len(words) <= 1:
        return text

    kept = [word for word in words if random.uniform(0, 1) > p]
    if not kept:
        return random.choice(words)
    return ' '.join(kept)


# Architecting Model

In [4]:
# Data Preprocessing
def preprocess_text(text):
    """Lower-case ``text``, trim its ends and squeeze whitespace runs to one space."""
    return re.sub(r'\s+', ' ', text.lower().strip())

# Normalise both text columns.
# NOTE: these statements update `df` in place, so re-running this cell without
# re-running the cleaning cell would wrap the answers in <start>/<end> twice.
df['question'] = df['question'].apply(preprocess_text)
df['answer'] = df['answer'].apply(preprocess_text)

# Truncate each answer to the text before its first '.' (period re-appended).
df['answer'] = df['answer'].apply(lambda x: x.split('.')[0] + '.' if '.' in x else x)

# Add start and end tokens to answers
df['answer'] = df['answer'].apply(lambda x: f"<start> {x} <end>")

# Train-test split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_questions = train_df['question'].tolist()
train_answers = train_df['answer'].tolist()
val_questions = val_df['question'].tolist()
val_answers = val_df['answer'].tolist()

# Tokenization. filters='' keeps punctuation and the <start>/<end> markers
# intact; both vocabularies are fitted on the training split only.
max_vocab_size = 50000
question_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>", filters='', lower=True)
answer_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>", filters='', lower=True)

question_tokenizer.fit_on_texts(train_questions)
answer_tokenizer.fit_on_texts(train_answers)

# Shift every vocabulary id up by one and register '<pad>' as id 0, rebuilding
# index_word so both lookup directions agree.
for tokenizer in (question_tokenizer, answer_tokenizer):
    tokenizer.word_index = {k: (v + 1) for k, v in tokenizer.word_index.items()}
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word = {v: k for k, v in tokenizer.word_index.items()}

# Convert texts to sequences. These ids come straight from the (shifted)
# word_index and must NOT be shifted again: a second "+1" pass over the
# sequences made the ids the model trains on disagree with word_index /
# index_word, so the pickled tokenizers encoded and decoded the wrong words.
train_questions_seq = question_tokenizer.texts_to_sequences(train_questions)
train_answers_seq = answer_tokenizer.texts_to_sequences(train_answers)
val_questions_seq = question_tokenizer.texts_to_sequences(val_questions)
val_answers_seq = answer_tokenizer.texts_to_sequences(val_answers)

# Model parameters
embedding_dim = 64
latent_dim = 128
max_question_len = max(len(seq) for seq in train_questions_seq)
max_answer_len = max(len(seq) for seq in train_answers_seq)

# Padding (post-padding with 0, the '<pad>' id)
train_questions_padded = pad_sequences(train_questions_seq, maxlen=max_question_len, padding='post')
train_answers_padded = pad_sequences(train_answers_seq, maxlen=max_answer_len, padding='post')
val_questions_padded = pad_sequences(val_questions_seq, maxlen=max_question_len, padding='post')
val_answers_padded = pad_sequences(val_answers_seq, maxlen=max_answer_len, padding='post')

# Prepare decoder input/output for teacher forcing: the target is the input
# shifted left by one position.
train_decoder_input = train_answers_padded[:, :-1]
train_decoder_output = train_answers_padded[:, 1:]
val_decoder_input = val_answers_padded[:, :-1]
val_decoder_output = val_answers_padded[:, 1:]

# Size of the embedding tables and of the softmax output (leaves headroom above
# the tokenizers' num_words cap).
vocab_size = max_vocab_size + 2

# ---------------------------------------------------------------------------
# Encoder: embedding -> LSTM -> LayerNorm -> LSTM (final states feed the decoder)
# ---------------------------------------------------------------------------
encoder_inputs = Input(shape=(max_question_len,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_embedding = Dropout(0.4)(encoder_embedding)
encoder_lstm_1 = LSTM(latent_dim, return_sequences=True, dropout=0.4, recurrent_dropout=0.4,
                      kernel_regularizer=l2(1e-3))(encoder_embedding)
encoder_lstm_1 = LayerNormalization()(encoder_lstm_1)

encoder_lstm_2 = LSTM(latent_dim, return_state=True, dropout=0.4, recurrent_dropout=0.4,
                      kernel_regularizer=l2(1e-3))
encoder_outputs, state_h, state_c = encoder_lstm_2(encoder_lstm_1)
encoder_states = [state_h, state_c]

# ---------------------------------------------------------------------------
# Decoder. Every layer is kept as an object so the single-step inference
# decoder further down can reuse the *trained* weights.
# ---------------------------------------------------------------------------
decoder_inputs = Input(shape=(max_answer_len - 1,))

decoder_embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
decoder_dropout_layer = Dropout(0.4)
# return_state=True exposes the updated (h, c) needed for step-by-step decoding.
decoder_lstm_1_layer = LSTM(latent_dim, return_sequences=True, return_state=True,
                            dropout=0.4, recurrent_dropout=0.4,
                            kernel_regularizer=l2(1e-3))
decoder_norm_1_layer = LayerNormalization()
decoder_lstm_2_layer = LSTM(latent_dim, return_sequences=True,
                            dropout=0.4, recurrent_dropout=0.4,
                            kernel_regularizer=l2(1e-3))
decoder_norm_2_layer = LayerNormalization()
decoder_dense = Dense(vocab_size, activation='softmax', kernel_regularizer=l2(1e-3))

decoder_embedding = decoder_dropout_layer(decoder_embedding_layer(decoder_inputs))
decoder_lstm_1, _, _ = decoder_lstm_1_layer(decoder_embedding, initial_state=encoder_states)
decoder_lstm_1 = decoder_norm_1_layer(decoder_lstm_1)
decoder_lstm_2 = decoder_norm_2_layer(decoder_lstm_2_layer(decoder_lstm_1))
decoder_outputs = decoder_dense(decoder_lstm_2)

# Compile the model
optimizer = Adam(learning_rate=5e-4, clipnorm=1.0)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


class CustomEarlyStopping(EarlyStopping):
    """EarlyStopping that additionally stops once the metric reaches ``target``.

    The regular EarlyStopping behaviour (``patience``, ``restore_best_weights``,
    ...) is preserved because ``on_epoch_end`` delegates to the parent class.

    Args:
        monitor: Name of the logged quantity to watch.
        target: Threshold at which training stops. For metrics where lower is
            better the threshold is reached when ``value <= target``; otherwise
            when ``value >= target``.
        **kwargs: Forwarded unchanged to ``EarlyStopping``.
    """

    def __init__(self, monitor='accuracy', target=0.90, **kwargs):
        super().__init__(monitor=monitor, **kwargs)
        self.target = target
        # Direction of improvement: an explicit mode wins; in 'auto' mode
        # accuracy/AUC-like names are treated as "higher is better" and
        # everything else (e.g. val_loss) as "lower is better".
        mode = kwargs.get('mode', 'auto')
        if mode == 'min':
            self._lower_is_better = True
        elif mode == 'max':
            self._lower_is_better = False
        else:
            self._lower_is_better = not ('acc' in monitor or 'auc' in monitor)

    def on_epoch_end(self, epoch, logs=None):
        # Run the stock patience / best-weights bookkeeping first.
        super().on_epoch_end(epoch, logs)

        current_value = (logs or {}).get(self.monitor)
        if current_value is None:
            return

        if self._lower_is_better:
            reached = current_value <= self.target
        else:
            reached = current_value >= self.target

        if reached:
            print(f"\nEarly stopping: Reached {self.monitor} of {current_value:.4f}, stopping training.")
            self.model.stop_training = True


# Callbacks
checkpoint_path = "seq2seq_checkpoint.weights.h5"
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True, verbose=1)
early_stopping = CustomEarlyStopping(monitor='val_loss', target=0.3, patience=5, restore_best_weights=True, verbose=1)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

# Train the model
history = model.fit(
    [train_questions_padded, train_decoder_input],
    train_decoder_output,
    validation_data=([val_questions_padded, val_decoder_input], val_decoder_output),
    batch_size=32,
    epochs=5,
    callbacks=[checkpoint, early_stopping, lr_scheduler],
    verbose=1
)

# Save tokenizers
with open('question_tokenizer.pkl', 'wb') as handle:
    pickle.dump(question_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('answer_tokenizer.pkl', 'wb') as handle:
    pickle.dump(answer_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load the best model weights (only if a checkpoint was actually written).
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)
else:
    print(f"No checkpoint found at {checkpoint_path}; keeping the final training weights.")

# Save the entire model after training
model.save("seq2seq_model.h5")
print("Entire model saved as seq2seq_model.h5")

# Define and save the encoder model separately
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.save("encoder_model.h5")
print("Encoder model saved as encoder_model.h5")

# ---------------------------------------------------------------------------
# Single-step inference decoder, built from the trained decoder layers.
# Inputs : [token id (1,), state_h, state_c]
# Outputs: [next-token probabilities, updated state_h, updated state_c]
# ---------------------------------------------------------------------------
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Decoder inputs for single time step
decoder_inputs = Input(shape=(1,))

decoder_x = decoder_dropout_layer(decoder_embedding_layer(decoder_inputs))
decoder_x, decoder_state_h, decoder_state_c = decoder_lstm_1_layer(
    decoder_x,
    initial_state=decoder_states_inputs
)
decoder_x = decoder_norm_1_layer(decoder_x)

# NOTE(review): only the first LSTM's state is carried between decoding steps
# (the two-state contract the decoding loop expects). The second LSTM restarts
# from its default initial state at every step, unlike in training where it
# runs over the whole sequence -- exposing its state too needs a 4-state
# interface on both sides.
decoder_x = decoder_norm_2_layer(decoder_lstm_2_layer(decoder_x))

# Dense layer
decoder_outputs = decoder_dense(decoder_x)

# The model must return the UPDATED states, not echo its state inputs.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs, decoder_state_h, decoder_state_c]
)
decoder_model.save("decoder_model.h5")
print("Decoder model saved as decoder_model.h5")

# Evaluate on the validation set
print("\nEvaluation Results:")
model.evaluate(
    [val_questions_padded, val_decoder_input],
    val_decoder_output
)

I0000 00:00:1731944675.158626    1153 service.cc:145] XLA service 0x58feac4594b0 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731944675.158679    1153 service.cc:153]   StreamExecutor device (0): TPU, 2a886c8
I0000 00:00:1731944675.158684    1153 service.cc:153]   StreamExecutor device (1): TPU, 2a886c8
I0000 00:00:1731944675.158687    1153 service.cc:153]   StreamExecutor device (2): TPU, 2a886c8
I0000 00:00:1731944675.158691    1153 service.cc:153]   StreamExecutor device (3): TPU, 2a886c8
I0000 00:00:1731944675.158694    1153 service.cc:153]   StreamExecutor device (4): TPU, 2a886c8
I0000 00:00:1731944675.158696    1153 service.cc:153]   StreamExecutor device (5): TPU, 2a886c8
I0000 00:00:1731944675.158699    1153 service.cc:153]   StreamExecutor device (6): TPU, 2a886c8
I0000 00:00:1731944675.158702    1153 service.cc:153]   StreamExecutor device (7): TPU, 2a886c8


Epoch 1/5


NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/usr/local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/local/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/usr/local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_1153/1355949510.py", line 127, in <module>

  File "/usr/local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

could not find registered transfer manager for platform Host -- check target linkage
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_one_step_on_iterator_11017]

# Testing with User Input

In [None]:
def decode_sequence(input_seq, encoder_model, decoder_model, answer_tokenizer, max_answer_len=50, beam_width=3):
    """Generate an answer for an encoded question using beam search.

    Args:
        input_seq: Padded question batch passed to ``encoder_model.predict``.
        encoder_model: Object whose ``predict`` returns the initial decoder states.
        decoder_model: Object whose ``predict([token] + states)`` returns
            ``[token_probabilities, *updated_states]``.
        answer_tokenizer: Object with ``word_index`` and ``index_word`` dicts.
        max_answer_len: Maximum hypothesis length in tokens, start token included.
        beam_width: Number of hypotheses kept per step.

    Returns:
        The decoded words of the lowest-cost hypothesis joined by spaces, or a
        fixed apology string when no hypothesis exists.
    """
    fallback = "I apologize, but I couldn't generate a proper response."
    special_tokens = {'<start>', '<end>', '<pad>', '<OOV>'}

    # Token ids are loop-invariant: look them up once.
    start_token_id = answer_tokenizer.word_index.get('<start>', 1)
    end_token_id = answer_tokenizer.word_index.get('<end>', 2)

    # Get encoder states
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Each live hypothesis: (token ids, accumulated negative log-prob, states)
    sequences = [([start_token_id], 0.0, list(states_value))]
    finished_sequences = []

    # Beam search
    while sequences and len(finished_sequences) < beam_width:
        new_sequences = []

        for seq, score, current_states in sequences:
            if len(seq) > max_answer_len:
                finished_sequences.append((seq, score))
                continue

            # Feed only the last token; context lives in the recurrent states.
            target_seq = py.array([seq[-1]]).reshape(1, 1)
            outputs = decoder_model.predict(
                [target_seq] + list(current_states),
                verbose=0
            )
            output_tokens, next_states = outputs[0], list(outputs[1:])

            # Top-k next tokens, most probable first
            last_token_probs = output_tokens[0, -1, :]
            top_k_indices = last_token_probs.argsort()[-beam_width:][::-1]

            for i in top_k_indices:
                token_id = int(i)
                new_seq = seq + [token_id]
                # Lower score is better (sum of negative log-probabilities).
                new_score = score - py.log(last_token_probs[token_id] + 1e-8)

                if token_id == end_token_id or len(new_seq) >= max_answer_len:
                    finished_sequences.append((new_seq, new_score))
                else:
                    new_sequences.append((new_seq, new_score, next_states))

        # Keep top beam_width sequences
        sequences = sorted(new_sequences, key=lambda x: x[1])[:beam_width]

    # If no sequence finished, take the ongoing ones
    if not finished_sequences:
        finished_sequences = [(seq, score) for seq, score, _ in sequences]
    if not finished_sequences:
        return fallback

    best_seq, _ = min(finished_sequences, key=lambda x: x[1])

    # Skip only the leading start token. A hypothesis cut off by the length
    # limit does not end in <end>, so slicing off the last element would drop a
    # real word; special tokens are filtered by name instead. Ids missing from
    # index_word are skipped rather than emitted as empty words.
    decoded_tokens = []
    for token_id in best_seq[1:]:
        word = answer_tokenizer.index_word.get(token_id, '')
        if word and word not in special_tokens:
            decoded_tokens.append(word)

    return ' '.join(decoded_tokens)

def preprocess_input(question, question_tokenizer, max_question_len):
    """Clean a raw question and convert it into a padded id sequence.

    Args:
        question: Raw question text.
        question_tokenizer: Tokenizer exposing ``texts_to_sequences``.
        max_question_len: Length the sequence is padded (post) to.

    Returns:
        The result of ``pad_sequences`` for the single cleaned question.
    """
    # Text cleaning: lower-case, drop every character that is not a word
    # character, whitespace or '?', then collapse whitespace runs.
    # NOTE(review): the training-side `preprocess_text` does not strip
    # punctuation, so tokens seen here may differ from the training
    # vocabulary -- confirm which cleaning is intended.
    question = question.lower().strip()
    question = re.sub(r'[^\w\s?]', '', question)  # Keep question marks
    question = re.sub(r'\s+', ' ', question)

    # Tokenize; the leftover debug print of the id sequence was removed so
    # callers do not get token ids on stdout for every query.
    question_seq = question_tokenizer.texts_to_sequences([question])
    return pad_sequences(question_seq, maxlen=max_question_len, padding='post')

def initialize_chatbot(encoder_path, decoder_path, question_tokenizer_path, answer_tokenizer_path):
    """Load the two inference models and the two pickled tokenizers.

    Returns:
        ``(encoder_model, decoder_model, question_tokenizer, answer_tokenizer)``.

    Raises:
        Exception: Any failure while loading, re-raised with the message prefix
            "Error initializing chatbot: ".
    """
    def _unpickle(path):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # tokenizer files you created yourself.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    try:
        # Models first, then tokenizers
        loaded_models = [load_model(path) for path in (encoder_path, decoder_path)]
        loaded_tokenizers = [_unpickle(path) for path in (question_tokenizer_path, answer_tokenizer_path)]
        return (*loaded_models, *loaded_tokenizers)
    except Exception as e:
        raise Exception(f"Error initializing chatbot: {str(e)}")

def get_response(question, encoder_model, decoder_model, question_tokenizer, answer_tokenizer, max_question_len=50, beam_width=3, max_answer_len=50):
    """Answer ``question`` with the seq2seq chatbot.

    Args:
        question: Raw user question.
        encoder_model: Inference encoder passed to ``decode_sequence``.
        decoder_model: Inference decoder passed to ``decode_sequence``.
        question_tokenizer: Tokenizer used to encode the question.
        answer_tokenizer: Tokenizer used to decode the answer.
        max_question_len: Length the encoded question is padded to.
        beam_width: Beam width forwarded to ``decode_sequence``.
        max_answer_len: Maximum answer length forwarded to ``decode_sequence``
            (previously hard-coded to 50).

    Returns:
        The generated answer, capitalised and ending in punctuation; a fixed
        apology if nothing was generated; or "An error occurred: ..." if any
        step raised (errors are reported to the user rather than propagated).
    """
    fallback = "I apologize, but I couldn't generate a proper response."
    try:
        # Preprocess input
        preprocessed_input = preprocess_input(question, question_tokenizer, max_question_len)

        # Generate response
        response = decode_sequence(
            preprocessed_input,
            encoder_model,
            decoder_model,
            answer_tokenizer,
            max_answer_len=max_answer_len,
            beam_width=beam_width
        )

        # After strip() an all-whitespace answer is simply empty.
        response = response.strip()
        if not response:
            return fallback

        # Add period if missing
        if not response.endswith(('.', '?', '!')):
            response += '.'

        # Capitalize first letter
        return response[0].upper() + response[1:]

    except Exception as e:
        return f"An error occurred: {str(e)}"

# Reload the exported inference artefacts from disk.
encoder_model, decoder_model, question_tokenizer, answer_tokenizer = initialize_chatbot(
    'encoder_model.h5',
    'decoder_model.h5',
    'question_tokenizer.pkl',
    'answer_tokenizer.pkl'
)

# Take the padded question length from the loaded encoder itself instead of the
# `max_question_len` global of the training cell, so this cell also works in a
# fresh kernel where training was not re-run.
inference_question_len = encoder_model.input_shape[1]

# Get response
question = "How to prevent Glaucoma ?"
response = get_response(
    question,
    encoder_model,
    decoder_model,
    question_tokenizer,
    answer_tokenizer,
    max_question_len=inference_question_len
)
print(f"Question: {question}")
print(f"Response: {response}")
