In [None]:
!pip install tensorflow==2.16.1
!pip install keras==3.1.1

Collecting tensorflow==2.16.1
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow==2.16.1)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow==2.16.1)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.17,>=2.16 (from tensorflow==2.16.1)
  Downloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m43.8 MB/

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import string
import re
import unicodedata
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import Callback
# Read the question/answer corpus and keep only its first 2000 rows.
data = pd.read_csv("chat_health.csv").head(2000)

# Define function to convert unicode to ASCII
def unicode_to_ascii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with Unicode NFD normalization and every code
    point whose category is ``Mn`` (nonspacing mark) is dropped. Characters
    that are not nonspacing marks are kept unchanged, so the result is not
    guaranteed to be pure ASCII.

    Args:
        s: Text to process.

    Returns:
        The decomposed text without nonspacing marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Define text cleaning function
def clean_text(text):
    """Normalize a raw sentence before tokenization.

    Steps, in order: lower-case and strip the text, remove accents via
    ``unicode_to_ascii``, expand common English contractions, delete
    punctuation, turn every remaining non-word character into a space, and
    finally drop any token that contains a digit (together with the
    whitespace that follows it).

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    text = unicode_to_ascii(text.lower().strip())

    # Applied strictly in this order: the specific forms ("won't", "can't")
    # must run before the generic "n't" rule, which in turn must run before
    # the "n'" -> "ng" rule.
    rewrites = (
        (r"i'm", "i am"),
        (r"\r", ""),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        # Fixed: this rule previously expanded "what's" to "that is".
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # Strip punctuation, then map any other non-word character to a space.
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"(\W)", " ", text)
    # Raw string avoids the invalid-escape warning of the former '\S*\d...'.
    text = re.sub(r'\S*\d\S*\s*', '', text)
    return text

# Run both text columns through the shared cleaning routine.
data["short_question"] = data["short_question"].apply(clean_text)
data["short_answer"] = data["short_answer"].apply(clean_text)

# Text preprocessing
# One vocabulary is fitted over questions and answers together.
corpus = data['short_question'].tolist() + data['short_answer'].tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One more than the number of known words (decoding later skips index 0).
vocab_size = len(tokenizer.word_index) + 1

# Encode each sentence as a list of word indices.
questions_seq = tokenizer.texts_to_sequences(data['short_question'])
answers_seq = tokenizer.texts_to_sequences(data['short_answer'])

# Pad everything (at the end) to the length of the longest sequence on either side.
longest_question = max(map(len, questions_seq))
longest_answer = max(map(len, answers_seq))
max_length = max(longest_question, longest_answer)
questions_padded = pad_sequences(questions_seq, maxlen=max_length, padding='post')
answers_padded = pad_sequences(answers_seq, maxlen=max_length, padding='post')

# Hold out 10% of the padded pairs for validation; the fixed seed makes the
# split repeatable.
(train_questions, val_questions,
 train_answers, val_answers) = train_test_split(
    questions_padded,
    answers_padded,
    test_size=0.1,
    random_state=42,
)

# Batch size shared by the input pipelines below and by the
# steps_per_epoch / validation_steps computation passed to model.fit.
# It is defined once, before its first use, so the two cannot drift apart
# (the pipelines previously hard-coded 16 independently of this variable).
batch_size = 16

# Prepare training and validation datasets
# Both pipelines repeat indefinitely, so model.fit needs explicit step counts.
train_dataset = tf.data.Dataset.from_tensor_slices((train_questions, train_answers))
train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size).repeat()

val_dataset = tf.data.Dataset.from_tensor_slices((val_questions, val_answers))
val_dataset = val_dataset.batch(batch_size).repeat()

# Build model
# Two stacked bidirectional GRUs that return the full sequence, followed by a
# softmax over the vocabulary at every time step.
model = Sequential([
    Embedding(vocab_size, 128),
    Bidirectional(GRU(256, return_sequences=True)),
    Dropout(0.5),
    BatchNormalization(),
    Bidirectional(GRU(256, return_sequences=True)),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax')
])

# Sparse cross-entropy: the targets are integer word indices, not one-hot vectors.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
class CombinedStoppingCallback(Callback):
    """Stop training based on the training loss, using two criteria.

    Training is stopped at the end of an epoch when either:

    1. the epoch's ``loss`` is below ``loss_threshold``, or
    2. ``patience`` epochs have passed since the loss last improved on the
       best recorded value by more than ``improvement_threshold``.

    Args:
        loss_threshold: Stop as soon as the loss falls below this value.
        improvement_threshold: Minimum decrease relative to the best recorded
            loss for an epoch to count as an improvement.
        patience: Number of epochs without such an improvement after which
            training is stopped.
    """

    def __init__(self, loss_threshold=0.01, improvement_threshold=0.01, patience=200):
        super().__init__()
        self.loss_threshold = loss_threshold
        self.improvement_threshold = improvement_threshold
        self.patience = patience
        self.best_loss = float('inf')
        self.best_epoch = 0

    def on_train_begin(self, logs=None):
        """Reset the tracking state so the instance can be reused across fits."""
        self.best_loss = float('inf')
        self.best_epoch = 0

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate both stopping criteria against this epoch's ``loss``."""
        # `logs` may be None; treat that the same as a missing 'loss' entry.
        current_loss = (logs or {}).get('loss')
        if current_loss is None:
            return

        # Check if the loss has reached below the threshold
        if current_loss < self.loss_threshold:
            print(f"\nEpoch {epoch}: Stopping training as loss {current_loss} is below threshold {self.loss_threshold}.")
            self.model.stop_training = True
            return

        # Only a decrease larger than improvement_threshold resets the patience
        # window. Previously any decrease, however small, reset it, and the
        # threshold comparison lived in a branch where it was always true, so
        # improvement_threshold had no effect.
        if self.best_loss - current_loss > self.improvement_threshold:
            self.best_loss = current_loss
            self.best_epoch = epoch
        elif (epoch - self.best_epoch) >= self.patience:
            print(f"\nEpoch {epoch}: No significant improvement in loss for {self.patience} epochs. Stopping training.")
            self.model.stop_training = True

# Stopping rule driven by the training loss.
stopping_callback = CombinedStoppingCallback(
    loss_threshold=0.1,
    improvement_threshold=0.1,
    patience=30,
)

# The datasets repeat indefinitely, so the length of an epoch is set
# explicitly; at least one step is always run.
train_steps = max(1, len(train_questions) // batch_size)
val_steps = max(1, len(val_questions) // batch_size)

# Train and keep the per-epoch metrics for later plotting.
history = model.fit(
    train_dataset,
    epochs=2000,
    steps_per_epoch=train_steps,
    validation_data=val_dataset,
    validation_steps=val_steps,
    callbacks=[stopping_callback],
)

Epoch 1/2000
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 1s/step - accuracy: 0.4597 - loss: 5.1581 - val_accuracy: 0.5034 - val_loss: 4.0792
Epoch 2/2000
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 1s/step - accuracy: 0.5208 - loss: 3.6227 - val_accuracy: 0.5009 - val_loss: 3.8926
Epoch 3/2000
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 1s/step - accuracy: 0.5164 - loss: 3.6170 - val_accuracy: 0.5010 - val_loss: 3.8380
Epoch 4/2000
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 1s/step - accuracy: 0.5320 - loss: 3.4931 - val_accuracy: 0.5046 - val_loss: 3.8539
Epoch 5/2000
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 2s/step - accuracy: 0.5173 - loss: 3.5264 - val_accuracy: 0.5089 - val_loss: 3.8326
Epoch 6/2000
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 2s/step - accuracy: 0.5309 - loss: 3.4101 - val_accuracy: 0.5061 - val_loss: 3.9118
Epoch 7/20

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
# Persist the trained parameters (weights only, not the architecture).
model.save_weights('GRU_2000.weights.h5')

# Persist the fitted tokenizer next to the weights.
import pickle
tokenizer_bytes = pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer_GRU_2000.pickle', 'wb') as tokenizer_file:
    tokenizer_file.write(tokenizer_bytes)

In [None]:
from google.colab import files

# Download each saved artifact through the google.colab files helper.
for artifact_name in ('GRU_2000.weights.h5', 'tokenizer_GRU_2000.pickle'):
    files.download(artifact_name)

In [None]:
# Display the vocabulary size computed during tokenization.
vocab_size

In [None]:
# Display the padded sequence length shared by questions and answers.
max_length

In [None]:
from tensorflow.keras.models import load_model
import pickle

# Rebuild the same layer stack that was used for training: the weights file
# stores parameters only, so the architecture must be recreated before loading.
model1 = Sequential([
    Embedding(vocab_size, 128),
    Bidirectional(GRU(256, return_sequences=True)),
    Dropout(0.5),
    BatchNormalization(),
    Bidirectional(GRU(256, return_sequences=True)),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax')
])

model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Build with the padded input length before loading the weights.
model1.build(input_shape=(None, max_length))

# Load the file written by model.save_weights(...) earlier in this notebook.
# The previous name, 'my_model_weights.weights.h5', is never created here, so
# a fresh Restart & Run All failed at this line.
model1.load_weights('GRU_2000.weights.h5')
model1.summary()

In [None]:
# generate answer
def generate_answer(question,model):
    """Generate an answer string for ``question`` with ``model``.

    The question is normalized with ``clean_text`` (the same routine applied
    to the training text), encoded with the module-level ``tokenizer``, padded
    to ``max_length`` and passed to ``model.predict``. At every position the
    most probable word index is taken; index 0 is skipped and the remaining
    indices are mapped back to words.

    Args:
        question: Raw question text.
        model: Model exposing ``predict`` that returns per-position scores
            over the vocabulary.

    Returns:
        The predicted words joined by single spaces (possibly an empty string).
    """
    # Without this step, contractions and similar forms that were rewritten
    # in the training text would not match the tokenizer's vocabulary.
    cleaned_question = clean_text(question)
    question_seq = tokenizer.texts_to_sequences([cleaned_question])
    question_padded = pad_sequences(question_seq, maxlen=max_length, padding='post')
    # verbose=0 suppresses the progress bar otherwise printed on every call.
    prediction = model.predict(question_padded, verbose=0)
    # [0]: the batch holds a single question.
    predicted_indices = np.argmax(prediction, axis=-1)[0]
    predicted_words = ' '.join([tokenizer.index_word[i] for i in predicted_indices if i != 0])
    return predicted_words

In [None]:
# Evaluation sample: the first 50 rows of the same CSV that is used for training.
# NOTE(review): the training cell reads the first 2000 rows of this file, so
# these rows overlap with the data the model was fitted on and are not a
# held-out test set — confirm this is intended.
testData = pd.read_csv('chat_health.csv').head(50)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Generate one answer per evaluation question with the reloaded model.
testData['generated_answer'] = testData['short_question'].apply(lambda q: generate_answer(q, model1))

# corpus_bleu expects, per sample, a list of tokenized references and one
# tokenized candidate. The references are normalized with clean_text first:
# generated answers contain only words from the cleaned vocabulary, so raw
# reference tokens (original case, attached punctuation) could not match them.
references = testData['short_answer'].apply(lambda a: [clean_text(a).split()]).tolist()
candidates = testData['generated_answer'].apply(lambda a: a.split()).tolist()

bleu_score = corpus_bleu(references, candidates)
print("BLEU Score:", bleu_score)

In [None]:
import time

# Time answer generation over (at most) the first 50 evaluation questions.
timed_questions = testData['short_question'].head(50)

# perf_counter is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
testData['generated_answer'] = timed_questions.apply(lambda q: generate_answer(q, model1))
end_time = time.perf_counter()
response_time = end_time - start_time

# Average over the number of questions actually processed rather than a
# hard-coded 50, which is wrong whenever fewer rows are available.
record_count = len(timed_questions)
average_response_time = response_time / record_count if record_count else float('nan')

print(f"Average response time per record: {average_response_time} seconds")

In [None]:
import matplotlib.pyplot as plt

# One figure per metric:
# (train key, validation key, train label, validation label, y label, title)
curve_specs = (
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss', 'Loss Curves'),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy', 'Learning Curves'),
)

for train_key, val_key, train_label, val_label, y_label, figure_title in curve_specs:
    # Training and validation curves share one figure.
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.title(figure_title)
    plt.show()