<a href="https://colab.research.google.com/github/Pratik-Nikam/FAQ_Model_tf/blob/main/FAQ_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import keras

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split



In [4]:
!pwd
!ls

/content
sample_data


In [4]:
# Read the data from the text file
import json
file_path = "/content/sample_data/HDFC_Faq.txt"
with open(file_path, "r") as file:
    data = file.read()

parsed_data = json.loads(data)

In [None]:
parsed_data

In [5]:
formatted_data = [{"question": item["question"], "answer": item["answer"]} for item in parsed_data]


In [14]:

# Data preparation
def prepare_data(faqs, max_length=100):
    questions = [faq["question"] for faq in faqs]
    answers = [faq["answer"] for faq in faqs]

    # Create and fit tokenizer for questions
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(questions + answers)

    # Convert text to sequences
    question_sequences = tokenizer.texts_to_sequences(questions)
    answer_sequences = tokenizer.texts_to_sequences(answers)

    # Pad sequences
    X = pad_sequences(question_sequences, maxlen=max_length)
    y = pad_sequences(answer_sequences, maxlen=max_length)

    return X, y, tokenizer


In [20]:
# Model architecture
def create_model(vocab_size, max_length=100, embedding_dim=100):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(max_length, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

    return model
X, y, tokenizer = prepare_data(formatted_data)

In [18]:


def train_faq_model(X, y, tokenizer, epochs=50):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create model
    vocab_size = len(tokenizer.word_index) + 1
    model = create_model(vocab_size)

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        validation_data=(X_test, y_test),
        batch_size=32
    )

    return model, history


In [21]:
model, history = train_faq_model(X, y, tokenizer)


Epoch 1/50




[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 220ms/step - accuracy: 0.0187 - loss: -157.7616 - val_accuracy: 0.0915 - val_loss: -1774.0793
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 204ms/step - accuracy: 0.0248 - loss: -3030.8865 - val_accuracy: 0.0223 - val_loss: -8635.7451
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 221ms/step - accuracy: 0.0494 - loss: -10728.5967 - val_accuracy: 0.0223 - val_loss: -21104.8242
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 203ms/step - accuracy: 0.0536 - loss: -24823.7129 - val_accuracy: 0.0223 - val_loss: -39272.4453
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 212ms/step - accuracy: 0.0555 - loss: -45023.8711 - val_accuracy: 0.0915 - val_loss: -63067.8125
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 205ms/step - accuracy: 0.0487 - loss: -65626.1094 - val_accuracy: 0.0915 - 

In [22]:
def get_answer(question, model, tokenizer, max_length=100):
    # Prepare question
    question_seq = tokenizer.texts_to_sequences([question])
    question_padded = pad_sequences(question_seq, maxlen=max_length)

    # Get prediction
    pred_seq = model.predict(question_padded)

    # Convert prediction back to text
    pred_text = []
    for idx in pred_seq[0]:
        for word, index in tokenizer.word_index.items():
            if index == int(idx):
                pred_text.append(word)

    return " ".join(pred_text)


In [23]:
new_question = "Can Chip Credit cards be used anywhere?"
answer = get_answer(new_question, model, tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


In [24]:
answer

'the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the'

In [29]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
import numpy as np
import json
from sklearn.model_selection import train_test_split

class FAQDataset:
    def __init__(self, questions, answers, batch_size=16):
        self.questions = questions
        self.answers = answers
        self.batch_size = batch_size

    def create_dataset(self):
        dataset = tf.data.Dataset.from_tensor_slices((
            self.questions,
            self.answers
        ))
        dataset = dataset.shuffle(1000).batch(self.batch_size)
        return dataset

class QAModel(keras.Model):
    def __init__(self):
        super(QAModel, self).__init__()
        # Using BERT preprocessor and encoder from TF Hub
        self.preprocessor = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
        self.encoder = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
            trainable=True)

        # Dense layers for similarity computation
        self.dense = keras.layers.Dense(768, activation='relu')
        self.similarity = keras.layers.Dot(axes=1, normalize=True)

    def call(self, inputs):
        question, answer = inputs

        # Preprocess questions and answers
        question_preprocessed = self.preprocessor(question)
        answer_preprocessed = self.preprocessor(answer)

        # Get BERT embeddings
        question_embeddings = self.encoder(question_preprocessed)['pooled_output']
        answer_embeddings = self.encoder(answer_preprocessed)['pooled_output']

        # Process through dense layer
        question_features = self.dense(question_embeddings)
        answer_features = self.dense(answer_embeddings)

        # Calculate similarity
        similarity = self.similarity([question_features, answer_features])

        return similarity

def prepare_data(data):
    # Load data
    # with open(data_path, 'r') as f:
    #     data = json.load(f)

    questions = [item['question'] for item in data]
    answers = [item['answer'] for item in data]

    # Split data
    q_train, q_val, a_train, a_val = train_test_split(
        questions, answers, test_size=0.2, random_state=42
    )

    return (q_train, a_train), (q_val, a_val)

def train_model(data_path, epochs=10, batch_size=16, learning_rate=2e-5):
    # Prepare data
    (q_train, a_train), (q_val, a_val) = prepare_data(data_path)

    # Create datasets
    train_dataset = FAQDataset(q_train, a_train, batch_size).create_dataset()
    val_dataset = FAQDataset(q_val, a_val, batch_size).create_dataset()

    # Initialize model
    model = QAModel()

    # Compile model
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    loss = keras.losses.MeanSquaredError()
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Define target (all ones since we want high similarity for matching pairs)
    def generate_ones(x, y):
        batch_size_x = tf.shape(x)[0]

        return (x, y), tf.ones(shape=(batch_size_x,))

    train_dataset = train_dataset.map(generate_ones)
    val_dataset = val_dataset.map(generate_ones)

    # Train model
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        verbose=1
    )

    return model, history

def find_best_answer(model, question, answer_bank):
    best_score = -1
    best_answer = None

    # Convert question to list for batch processing
    question_batch = [question] * len(answer_bank)

    # Get similarities for all answers at once
    similarities = model.predict((question_batch, answer_bank))

    # Find best match
    best_idx = np.argmax(similarities)
    best_score = similarities[best_idx]
    best_answer = answer_bank[best_idx]

    return best_answer, float(best_score)

# Function to save the model
def save_qa_model(model, save_path):
    model.save(save_path)

# Function to load the model
def load_qa_model(load_path):
    return tf.keras.models.load_model(load_path, custom_objects={'QAModel': QAModel})


In [8]:
data = formatted_data[:1000]

In [9]:
len(data)

1000

In [10]:
type(data)


list

In [12]:
json_string = json.dumps(data)

In [17]:
json_string

'[{"question": "How do I change my password?", "answer": "After you have logged in, you can change your password using the \\"Change password\\" option in the top part of the screen. You have to type your current password and the new password you have chosen in their respective boxes."}, {"question": "When will I receive my changed ATM PIN?", "answer": "You will receive your new ATM PIN by post within 10 days from when your request has been submitted."}, {"question": "Can I get my newly generated PIN online?", "answer": "No, for security reasons we send you your ATM PIN only by post."}, {"question": "How can I register for Autopay?", "answer": "To register for Autopay: Step 1: Click on the \\"Autopay register\\" link on the left side margin. Step 2: Select the Credit Card number that you want to register for the Autopay facility and your HDFC Bank account number from which you want your Credit Card payments to be made. Step 3: If you want the full amount of your statement to be paid fr

In [30]:
model, history = train_model(data)

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m911s[0m 18s/step - accuracy: 0.9973 - loss: 0.0120 - val_accuracy: 0.9950 - val_loss: 0.0053
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m910s[0m 18s/step - accuracy: 0.9979 - loss: 0.0037 - val_accuracy: 0.9950 - val_loss: 0.0033
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m887s[0m 17s/step - accuracy: 0.9979 - loss: 0.0027 - val_accuracy: 0.9950 - val_loss: 0.0024
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m845s[0m 17s/step - accuracy: 0.9995 - loss: 0.0015 - val_accuracy: 0.9950 - val_loss: 0.0019
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m929s[0m 18s/step - accuracy: 0.9993 - loss: 0.0012 - val_accuracy: 1.0000 - val_loss: 0.0015
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m853s[0m 17s/step - accuracy: 0.9998 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 0.0013
Epoch 7/10
[1m50/50[0m [3

In [34]:
save_qa_model(model, '/content/Models/qa_model.keras')

In [35]:
q = "When will I receive my changed ATM PIN?"

In [36]:
answer_bank = [item['answer'] for item in data]

In [37]:
len(answer_bank)

1000

In [40]:
question_batch = [q] * len(answer_bank)

In [42]:
question_batch = np.array(question_batch)

In [43]:
answer_batch = np.array(answer_bank)

In [45]:
similarities = model.predict((question_batch, answer_batch))

ValueError: Invalid dtype: str1248

In [46]:
question_batch = tf.constant([q] * len(answer_bank))  # Convert to TensorFlow tensor
answer_batch = tf.constant(answer_bank)  # Convert to TensorFlow tensor
similarities = model.predict((question_batch, answer_batch))

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m871s[0m 27s/step


In [None]:
similarities


In [48]:
best_idx = np.argmax(similarities)

In [49]:
best_score = similarities[best_idx]
best_answer = answer_bank[best_idx]

In [50]:
best_idx

816

In [51]:
best_score


array([0.9956048], dtype=float32)

In [52]:
best_answer


'The interest rate on the SSY scheme will be notified by the Ministry of Finance from time to time.'