# Chatbot Using Text Embeddings and Cosine Similarity Matching

## Google Text Embedding Model: textembedding-gecko@001

In [2]:
import os
# Placeholder: set this to the path of your Google Cloud service account
# credentials JSON file before running the cells that call Vertex AI.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ''  # Path to your Google Cloud service account credentials JSON file

# Generating Base Embeddings

### Base Embeddings for the dataset 

In [None]:
from vertexai.language_models import TextEmbeddingModel

In [None]:
def text_embedding(data, batch_size=5):
    """Embed every processed question in *data* and save the result as CSV.

    Questions are read from the ``"Processed Questions"`` column, sent to
    the ``textembedding-gecko@001`` model in batches, and written to
    ``embeddings.csv`` together with their vectors. When *data* also has an
    ``"Answer"`` column it is saved alongside, because the inference code
    looks answers up from the same CSV.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) with a
            ``"Processed Questions"`` column and, optionally, an
            ``"Answer"`` column aligned row by row.
        batch_size: Number of questions sent per ``get_embeddings`` call.
            NOTE(review): the default of 5 presumably matches the model's
            per-request input limit -- confirm against the Vertex AI docs
            before raising it.

    Returns:
        List of embedding vectors, one per embedded question, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Imported locally: the notebook only imports pandas in a later cell, so
    # calling this function right after defining it would otherwise hit a
    # NameError on ``pd``.
    import pandas as pd

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

    questions = data["Processed Questions"].tolist()
    # Answers are optional here so existing callers without the column keep working.
    answers = data["Answer"].tolist() if "Answer" in data else None

    embedding_vectors = []
    question_list = []
    answer_list = []

    for start in range(0, len(questions), batch_size):
        batch = questions[start:start + batch_size]
        embeddings = model.get_embeddings(batch)

        # zip keeps questions, vectors and answers aligned even if the
        # service returns fewer embeddings than requested.
        for offset, (question, embedding) in enumerate(zip(batch, embeddings)):
            embedding_vectors.append(embedding.values)
            question_list.append(question)
            if answers is not None:
                answer_list.append(answers[start + offset])

    columns = {"Question": question_list, "Embedding": embedding_vectors}
    if answers is not None:
        columns["Answer"] = answer_list

    embedding_df = pd.DataFrame(columns)
    embedding_df.to_csv("embeddings.csv", index=False)

    return embedding_vectors

# Inference

### Cosine Similarity with PyTorch (via sentence-transformers)

In [None]:
from sentence_transformers import util

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity

# Read the saved question embeddings back from disk; the inference code
# below also expects an "Answer" column in this file.
embedding_df = pd.read_csv("embeddings.csv")
# Each vector comes back from the CSV as text, so parse it into a Python list.
embedding_df["Embedding"] = embedding_df["Embedding"].map(ast.literal_eval)

In [None]:
# Preview the first rows to confirm the CSV loaded and the vectors parsed.
embedding_df.head()

In [None]:
import spacy
# Load the spaCy "en_core_web_sm" pipeline once; preprocess_text reuses it
# for every question instead of reloading it per call.
nlp = spacy.load("en_core_web_sm")

In [None]:
import numpy as np
from scipy.spatial.distance import cosine

### Preprocessing Data using SpaCy

In [None]:
def preprocess_text(text):
    """Return *text* as space-joined lemmas with punctuation tokens removed.

    The text is run through the module-level spaCy pipeline ``nlp``.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

### Returns the predicted answer: computes the cosine similarity between the user question and each of the base embeddings generated earlier, finds the index with the highest similarity score, and returns the answer stored at that index

In [None]:
def get_most_similar_question(user_question, question_embeddings_df):
    """Return the stored answer whose question best matches *user_question*.

    The question is lemmatized with ``preprocess_text``, embedded with the
    ``textembedding-gecko@001`` model, and compared by cosine similarity
    against every vector in the ``Embedding`` column.

    Args:
        user_question: Raw question text from the user.
        question_embeddings_df: DataFrame with an ``Embedding`` column of
            numeric vectors and an ``Answer`` column aligned row by row.

    Returns:
        The ``Answer`` value of the row with the highest similarity score
        (the first such row when several rows tie).

    Raises:
        ValueError: If ``question_embeddings_df`` has no rows.
    """
    if len(question_embeddings_df) == 0:
        raise ValueError(
            "question_embeddings_df contains no embeddings to compare against"
        )

    cleaned_question = preprocess_text(user_question)
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
    # One text goes in, so exactly one embedding comes back; take its vector.
    query_vector = model.get_embeddings([cleaned_question])[0].values

    # Score every stored vector in a single call (result is 1 x N) instead of
    # re-converting the query vector once per row.
    stored_vectors = question_embeddings_df["Embedding"].tolist()
    scores = util.pytorch_cos_sim(query_vector, stored_vectors)[0].tolist()
    best_position = scores.index(max(scores))

    # Read from the DataFrame that was passed in (not the notebook-global
    # one) and index by position, so a filtered or re-indexed frame still
    # returns the matching row.
    return question_embeddings_df["Answer"].iloc[best_position]

### Evaluation function to test the chatbot: provide the test questions with their ground-truth answers in the same format as the `test_dataset` variable below

In [None]:
import time

def evaluate_chatbot(test_dataset, embedding_df):
    """Measure the chatbot's exact-match accuracy on labelled questions.

    For every ``(question, ground_truth_answer)`` pair the chatbot is
    queried through ``get_most_similar_question``; the elapsed time, the
    question, the response and the expected answer are printed.

    Args:
        test_dataset: Iterable of ``(question, ground_truth_answer)`` pairs.
            Any iterable works; it does not need to support ``len()``.
        embedding_df: DataFrame forwarded to ``get_most_similar_question``.

    Returns:
        Percentage (0-100) of responses equal to their ground-truth answer,
        or ``0.0`` when ``test_dataset`` yields no pairs.
    """
    correct_answers = 0
    total_questions = 0

    for test_question, ground_truth_answer in test_dataset:
        # Count while iterating so generators and other unsized iterables work.
        total_questions += 1

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        response = get_most_similar_question(test_question, embedding_df)
        end_time = time.perf_counter()

        print(f"Time taken: {end_time - start_time}\n")

        if response == ground_truth_answer:
            correct_answers += 1
        else:
            print("\nWrong Answer\n")

        print(f"\n\nQuestion: {test_question}\n")
        print(f"Chatbot's Response: {response}\n")
        print(f"Ground Truth Answer: {ground_truth_answer}\n")

    # An empty evaluation set has no meaningful accuracy; report 0.0 rather
    # than failing with a division by zero.
    if total_questions == 0:
        return 0.0

    return (correct_answers / total_questions) * 100

# Placeholder evaluation set: replace these entries with real
# (question, ground-truth answer) pairs before running.
test_dataset = [
    ("Question1","Answer1"),
    ("Question2", "Answer2")
]


# Run the evaluation against the loaded embeddings and print the percentage
# of responses that exactly match their ground-truth answer.
accuracy = evaluate_chatbot(test_dataset, embedding_df)
print(f"Accuracy: {accuracy}%")