# Updating the Embedding Data for the Text-Embedding-Based Chat Bot

In [1]:
import os
import ast
import time
import spacy
import openai
import numpy as np
import pandas as pd 
from sentence_transformers import util
from sklearn.metrics.pairwise import cosine_similarity
from vertexai.language_models import TextEmbeddingModel
# from vertexai.language_models import TextGenerationModel

2023-11-24 17:05:29.333725: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
# Set the API key on the openai module; generate_variations_openai calls
# openai.Completion.create and needs it to be filled in first.
# NOTE(review): avoid committing a real key in the notebook.
openai.api_key = "" # Replace with your OpenAI API key

In [3]:
# Export the credentials file path for this process; presumably read by the
# Vertex AI client used later in the notebook — verify against the GCP setup.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] =  "" # Replace with the path to the GCP credentials JSON file

## Generate Testing Variations from the Question/Answer CSV using the OpenAI API

In [4]:
def generate_variations_openai(prompt, max_tokens=256):
    """Return a reworded variation of ``prompt`` generated by OpenAI.

    The question is wrapped in an instruction asking for a single
    rephrasing with unchanged meaning and sent to the ``text-davinci-003``
    completion model at temperature 0.1.

    Args:
        prompt: The question text to rephrase.
        max_tokens: Upper bound on the number of generated tokens. Passed
            explicitly because the Completions API default (16 tokens) can
            cut a rephrased question off mid-sentence.

    Returns:
        The stripped text of every returned choice, concatenated into one
        string.
    """
    full_prompt = f"Generate a 'single' unique variation for the question below and return another single question without changing the semantics and the meaning, only the wording should be different: \"{prompt}\""

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=full_prompt,
        temperature=0.1,
        max_tokens=max_tokens,
    )

    return ''.join(choice['text'].strip() for choice in response['choices'])

## Generate Testing Variations from the Question/Answer CSV using the GCP Vertex AI API

In [5]:
def generate_variations_vertex(prompt):
    """Return a reworded variation of ``prompt`` generated by Vertex AI.

    The question is wrapped in an instruction asking for a single
    rephrasing with unchanged meaning and sent to the ``text-bison@001``
    text generation model at temperature 0.4.

    Args:
        prompt: The question text to rephrase.

    Returns:
        The model's response text with surrounding whitespace stripped.
    """
    # The module-level import of TextGenerationModel is commented out, so
    # import it here; otherwise calling this function raises NameError.
    from vertexai.language_models import TextGenerationModel

    model = TextGenerationModel.from_pretrained("text-bison@001")

    full_prompt = f"Generate a 'single' unique variation for the question below and return another single question without changing the semantics and the meaning, only the wording should be different: \"{prompt}\""

    response = model.predict(prompt=full_prompt, temperature=0.4)

    return response.text.strip()

## New questions CSV file

In [5]:
# Path of the CSV holding the newly added questions; later cells read its
# 'Question' and 'Answer' columns.
new_question_data_file = 'Chat_Bot_New_Questions.csv'

In [None]:
# Load the new questions into a DataFrame, decoding the file as UTF-8.
df = pd.read_csv(new_question_data_file, encoding='utf-8')

# Preview the first rows.
df.head()

In [None]:
# Display the loaded DataFrame for inspection.
df

## Generating a variation for each new question from a language model API

In [None]:
# Build two parallel lists: each original question followed by its generated
# variation, with the corresponding answer repeated for both entries.
response_list = []
answer_list = []

# Iterate over the column values positionally so the loop does not rely on
# the DataFrame having a default 0..n-1 index.
for i, (question, answer) in enumerate(zip(df['Question'], df['Answer'])):
    response = generate_variations_openai(question)
    print(f"{i}: {response}")  # progress / quick visual check of the variation

    response_list.extend([question, response])
    answer_list.extend([answer, answer])

In [17]:
# Replace df with a frame built from the collected questions (originals and
# their variations) and the matching answers.
df = pd.DataFrame({'Question': response_list, 'Answer': answer_list})
# Show the number of rows in the rebuilt frame.
len(df)

20

In [None]:
# Preview the first rows of the rebuilt question/answer frame.
df.head()

In [19]:
# Show the row count of df again before embedding.
len(df)

20

## Text Embedding for the newly added and generated questions

In [20]:
def text_embedding(data, batch_size=5):
    """Embed the entries of ``data["Question"]`` with a Vertex AI model.

    Questions are sent to ``textembedding-gecko@001`` in consecutive batches
    of ``batch_size``; the vectors returned for each batch are collected in
    order. The starting offset of every batch is printed as a progress
    indicator.

    Args:
        data: Object whose ``"Question"`` item supports ``.tolist()``
            (the notebook passes a pandas DataFrame).
        batch_size: Number of questions per ``get_embeddings`` call.
            Must be at least 1.

    Returns:
        A list containing the ``.values`` of every embedding returned by
        the model, in request order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Validate before loading the model: a negative step would otherwise
    # make range() yield nothing and silently return an empty list.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
    questions = data["Question"].tolist()

    embedding_vectors = []
    for start in range(0, len(questions), batch_size):
        print(start)  # progress: offset of the batch being embedded
        batch = questions[start:start + batch_size]
        embedding_vectors.extend(
            embedding.values for embedding in model.get_embeddings(batch)
        )

    return embedding_vectors

In [21]:
# Embed every question in df using the default batch size.
emb_vec = text_embedding(df)
# Show how many embedding vectors came back.
len(emb_vec)

0
5
10
15


20

In [22]:
# Store the embedding vectors in a new 'Embedding' column of df.
df['Embedding'] = emb_vec

In [None]:
# Display df, now including the 'Embedding' column.
df

## Previous Question / Answer Embedding Data (Yet to Be Updated)

In [30]:
# Path of the existing question/answer embedding CSV that the new rows are
# combined with.
Old_question_data_file = "Question_Variations_Embeddings.csv"

In [None]:
# Load the previously saved embedding data.
embedding_df = pd.read_csv(Old_question_data_file)
# Parse each 'Embedding' cell with ast.literal_eval — presumably the vectors
# were saved as the string form of a Python list; verify against the file.
embedding_df['Embedding'] = embedding_df['Embedding'].apply(ast.literal_eval)
# Display the loaded frame for inspection.
embedding_df

## Concatenating the new dataframe to the previous dataframe to update the whole Embedding Data

In [None]:
# Append the new question/embedding rows to the previously saved data.
# ignore_index=True renumbers the combined rows 0..n-1; without it the new
# rows keep their own 0-based labels and the result has duplicate index
# values, which makes label-based lookups ambiguous.
final_df = pd.concat([embedding_df, df], ignore_index=True)
final_df

In [None]:
# Display the combined DataFrame for a final check before saving.
final_df

## Saving the updated dataframe to CSV

In [34]:
# Write the combined data to a new CSV; index=False leaves the row index out
# of the file.
final_df.to_csv("Question_Variations_Embeddings_Updated.csv", index=False)