# **Generative Search: Development and Evaluation**

## **Install packages**

In [None]:
# Colab setup: install the libraries this notebook imports. `-q` keeps pip output short,
# `-U` upgrades to the newest release available at run time.
# NOTE(review): only torch is pinned; the `-U` installs are unpinned, so a fresh run may pull
# newer transformers / langchain releases than the ones this notebook was written against —
# consider pinning versions.
# NOTE(review): `%pip` is generally preferred over `!pip` in notebooks so the install targets
# the running kernel's environment — confirm before switching.
!pip install -q -U sentence-transformers
!pip install -q torch==2.1.0
!pip install -q -U transformers
!pip install -q accelerate
!pip install -q -U langchain

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Import packages**

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import transformers
import torch
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from sklearn.metrics.pairwise import cosine_similarity
import warnings

warnings.filterwarnings('ignore')

## **Load Models**

In [None]:
# Text-generation pipeline for the Dolly v2 3B model.
# trust_remote_code=True allows code shipped with the model repo to run locally (the download
# log shows instruct_pipeline.py being fetched).
# NOTE(review): the download log recommends pinning a revision to avoid pulling new code.
dolly_pipeline_kwargs = dict(
    model="databricks/dolly-v2-3b",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    temperature=0.25,
    max_time=30,
    device_map="auto",
    return_full_text=True,
)
generate_text = pipeline(**dolly_pipeline_kwargs)

# Prompt layout: the user's instruction first, then the retrieved answer as "Input:" context.
prompt_with_context = PromptTemplate(
    template="{instruction}\n\nInput:\n{context}",
    input_variables=["instruction", "context"],
)

# Wrap the transformers pipeline so LangChain can drive it as an LLM.
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

# Chain that fills the prompt template and sends it to the wrapped pipeline.
llm_context_chain = LLMChain(prompt=prompt_with_context, llm=hf_pipeline)

# Sentence-embedding model used to encode questions and responses (MiniLM).
model_minilm = SentenceTransformer('all-MiniLM-L6-v2')

config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

instruct_pipeline.py:   0%|          | 0.00/9.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/databricks/dolly-v2-3b:
- instruct_pipeline.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## **Read Embedded Q&A dataset**

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV can be read from it (Colab-only).
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Function: Read dataset
def read_data(csv_path='/content/drive/MyDrive/Colab Notebooks/TIS_Embedded_Q&A.csv'):
  """Load the embedded Q&A dataset and decode its stored embeddings.

  Args:
    csv_path: Location of the CSV file. It must contain an 'embedded_questions'
      column holding JSON-encoded lists of numbers and a 'response' column.
      Defaults to the notebook's Google Drive path, so existing zero-argument
      calls keep working.

  Returns:
    A tuple ``(df, embedded_questions, responses)``:
      * df: the DataFrame, with 'embedded_questions' decoded to float32 arrays;
      * embedded_questions: all embeddings stacked into one NumPy array, one row
        per question (assumes every embedding has the same length);
      * responses: NumPy array of the 'response' column, row-aligned with
        embedded_questions.

  Raises:
    FileNotFoundError: if csv_path does not exist.
    KeyError: if a required column is missing.
  """
  df = pd.read_csv(csv_path)

  # Embeddings are stored as JSON strings in the CSV; decode each one back into a float32 vector
  df['embedded_questions'] = df['embedded_questions'].apply(
      lambda raw: np.array(json.loads(raw), dtype=np.float32))

  # Convert the embedded questions and the responses to arrays
  embedded_questions = np.array(df['embedded_questions'].to_list())
  responses = np.array(df['response'].to_list())

  return df, embedded_questions, responses

# Load the dataset once; embedded_questions and responses are read as globals by find_top_response.
df, embedded_questions, responses = read_data()

Mounted at /content/drive


## **Online Generative Search**

In [None]:
# Function: Embed input text
def embed_input_text(input_text):
    """Encode a single string with the global ``model_minilm`` sentence transformer.

    The text is wrapped in a one-element list before encoding, so the result is a
    batch containing one embedding (callers index it as ``[0]`` or pass it straight
    to ``cosine_similarity``).
    """
    single_item_batch = [input_text]
    return model_minilm.encode(single_item_batch)


# Function: Cosine similarity check to find top_response
def find_top_response(input_text_embedded, threshold=0.7):
    """Return the stored response whose question is most similar to the input.

    Args:
        input_text_embedded: Embedding of the user's text as a one-row 2-D array
            (the shape produced by ``embed_input_text``).
        threshold: Minimum cosine similarity, exclusive, that the best match must
            exceed to be accepted. Defaults to 0.7, the value previously hard-coded.

    Returns:
        The entry of the global ``responses`` array aligned with the best-matching
        row of the global ``embedded_questions``, or the sentinel string
        "No response found" when no similarity exceeds ``threshold``.
    """
    # One row of scores: similarity of the input against every stored question
    scores = cosine_similarity(input_text_embedded, embedded_questions)[0]

    # argmax returns the first index of the maximum, matching the tie-breaking of
    # max() over the filtered indices; if the maximum fails the threshold, nothing passes.
    top_index = int(np.argmax(scores))

    if scores[top_index] > threshold:
        # Return the corresponding response
        return responses[top_index]
    return "No response found"


# Function: Generative search
def generative_search(input_text):
    """Answer ``input_text`` using the closest stored response as LLM context.

    The input is embedded and matched against the stored questions. When a match is
    found, the LLM chain is prompted with the input as the instruction and the matched
    response as context, and its output is returned with leading whitespace stripped.
    When no match is found, a fixed out-of-scope message is returned instead.
    """
    # Retrieve the best stored response for the embedded input
    top_response = find_top_response(embed_input_text(input_text))

    # A match exists: let the LLM phrase the answer from the retrieved context
    if top_response != "No response found":
        generated = llm_context_chain.predict(instruction=input_text, context=top_response)
        return generated.lstrip()

    # No sufficiently similar question in the dataset
    return "I'm sorry, but I don't have expertise in this topic; my training data is limited to only “CS 410 Text Information Systems”."

In [None]:
# Run generative search on one example question; the cell displays the returned answer string
generative_search('Can you explain how do random variables play a role in the understanding of queries and documents?')

'Random variables provide a way to model the inherent uncertainty in the content and intent of queries and documents, offering a probabilistic perspective. For example, the probability that a given query will match a given set of documents can be estimated by sampling the random variables corresponding to the set of documents and the query. The estimated probability can then be normalized to provide a score that reflects the relative importance of the query.'

## **Results Evaluation**

### **Evaluation Sample Set Creation**

In [None]:
# First half of the evaluation set: 25 question/response pairs sampled from the dataset
# (fixed random_state so the same rows are drawn on every run)
test_sample_df = (
    df[['question', 'response']]
    .sample(n=25, random_state=32)
    .reset_index(drop=True)
)

# Second half: 25 hand-written questions whose expected response is the fixed
# out-of-scope message returned by generative_search when nothing matches
data_dict = {
    'question': ['How can I order food?',
                 'How is the weather today?',
                 'What is Transmission Control Protocol?',
                 'If you could visit any place in the world, where would you go?',
                 'Can you explain what is the difference between MBA and iMBA?',
                 'What is computer science?',
                 'What is your go-to comfort food?',
                 'If you could master one skill instantly, what would it be?',
                 'What is the most interesting fact you know?',
                 'If you had a time machine, would you go to the past or the future?',
                 'Who is Leonardo Da Vinci?',
                 'How much is MacBook Air M2?',
                 'Can you explain what is strategic management?',
                 'If you could have any animal as a pet, what would it be?',
                 'What are the four seasons?',
                 'What is the Capital of Spain?',
                 'What is a skill you wish you had but have not had the time to learn?',
                 'How is the Master in Computer Science in University of Illinois Urbana Champaign?',
                 'I want to watch a movie, what do you recommend?',
                 'How can I publish a paper about NLP?', 'Should I study the investment subject?',
                 'What is a goal you have set for yourself recently?',
                 'If you could meet any celebrity, who would it be?',
                 'What is your favorite sport?',
                 'What is image analytics?'],
}
# Size the expected responses from the question list so the two columns cannot drift apart
data_dict['response'] = ["I'm sorry, but I don't have expertise in this topic; my training data is limited to only “CS 410 Text Information Systems”."] * len(data_dict['question'])

temp_df = pd.DataFrame(data_dict)

# Append the hand-written rows after the sampled ones.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
test_sample_df = pd.concat([test_sample_df, temp_df], ignore_index=True)

### **Quantitative Evaluation**

In [None]:
%%time
# Generate an answer for every evaluation question (generative_search is called once per row)
test_sample_df['generated_response'] = test_sample_df['question'].apply(generative_search)

CPU times: user 54.2 s, sys: 2.73 s, total: 56.9 s
Wall time: 54.3 s


In [None]:
# Fixed message generative_search returns when no stored question is similar enough
OUT_OF_SCOPE_RESPONSE = "I'm sorry, but I don't have expertise in this topic; my training data is limited to only “CS 410 Text Information Systems”."


def _score_generated_response(expected, generated):
  """Score one generated answer against its expected response.

  When the expected response is the out-of-scope message, the score is an exact-match
  flag: 1 if the generated answer is that same message, otherwise 0.
  For every other row, the score is the cosine similarity between the embeddings of
  the two texts, rounded to 3 decimals and returned as a plain Python float.
  """
  if expected == OUT_OF_SCOPE_RESPONSE:
    return 1 if generated == OUT_OF_SCOPE_RESPONSE else 0

  similarity = cosine_similarity(embed_input_text(expected), embed_input_text(generated))
  return round(float(similarity[0][0]), 3)


# Build the whole column in one assignment. The previous chained form
# (df[col].iloc[i] = ...) writes through an intermediate Series, which raises
# SettingWithCopyWarning and does not update the frame under pandas Copy-on-Write.
# dtype=object keeps the 0/1 flags as ints next to the float similarities.
test_sample_df['cosine_similarity'] = pd.Series(
    [
        _score_generated_response(expected, generated)
        for expected, generated in zip(test_sample_df['response'],
                                       test_sample_df['generated_response'])
    ],
    index=test_sample_df.index,
    dtype=object,
)


In [None]:
# Save the scored evaluation table to Drive (row index omitted from the file)
test_sample_df.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/Quantitative_Evaluation.csv',
    index=False,
)

### **Qualitative Evaluation**

In [None]:
# Build the manual-review sheet: the texts to compare plus an empty
# 'is_generated_response_relevant?' column for a human reviewer to fill in
test_sample_qual_df = test_sample_df[['question', 'response', 'generated_response']].assign(
    **{'is_generated_response_relevant?': None}
)

# Export to csv
test_sample_qual_df.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/Qualitative_Evaluation.csv',
    index=False,
)