In [1]:
import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from chromadb.config import Settings
import chromadb
import random
import re
import os
import pandas as pd

# Constants
# The Gemini key is read from the GEMINI_API_KEY environment variable so it
# never has to be stored in the notebook; the fallback is the empty string
# (replace it here only for throwaway local runs).
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '')

# Detect device (CUDA if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Experiment settings
CreateNewVectorStore = False  # True: build a new vector store; False: load the existing one
k = 20  # number of documents retrieved per query
dim = 768  # dimension tag used in folder names (and passed to the Nomic model)
ModelName = 'Gte_base'
SuffixDir = 'EnglishSuffix'  # alternative used in the experiments: 'RandomSuffix'
CSVName = 'Sentences_To_Test.csv'  # CSV file with the perturbed sentences
Vectors_df = pd.read_csv(f'./EvalForPaper/{ModelName}/PerturbedEmbedding/{SuffixDir}/{CSVName}')

FolderPerturbedVectorsName = f'{ModelName}{dim}{SuffixDir}'
SavePath = f'./results/GeminiResponse/{ModelName}/{FolderPerturbedVectorsName}'

# Ensure the save directory exists (exist_ok avoids the check-then-create race)
os.makedirs(SavePath, exist_ok=True)

# Print CUDA availability status
print(f"CUDA Available: {torch.cuda.is_available()}")



True


# Embedding Models

In [ ]:
# Embedding Models

class MiniLMEmbeddings:
    """Embeddings using the MiniLM model from HuggingFace.

    Exposes ``embed_documents`` / ``embed_query``, each returning plain Python
    lists of floats (one L2-normalised vector per input text).

    Args:
        model_name: HuggingFace checkpoint to load for tokenizer and model.
        device: Torch device the model and every tokenized input are moved to.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", device="cuda"):
        # Remember the device so inputs always land where the model lives,
        # independent of any module-level ``device`` variable.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.model.eval()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, embedded one at a time."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Tokenize ``text``, run the model and return the pooled, L2-normalised vector."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            model_output = self.model(**inputs)
            sentence_embeddings = self._mean_pooling(model_output, inputs['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy().flatten().tolist()

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings in ``model_output[0]``, weighted by the attention mask."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp keeps the division finite when a mask row sums to zero.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class NomicEmbeddings:
    """Embeddings using the Nomic model.

    The pooled vector is layer-normalised, truncated to its first
    ``embedding_dim`` components and then L2-normalised.

    Args:
        device: Torch device the model and every tokenized input are moved to.
        embedding_dim: Number of leading components kept from the pooled vector.
        matryoshka_dim: Alternative keyword for the same setting; when given
            it takes precedence over ``embedding_dim``.
    """

    def __init__(self, device="cuda", embedding_dim=384, matryoshka_dim=None):
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True).to(device)
        self.model.eval()
        self.device = device
        self.embedding_dim = embedding_dim if matryoshka_dim is None else matryoshka_dim

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, embedded one at a time."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Tokenize ``text``, run the model and return the truncated, normalised vector."""
        # NOTE(review): texts are embedded without any task prefix; confirm
        # against the model card whether one is expected for this checkpoint.
        encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
            # Layer-normalise over the feature axis before cutting the vector down.
            embeddings = F.layer_norm(embeddings, embeddings.size()[1:])
            embeddings = embeddings[:, :self.embedding_dim]
            embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings.cpu().numpy().flatten().tolist()

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings in ``model_output[0]``, weighted by the attention mask."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp keeps the division finite when a mask row sums to zero.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class GteSmallEmbeddings:
    """Embeddings using the GTE-Small model.

    Exposes ``embed_documents`` / ``embed_query``, each returning plain Python
    lists of floats (one L2-normalised vector per input text).

    Args:
        device: Torch device the model and every tokenized input are moved to.
    """

    def __init__(self, device="cuda"):
        # Remember the device so inputs always land where the model lives,
        # independent of any module-level ``device`` variable.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
        self.model = AutoModel.from_pretrained("thenlper/gte-small").to(device)
        self.model.eval()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, embedded one at a time."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Tokenize ``text`` (truncated to 512 tokens) and return the pooled, L2-normalised vector."""
        encoded_input = self.tokenizer(
            text, max_length=512, padding=True, truncation=True, return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output.last_hidden_state, encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy().flatten().tolist()

    def _mean_pooling(self, last_hidden_states, attention_mask):
        """Average the hidden states over the positions where ``attention_mask`` is non-zero."""
        # Zero out masked positions so they do not contribute to the sum.
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


class GteBaseEmbeddings:
    """Embeddings using the GTE-Base model.

    Exposes ``embed_documents`` / ``embed_query``, each returning plain Python
    lists of floats (one L2-normalised vector per input text).

    Args:
        device: Torch device the model and every tokenized input are moved to.
    """

    def __init__(self, device="cuda"):
        # Remember the device so inputs always land where the model lives,
        # independent of any module-level ``device`` variable.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
        self.model = AutoModel.from_pretrained("thenlper/gte-base").to(device)
        self.model.eval()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, embedded one at a time."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Tokenize ``text`` (truncated to 512 tokens) and return the pooled, L2-normalised vector."""
        encoded_input = self.tokenizer(
            text, max_length=512, padding=True, truncation=True, return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output.last_hidden_state, encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy().flatten().tolist()

    def _mean_pooling(self, last_hidden_states, attention_mask):
        """Average the hidden states over the positions where ``attention_mask`` is non-zero."""
        # Zero out masked positions so they do not contribute to the sum.
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


class GteLargeEmbeddings:
    """Embeddings using the GTE-Large model.

    Exposes ``embed_documents`` / ``embed_query``, each returning plain Python
    lists of floats (one L2-normalised vector per input text).

    Args:
        device: Torch device the model and every tokenized input are moved to.
    """

    def __init__(self, device="cuda"):
        # Remember the device so inputs always land where the model lives,
        # independent of any module-level ``device`` variable.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-large")
        self.model = AutoModel.from_pretrained("thenlper/gte-large").to(device)
        self.model.eval()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, embedded one at a time."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Tokenize ``text`` (truncated to 512 tokens) and return the pooled, L2-normalised vector."""
        encoded_input = self.tokenizer(
            text, max_length=512, padding=True, truncation=True, return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output.last_hidden_state, encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy().flatten().tolist()

    def _mean_pooling(self, last_hidden_states, attention_mask):
        """Average the hidden states over the positions where ``attention_mask`` is non-zero."""
        # Zero out masked positions so they do not contribute to the sum.
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


class MpnetEmbeddings:
    """Embeddings using the Mpnet model.

    Exposes ``embed_documents`` / ``embed_query``, each returning plain Python
    lists of floats (one L2-normalised vector per input text).

    Args:
        device: Torch device the model and every tokenized input are moved to.
    """

    def __init__(self, device="cuda"):
        # Remember the device so inputs always land where the model lives,
        # independent of any module-level ``device`` variable.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
        self.model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2').to(device)
        self.model.eval()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text, embedded one at a time."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query text."""
        return self._embed(text)

    def _embed(self, text):
        """Tokenize ``text``, run the model and return the pooled, L2-normalised vector."""
        encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy().flatten().tolist()

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings in ``model_output[0]``, weighted by the attention mask."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp keeps the division finite when a mask row sums to zero.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [ ]:
# Function to load JSON data and ensure documents are unique
def load_data(data_path, num_docs):
    """Load a JSON dataset and return a random, de-duplicated sample of documents.

    Args:
        data_path: Path to a JSON file holding a list of records, each with
            an ``'input'`` (document text) and an ``'output'`` entry.
        num_docs: Number of records to sample; capped at the dataset size.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the record's
        ``'input'`` and whose metadata carries ``'DoctorAnswer'`` (the
        record's ``'output'``) and a sequential ``'index'`` starting at 0.
        Records whose text was already seen are skipped, so the list can be
        shorter than ``num_docs``.
    """
    # JSON is UTF-8 text; do not depend on the platform's default encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    # Randomly sample num_docs documents from the data
    sampled_data = random.sample(data, min(num_docs, len(data)))

    documents = []
    seen_texts = set()
    for item in sampled_data:
        doc_text = item['input']
        if doc_text in seen_texts:
            print('found a duplicate document, skipping...')
            continue
        seen_texts.add(doc_text)
        # The index counts only the documents actually kept, so it stays gap-free.
        doc_metadata = {"DoctorAnswer": item['output'], "index": len(documents)}
        documents.append(Document(page_content=doc_text, metadata=doc_metadata))

    return documents


def load_vector_store(persist_directory, embedding_model, collection_name):
    """Open a Chroma collection configured for cosine distance and return it.

    Args:
        persist_directory: Directory handed to Chroma as its persistence location.
        embedding_model: Object used by Chroma as the embedding function.
        collection_name: Name of the collection to open.

    Returns:
        The constructed ``Chroma`` vector store.
    """
    store = Chroma(
        embedding_function=embedding_model,
        collection_name=collection_name,
        persist_directory=persist_directory,
        collection_metadata={"hnsw:space": "cosine"},
    )
    print("Vector store loaded.")
    return store


def save_vector_store(documents, embedding_model, persist_directory, collection_name):
    """Build a Chroma collection (cosine distance) from ``documents`` and return it.

    Args:
        documents: Documents to embed and add to the collection.
        embedding_model: Object used by Chroma to embed the documents.
        persist_directory: Directory handed to Chroma as its persistence location.
        collection_name: Name of the collection to create.

    Returns:
        The ``Chroma`` vector store produced by ``Chroma.from_documents``.
    """
    store_options = {
        "collection_name": collection_name,
        "persist_directory": persist_directory,
        "collection_metadata": {"hnsw:space": "cosine"},
    }
    store = Chroma.from_documents(documents, embedding_model, **store_options)
    print("Documents added and vector store persisted.")
    return store


In [2]:


# Ensure the save directory exists
os.makedirs(SavePath, exist_ok=True)

# Instantiate the embedding model named by ModelName on the detected device
# (the classes default to "cuda", which fails on CPU-only machines).
# Dim records the embedding size associated with the chosen model.
if ModelName == 'Nomic':
    # NomicEmbeddings takes its output size through `embedding_dim`.
    embedding_model = NomicEmbeddings(device=device, embedding_dim=dim)
    Dim = dim
elif ModelName == 'MiniLM':
    embedding_model = MiniLMEmbeddings(device=device)
    Dim = 384
elif ModelName == 'Gte_small':
    embedding_model = GteSmallEmbeddings(device=device)
    Dim = 384
elif ModelName == 'Gte_base':
    embedding_model = GteBaseEmbeddings(device=device)
    Dim = 768
elif ModelName == 'Gte_large':
    embedding_model = GteLargeEmbeddings(device=device)
    Dim = 1024
elif ModelName == 'Mpnet':
    embedding_model = MpnetEmbeddings(device=device)
    Dim = 768
else:
    # Fail here instead of hitting a NameError on `embedding_model` later on.
    raise ValueError(
        f"Unknown ModelName {ModelName!r}; expected one of "
        "'Nomic', 'MiniLM', 'Gte_small', 'Gte_base', 'Gte_large', 'Mpnet'"
    )
    


In [3]:


# Either reopen the vector store for this model/dimension pair or build it
# from a fresh sample of the dataset, depending on CreateNewVectorStore.
persist_directory = f"./chroma_db{ModelName}{dim}"
collection_name = 'v_db'

if not CreateNewVectorStore:
    db = load_vector_store(persist_directory, embedding_model, collection_name)
else:
    # Save the vector store (run this branch only once)
    data_path = 'Data/HealthCareMagic/HealthCareMagic-100k.json'
    num_docs = 1000

    documents = load_data(data_path, num_docs)
    db = save_vector_store(documents, embedding_model, persist_directory, collection_name)
    print(f'created the vectorstore at {persist_directory}')


Vector store loaded.


In [4]:
# Function to get response from Gemini
def GetLLMResponse(db, query, k, gemini_api_key):
    """Retrieve context for ``query`` and ask Gemini to answer with it.

    Args:
        db: Vector store queried via ``similarity_search_with_relevance_scores``.
        query: The question text; also used as the retrieval query.
        k: Number of documents to retrieve.
        gemini_api_key: API key passed to ``ChatGoogleGenerativeAI``.

    Returns:
        A tuple ``(reply_text, retrieved_docs)`` where ``retrieved_docs`` is
        whatever the similarity search returned.
    """
    prompt_template = "You are a medical QnA bot. You have to answer the following question: {query} \n\n _______________ \n\n Now use the following context to answer the request: {Context} \n\n Answer:"

    retrieved_docs = db.similarity_search_with_relevance_scores(query, k=k)

    # The search result is interpolated as-is, i.e. through its string form.
    filled_prompt = ChatPromptTemplate.from_template(prompt_template).format(
        Context=retrieved_docs,
        query=query,
    )

    # Initialize and invoke the LLM
    gemini = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.1,
        google_api_key=gemini_api_key,
        safety_settings=None,
    )
    answer = gemini.invoke(filled_prompt)

    return answer.content, retrieved_docs




In [5]:
def extract_or_fetch_indexes(text, db, query, k, gemini_api_key):
    """Return the document indexes found in ``text``, asking Gemini as a fallback.

    The regular-expression extractor is tried first; only when it finds
    nothing is Gemini asked to list the index numbers.

    Args:
        text: Text to search for index numbers.
        db: Not used by this function.
        query: Not used by this function.
        k: Not used by this function.
        gemini_api_key: API key passed to ``ChatGoogleGenerativeAI``.

    Returns:
        A list of integer indexes (possibly empty).
    """
    # Cheap path: the regular expressions already found something.
    found = extract_indexes(text)
    if found:
        return found

    print('Extracting indexes using Gemini... as the extracted indexes list is empty')

    # Worked example shown to Gemini so it knows the expected answer format.
    example_text = """
        For example, if the text is:
        {
            "Content": "Some medical content...",
            "Metadata": {
                "DoctorAnswer": "Sample doctor answer...",
                "index": 123
            }
        },
        {
            "Content": "Another piece of content...",
            "Metadata": {
                "DoctorAnswer": "Another answer...",
                "index": 456
            }
        }
        You should return: 123, 456
        """

    gemini_query = f"""Please extract all index numbers from the following text and return them as a comma-separated list. 
        {example_text}
        
        Here is the text you need to analyze:
        {text}
        """

    # Ask Gemini, then turn its comma-separated answer into integers.
    gemini = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.1,
        google_api_key=gemini_api_key,
        safety_settings=None,
    )
    answer = gemini.invoke(gemini_query)
    return parse_gemini_indexes(answer.content)


def extract_indexes(text):
    """Collect the integers that follow an ``'index':`` or ``"index":`` key in ``text``.

    All single-quoted matches come first (in order of appearance), followed
    by all double-quoted matches.

    Args:
        text: String to scan.

    Returns:
        A list of ints; empty when nothing matches.
    """
    # One pattern per quoting style of the key; the number is the capture group.
    key_patterns = (
        r"'index':\s*(\d+)",
        r'"index":\s*(\d+)',
    )

    found = []
    for pattern in key_patterns:
        found.extend(int(number) for number in re.findall(pattern, text))
    return found


def parse_gemini_indexes(gemini_response):
    """Parse a comma-separated list of numbers out of a Gemini reply.

    Args:
        gemini_response: Reply text, expected to look like ``"123, 456"``.

    Returns:
        A list of ints, one per comma-separated piece that consists solely
        of decimal digits after stripping whitespace; other pieces are ignored.
    """
    stripped_parts = (part.strip() for part in gemini_response.split(','))
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with a ValueError.
    return [int(part) for part in stripped_parts if part.isdecimal()]


# Run Gemini Response Experiments

In [None]:


# Column layout shared by the per-row backup CSV and the final results CSV.
ResultColumns = ['Index', 'Query', 'Reply', 'IndexesRetrieved', 'IndexesCosineSimilarity',
                 'NumberOfUniqueIndexesAdded', 'IndexesAddedUnique',
                 'IndexesAddedUniqueCosineSimilarity', 'SetOfIndexes', 'IndexesReplied',
                 'IndexesRepliedCosineSimilarity', 'IndexesDuplicateReplied',
                 'IndexesDuplicatedCount', 'HallucinatedIndexes', 'SetOfIndexesReplied']

Experiment_Df_backup = []
exp1_df = []  # one result row per query; turned into a DataFrame after the loop
setOfIndexes = set()  # every document index retrieved so far, across all queries
setOfIndexesReplied = set()  # every retrieved index that also showed up in a reply

# The input CSV names the query column in one of two ways; pick it once.
QueryColumn = 'Perturbed Sentence' if 'Perturbed Sentence' in Vectors_df.columns else 'perturbed_sentence'

for RowNumber, row in Vectors_df.iterrows():
    print(f'Index: {RowNumber}')
    NumberOfUniqueIndexesAdded = 0
    IndexesRetrieved = []
    IndexesAddedUnique = []
    IndexesAddedUniqueCosineSimilarity = []
    IndexesCosineSimilarity = []
    IndexesReplied = []
    IndexesRepliedCosineSimilarity = []
    IndexesDuplicateReplied = []
    IndexesDuplicatedCount = 0
    HallucinatedIndexes = []

    query = row[QueryColumn]
    reply, retrieved_docs = GetLLMResponse(db, query, k, GEMINI_API_KEY)

    # Each retrieved entry is a (document, score) pair.
    for RetrievedDoc, RelevanceScore in retrieved_docs:
        DocIndex = RetrievedDoc.metadata["index"]
        if DocIndex not in setOfIndexes:
            # First time this document is retrieved by any query.
            NumberOfUniqueIndexesAdded += 1
            setOfIndexes.add(DocIndex)
            IndexesAddedUnique.append(DocIndex)
            IndexesAddedUniqueCosineSimilarity.append(RelevanceScore)

        IndexesRetrieved.append(DocIndex)
        IndexesCosineSimilarity.append(RelevanceScore)

    CurrentIndexListFromReply = extract_indexes(reply)

    for CurrentDocindex in CurrentIndexListFromReply:
        if CurrentDocindex not in IndexesRetrieved:
            # Hallucinated index: mentioned in the reply but never retrieved.
            HallucinatedIndexes.append(CurrentDocindex)
        elif CurrentDocindex not in IndexesReplied:
            # Replied index, first mention: record it with its retrieval score.
            IndexesReplied.append(CurrentDocindex)
            DocCosine = IndexesCosineSimilarity[IndexesRetrieved.index(CurrentDocindex)]
            IndexesRepliedCosineSimilarity.append(DocCosine)
            setOfIndexesReplied.add(CurrentDocindex)
        else:
            # Replied index that the reply had already mentioned.
            IndexesDuplicateReplied.append(CurrentDocindex)
            IndexesDuplicatedCount += 1

    # The cumulative sets are copied so each row keeps its own snapshot.
    exp1_df.append([RowNumber, query, reply, IndexesRetrieved, IndexesCosineSimilarity,
                    NumberOfUniqueIndexesAdded, IndexesAddedUnique, IndexesAddedUniqueCosineSimilarity,
                    setOfIndexes.copy(), IndexesReplied, IndexesRepliedCosineSimilarity,
                    IndexesDuplicateReplied, IndexesDuplicatedCount, HallucinatedIndexes,
                    setOfIndexesReplied.copy()])

    print('----------------------------------------')

    # Rewrite the backup after every query so a crash mid-run loses nothing.
    # Rebuilding from the accumulated rows does not depend on the first row
    # being labelled 0.
    Experiment_Df_backup = pd.DataFrame(exp1_df, columns=ResultColumns)
    Experiment_Df_backup.to_csv(f'{SavePath}/LLM_Reply_Backup.csv', index=False)

exp1_df = pd.DataFrame(exp1_df, columns=ResultColumns)
exp1_df.to_csv(f'{SavePath}/LLM_Reply_Full.csv', index=False)





