# Pipeline for Hyperparameter Selection

**TABLE OF CONTENTS**
1. Imports
2. Auxiliary functions for retrieval and chains
3. Retrieval
    * a) Loading the embedding for vectorstores
    * b) Loading the vectorstores  
4. Hyperparameter selection
    * a) Hyperparameters
    * b) Documenting the retrieval
    * c) Inference

## 1. Imports

In [1]:
from huggingface_hub import login
login(token = hf_logging_token)  # UPDATE THE TOKEN FOR RUNNING THE NOTEBOOK
#from langchain.chains import RetrievalQA, LLMChain
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.vectorstores import FAISS
import os
os.environ['OPENAI_API_KEY'] = openAI_API_key  # UPDATE THE KEY BEFORE RUNNING THE NOTEBOOK 
import pickle
import time
from transformers import AutoModel 

  from .autonotebook import tqdm as notebook_tqdm


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/Onema/.cache/huggingface/token
Login successful


## 2. Auxiliary functions for retrieval and chains

In [2]:
# Auxiliary functions for the chain

def pretty_metadata(metadata_dict):
    """Format a chunk's metadata as a one-line, human-readable citation.

    Args:
        metadata_dict: Mapping with the keys "autor", "source", "edition",
            "date" and "in-text location".

    Returns:
        A string of the form
        "<autor>, <source>, ed. <edition> (<date>), <in-text location>".

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    # Every field goes through f-string formatting. Edition and date were
    # always converted to text; doing the same for the remaining fields means
    # a non-string value (e.g. a numeric location) no longer raises TypeError.
    return (
        f'{metadata_dict["autor"]}, {metadata_dict["source"]}, '
        f'ed. {metadata_dict["edition"]} ({metadata_dict["date"]}), '
        f'{metadata_dict["in-text location"]}'
    )

# NOTE: The following three functions are essentially variations of one another

def format_docs_no_print(docs):  # taken from: https://python.langchain.com/docs/use_cases/question_answering/sources
    """Concatenate the text of the given documents, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        All ``page_content`` values joined with "\\n\\n"; metadata is ignored.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

def format_docs_and_print(docs):
    """Print each retrieved document with its metadata and return the joined text.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        The ``page_content`` values joined with "\\n\\n". The metadata is only
        printed; it is not part of the returned string.
    """
    print("Retrieved context (metadata are not passed to the model):\n")
    contents = []
    for document in docs:
        text = document.page_content
        metadata_line = "Metadata: " + pretty_metadata(document.metadata)
        # shown to the user only; the model receives just the text
        print(text + "\n" + metadata_line + "\n")
        contents.append(text)
    return "\n\n".join(contents)

def format_docs_for_documentation(docs):
    """Render documents together with their metadata for the documentation files.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        One entry per document (text, a "Metadata: ..." line, and a trailing
        newline), with the entries joined by "\\n\\n". Nothing is printed.
    """
    entries = [
        document.page_content + "\n" + "Metadata: " + pretty_metadata(document.metadata) + "\n"
        for document in docs
    ]
    return "\n\n".join(entries)

## 3. Retrieval 

### a) Loading the embedding for vectorstores

In [3]:
# Hugging Face model id of the embedding model, defined once and reused below
model_name = "jinaai/jina-embeddings-v2-base-de"
model_kwargs = {'device': 'cpu'}  # run the embedding model on the CPU

# NOTE(review): `model` is not used anywhere else in the visible notebook;
# presumably this call only fetches the model with trust_remote_code enabled
# before the LangChain wrapper loads it. Confirm before removing.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Embedding function that is passed to the FAISS vectorstores
hf_jina = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  return self.fget.__get__(instance, owner)()


### b) Loading the vectorstores

In [4]:
dir_path = "../data/vectorDB"

# Load the two FAISS indexes with the embedding function `hf_jina`.
# allow_dangerous_deserialization=True lets LangChain unpickle the stored
# index, so only load index files that come from a trusted source.
db_700_without_reg = FAISS.load_local(
    f"{dir_path}/faiss_vecDB_700_without_reg",
    hf_jina,
    allow_dangerous_deserialization=True,
)
db_1500_without_reg = FAISS.load_local(
    f"{dir_path}/faiss_vecDB_1500_without_reg",
    hf_jina,
    allow_dangerous_deserialization=True,
)

## 4. Hyperparameter selection

### a) Hyperparameters

In [5]:
# name of the chat model used for generation
model_name = "gpt-3.5-turbo-0125"

# hyperparameter grid: 3 temperatures x 2 vectorstores x 3 values of k
# = 18 model versions
selected_temps = [0, 0.2, 0.5]
vectorstore_ids = [700, 1500]  # integers 700 & 1500 act as ids of the vectorstores
no_retrieved_chunks = [3, 5, 7]

# questions for hyperparameter evaluation as (question number, question text);
# the numbering is the same as in the notebook for embedding evaluation
questions = [
    (2, "Which animals are the most beautiful?"),
    (3, "Which body parts of birds make them capable of flight?"),
    (5, "Is there a connection between climate and animal coloration?"),
]

# directory that receives the documentation files
save_dir = "../data/results/hyperparameter_selection"


### b) Documenting the retrieval

In [9]:
# Prepare one txt file per (question, vectorstore) pair for documenting the
# retrieval. Mode "w" truncates a file left over from an earlier run, so every
# run starts from a file that contains only the header line.
os.makedirs(save_dir, exist_ok=True)  # open() does not create missing directories
for i, question in questions:
    for vectorstore_id in vectorstore_ids:
        # explicit UTF-8 so the files do not depend on the platform's default encoding
        with open(f"{save_dir}/context_{vectorstore_id}_Q{i}.txt", "w", encoding="utf-8") as f:
            f.write(f"QUESTION {i}: {question} (using db_{vectorstore_id}_without_reg)\n\n")

In [10]:
# Retrieve the context for every question with each combination of k and
# chunk size, and append it to the matching documentation file.

# chunk-size id -> vectorstore built with that chunk size
vectorstores = {700: db_700_without_reg, 1500: db_1500_without_reg}

# Fail before any retrieval is done. Previously an unknown id only printed a
# message and the loop carried on with the retriever of the previous
# iteration (or crashed with a NameError on the very first one).
unknown_ids = [vs_id for vs_id in vectorstore_ids if vs_id not in vectorstores]
if unknown_ids:
    raise ValueError(f"Error. Retriever is not specified for vectorstore ids {unknown_ids}")

# hyperparameter: number of retrieved chunks
for n in no_retrieved_chunks:

    # hyperparameter: length of chunks
    for vectorstore_id in vectorstore_ids:
        retriever = vectorstores[vectorstore_id].as_retriever(search_kwargs={"k": n})

        for i, question in questions:

            # retrieve
            identified_docs = retriever.get_relevant_documents(question)

            # explicit UTF-8: the retrieved text may contain non-ASCII characters
            with open(f"{save_dir}/context_{vectorstore_id}_Q{i}.txt", "a", encoding="utf-8") as f:
                f.write(f"\n============================================\nk = {n}\n\n")
                f.write(format_docs_for_documentation(identified_docs))


### c) Inference

In [6]:
# Prompt with two placeholders: {question} and {context}.
# NOTE(review): "say you that you don't know" looks like a typo, but the text
# is left untouched here because changing the prompt changes what the model
# receives.
template = (
    "Answer the QUESTION based solely on the provided CONTEXT. Explain your answer. "
    "If CONTEXT doesn't contain sufficient information to answer the question, "
    "say you that you don't know the answer.\n"
    "QUESTION: {question}\n"
    "CONTEXT: {context}\n"
    "Answer: "
)

prompt = PromptTemplate.from_template(template)

In [7]:
# Prepare one txt file per question for documenting the inference. Mode "w"
# truncates a file left over from an earlier run, so every run starts from a
# file that contains only the header line.
os.makedirs(save_dir, exist_ok=True)  # open() does not create missing directories
for i, question in questions:
    # explicit UTF-8 so the files do not depend on the platform's default encoding
    with open(f"{save_dir}/generation_Q{i}.txt", "w", encoding="utf-8") as f:
        f.write(f"QUESTION {i}: {question}\n\n")

In [30]:
# Run the RAG chain for every combination of temperature, number of retrieved
# chunks and chunk size, and append each answer to the file of its question.

# chunk-size id -> vectorstore built with that chunk size
vectorstores = {700: db_700_without_reg, 1500: db_1500_without_reg}

# Fail before any API call is made. Previously an unknown id only printed a
# message and the loop carried on with the retriever of the previous
# iteration, so the answers would have been documented under the wrong
# chunk size.
unknown_ids = [vs_id for vs_id in vectorstore_ids if vs_id not in vectorstores]
if unknown_ids:
    raise ValueError(f"Error. Retriever is not specified for vectorstore ids {unknown_ids}")

# hyperparameter: temperature
for temp in selected_temps:

    llm = ChatOpenAI(model_name=model_name, temperature=temp, max_tokens=400)   # 400 tokens is about 300 words

    # hyperparameter: number of retrieved chunks
    for n in no_retrieved_chunks:

        # hyperparameter: length of chunks
        for vectorstore_id in vectorstore_ids:
            retriever = vectorstores[vectorstore_id].as_retriever(search_kwargs={"k": n})

            # RAG chain: the question is passed through unchanged and is also
            # sent to the retriever, whose documents are joined into the context
            chain = (
                {"context": retriever | format_docs_no_print, "question": RunnablePassthrough()}
                | prompt
                | llm
            )

            # identical for all questions of this combination, so built once
            hp_details = f"{model_name}, temperature = {temp}, chunk size = {vectorstore_id}, no. retrieved chunks = {n}"

            # try the model given its hyperparameters for each of the questions
            for i, question in questions:
                inference = chain.invoke(question)

                # document the result (explicit UTF-8: answers may contain
                # non-ASCII characters)
                with open(f"{save_dir}/generation_Q{i}.txt", "a", encoding="utf-8") as f:
                    f.write(hp_details)
                    f.write("\n")
                    f.write(inference.pretty_repr())
                    f.write("\n\n\n\n")

                print("Processed ", hp_details)
                time.sleep(30)  # avoid too many requests of the API
                                    

Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 700, no. retrieved chunks = 3
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 700, no. retrieved chunks = 3
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 700, no. retrieved chunks = 3
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 1500, no. retrieved chunks = 3
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 1500, no. retrieved chunks = 3
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 1500, no. retrieved chunks = 3
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 700, no. retrieved chunks = 5
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 700, no. retrieved chunks = 5
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 700, no. retrieved chunks = 5
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 1500, no. retrieved chunks = 5
Processed  gpt-3.5-turbo-0125, temperature = 0, chunk size = 1500, no. retrieved chunk