## Prerequisites

In [58]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset, Features, Sequence, Value
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
  
)
from ragas import evaluate 
import json
import datasets
import os
import glob
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets


# Do not truncate long cell values (questions, answers, contexts) when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from transformers import AutoTokenizer
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM
from langchain_community.llms import HuggingFaceHub
from langchain_community.document_loaders import PyPDFLoader
from langchain.llms import OpenAI

In [3]:
# Load API credentials from a local .env file (if one exists) into os.environ.
from dotenv import load_dotenv

load_dotenv()

# The keys are read from the environment by the cells below. Re-assigning
# os.environ[key] = os.getenv(key) is a no-op when the key is set and raises
# TypeError when it is not, so check explicitly and report what is missing.
REQUIRED_ENV_KEYS = ("HUGGINGFACEHUB_API_TOKEN", "OPENAI_API_KEY")
_missing_env_keys = [key for key in REQUIRED_ENV_KEYS if not os.getenv(key)]
if _missing_env_keys:
    import warnings

    warnings.warn(
        "Missing environment variable(s): "
        + ", ".join(_missing_env_keys)
        + ". Add them to your .env file; cells that use the corresponding service will fail."
    )

In [4]:
class RAG_pipeline:
    """Small retrieval-augmented generation pipeline over a directory of PDF files.

    Construction loads every ``*.pdf`` in ``data_dir_path`` into
    ``self.knowledge_base``. ``load_embeddings`` builds (or reloads) a FAISS
    index over the chunked documents, and ``answer_with_rag`` retrieves chunks
    for a question and asks an LLM to answer from them.
    """

    def __init__(self, data_dir_path: str, chunk_size: int):
        """
        Args:
            data_dir_path: directory that is searched (non-recursively) for ``*.pdf`` files.
            chunk_size: maximum chunk length passed to the text splitter.
        """
        self.data_dir_path = data_dir_path
        self.chunk_size = chunk_size
        self.RAG_PROMPT_TEMPLATE = """
            <|system|>
            Using the information contained in the context,
            give a comprehensive answer to the question.
            Respond only to the question asked, response should be concise and relevant to the question.
            Provide the number of the source document when relevant.
            If the answer cannot be deduced from the context, do not give an answer.</s>
            <|user|>
            Context:
            {context}
            ---
            Now here is the question you need to answer.

            Question: {question}
            </s>
            <|assistant|>
        """
        self.load_documents(self.data_dir_path)

    def load_documents(self, data_dir_path: str):
        """Load all PDFs under ``data_dir_path`` into ``self.knowledge_base``.

        Files are visited in sorted order so that the resulting document list
        (and anything built from it) is reproducible across runs; ``glob``
        itself returns paths in arbitrary order.
        """
        docs = []
        for file_path in sorted(glob.glob(os.path.join(data_dir_path, "*.pdf"))):
            loader = PyPDFLoader(file_path)
            docs.extend(loader.load_and_split())

        self.knowledge_base = [
            LangchainDocument(page_content=page.page_content, metadata=page.metadata)
            for page in tqdm(docs)
        ]

    def split_documents(self, tokenizer_name: str) -> List[LangchainDocument]:
        """
        Split the knowledge base into chunks and return the de-duplicated chunks.

        The splitter is created with ``from_huggingface_tokenizer``, so
        ``self.chunk_size`` is measured with the tokenizer named by
        ``tokenizer_name`` rather than in raw characters. Consecutive chunks
        overlap by a tenth of ``chunk_size``.

        Args:
            tokenizer_name: Hugging Face model id whose tokenizer measures chunk length.

        Returns:
            Chunks in original order, keeping the first occurrence of each distinct text.
        """
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            AutoTokenizer.from_pretrained(tokenizer_name),
            chunk_size=self.chunk_size,
            chunk_overlap=int(self.chunk_size / 10),
            add_start_index=True,
            strip_whitespace=True,
            separators=["\n\n", "\n", ".", " ", ""],
        )

        docs_processed = text_splitter.split_documents(self.knowledge_base)

        # Remove duplicates (first occurrence wins, order preserved).
        seen_texts = set()
        docs_processed_unique = []
        for doc in docs_processed:
            if doc.page_content not in seen_texts:
                seen_texts.add(doc.page_content)
                docs_processed_unique.append(doc)

        return docs_processed_unique

    def load_embeddings(self,
        embedding_model_name: Optional[str] = "thenlper/gte-small") -> FAISS:
        """
        Return a FAISS index over the chunked knowledge base.

        The index is cached on disk under ``./data/indexes/`` in a folder keyed
        by chunk size and embedding model name. If that folder exists it is
        loaded directly; otherwise the documents are split (using the embedding
        model's tokenizer), embedded, indexed, and saved.

        Args:
            embedding_model_name: name of the embedding model to use; it is also
                used as the tokenizer name when splitting documents.

        Returns:
            FAISS index using cosine distance.

        Raises:
            ValueError: if the index has to be built but there are no chunks to index.
        """
        # load embedding_model
        embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            multi_process=True,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
        )

        # Check if embeddings already exist on disk.
        # NOTE(review): the folder name contains ':' which is not a valid path character on
        # Windows; it is left unchanged here so existing cached indexes are still found.
        index_name = f"index_chunk:{self.chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
        index_folder_path = f"./data/indexes/{index_name}/"
        if os.path.isdir(index_folder_path):
            return FAISS.load_local(
                index_folder_path,
                embedding_model,
                distance_strategy=DistanceStrategy.COSINE,
            )

        print("Index not found, generating it...")
        docs_processed = self.split_documents(embedding_model_name)
        if not docs_processed:
            raise ValueError(
                f"No document chunks to index: no readable PDF content found in {self.data_dir_path!r}."
            )
        knowledge_index = FAISS.from_documents(
            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_index.save_local(index_folder_path)
        return knowledge_index

    def answer_with_rag(self, question: str,
        llm: LLM,
        knowledge_index: VectorStore,
        reranker: Optional[RAGPretrainedModel] = None,
        num_retrieved_docs: int = 30,
        num_docs_final: int = 7) -> Tuple[str, List[LangchainDocument]]:
        """Answer a question using RAG with the given knowledge index.

        Args:
            question: the user question.
            llm: callable invoked with the final prompt string; its return value is the answer.
            knowledge_index: vector store queried with ``similarity_search``.
            reranker: optional reranker applied to the retrieved texts.
            num_retrieved_docs: number of chunks requested from the index.
            num_docs_final: number of chunks kept for the prompt.

        Returns:
            ``(answer, relevant_docs)`` where ``relevant_docs`` is the list of
            chunk texts (plain strings) that were placed in the prompt.
        """
        # Gather documents with retriever; keep only the text.
        retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
        relevant_docs = [doc.page_content for doc in retrieved]

        # Optionally rerank results (skipped when nothing was retrieved).
        if reranker and relevant_docs:
            reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
            relevant_docs = [doc["content"] for doc in reranked]

        relevant_docs = relevant_docs[:num_docs_final]

        # Build the final prompt. Each chunk goes on its own lines so the end of one
        # chunk does not run into the "Document N:::" header of the next.
        context = "\nExtracted documents:\n" + "\n".join(
            f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs)
        )
        final_prompt = self.RAG_PROMPT_TEMPLATE.format(question=question, context=context)

        # Generate the answer.
        answer = llm(final_prompt)

        return answer, relevant_docs

In [5]:
# Build the pipeline: the constructor loads every PDF found in ./data; chunk_size=512 is used later by the splitter.
rag_pipeline = RAG_pipeline(data_dir_path="./data", chunk_size=512)

  0%|          | 0/662 [00:00<?, ?it/s]

In [6]:
# Load the cached FAISS index from ./data/indexes/, or build and save it on first run (default embedding model).
index = rag_pipeline.load_embeddings()

Index not found, generating it...


In [None]:
# Alternative reader LLM built with LangChain's HuggingFaceHub wrapper.
# This cell has no execution count, and READER_LLM is reassigned to OpenAI() in a later cell.
from langchain_community.llms import HuggingFaceHub

repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"

READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    # Generation settings forwarded to the model.
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

In [None]:
# Optional: set up a Gemini model handle.
import google.generativeai as genai

# os.environ[...] = os.getenv(...) raises TypeError when the variable is unset,
# so read the key once and fail with an actionable message instead.
_google_api_key = os.getenv("GOOGLE_API_KEY")
if not _google_api_key:
    raise EnvironmentError("GOOGLE_API_KEY is not set; add it to your .env file before running this cell.")
genai.configure(api_key=_google_api_key)
model = genai.GenerativeModel('gemini-pro')

In [7]:
# Reader LLM passed to generate_ans_context below: LangChain's OpenAI wrapper with default settings.
# Presumably picks up OPENAI_API_KEY from the environment loaded earlier — confirm.
READER_LLM = OpenAI()

  warn_deprecated(


In [47]:
# Directory holding the manually written QA sets ("Manual QA - Q1.csv" ... "Manual QA - Q4.csv").
# Override with the SYNTHETIC_QA_DIR environment variable to run on another machine;
# the default is the original author's location.
SYNTHETIC_QA_DIR = os.getenv(
    "SYNTHETIC_QA_DIR", "/Users/rahulkushwaha/Desktop/LLM Eval/synthethic_que"
)

# One DataFrame per question set, in order Q1..Q4.
synthetic_ques, synthetic_ques2, synthetic_ques3, synthetic_ques4 = (
    pd.read_csv(os.path.join(SYNTHETIC_QA_DIR, f"Manual QA - Q{set_number}.csv"))
    for set_number in range(1, 5)
)

In [48]:
def generate_ans_context(df, READER_LLM, index, pipeline=None):
    """Answer every question in ``df`` with the RAG pipeline and record the results.

    Adds two columns to ``df`` in place and returns the same frame:
    ``answer`` (the LLM response) and ``contexts`` (the list of all context
    strings that were placed in the prompt for that question).

    Args:
        df: DataFrame with a ``question`` column; any index is accepted.
        READER_LLM: LLM forwarded to ``answer_with_rag`` as ``llm``.
        index: vector store forwarded to ``answer_with_rag`` as ``knowledge_index``.
        pipeline: object exposing ``answer_with_rag``; defaults to the
            notebook-level ``rag_pipeline``.

    Returns:
        ``df`` with the ``answer`` and ``contexts`` columns filled.
    """
    if pipeline is None:
        pipeline = rag_pipeline

    answers = []
    contexts = []
    # Iterate over values rather than positional labels so filtered/re-indexed frames work.
    for question in df["question"]:
        response, retrieved_contexts = pipeline.answer_with_rag(
            question=question, llm=READER_LLM, knowledge_index=index
        )
        answers.append(response)
        # Keep every context used for the answer, not just the first one,
        # so context-based metrics see what the LLM actually saw.
        contexts.append(list(retrieved_contexts))

    df["answer"] = pd.Series(answers, index=df.index, dtype=object)
    df["contexts"] = pd.Series(contexts, index=df.index, dtype=object)
    return df

In [50]:
# Generate answers and contexts for question set Q4. The function adds its columns to the
# frame it is given and returns it, so `df` and `synthetic_ques4` refer to the same object.
df = generate_ans_context(synthetic_ques4,READER_LLM,index)

In [61]:
def RAGAs_eval(df, metrics=None):
    """Score a QA DataFrame with RAGAS and return the per-row results.

    The ``contexts`` column of ``df`` is normalised in place so that every
    entry is a list of strings (a non-list value is wrapped as ``[str(value)]``),
    then the frame is converted to a ``datasets.Dataset`` and evaluated.

    Args:
        df: DataFrame with a ``contexts`` column.
            NOTE(review): the selected metrics presumably also need question /
            answer / ground-truth columns — confirm against the installed ragas version.
        metrics: list of RAGAS metrics; defaults to context recall, context
            precision and answer similarity.

    Returns:
        DataFrame produced by ``score.to_pandas()``.
    """
    if metrics is None:
        metrics = [context_recall, context_precision, answer_similarity]

    def _as_context_list(value):
        # Lists pass through; tuples become lists; anything else is a single context.
        if isinstance(value, list):
            return value
        if isinstance(value, tuple):
            return list(value)
        return [str(value)]

    df['contexts'] = df['contexts'].apply(_as_context_list)
    dataset = Dataset.from_pandas(df)
    score = evaluate(dataset, metrics=metrics)
    return score.to_pandas()

In [62]:
# Replace `df` with the RAGAS result frame (the output of score.to_pandas()).
df = RAGAs_eval(df)

Evaluating:   0%|          | 0/33 [00:00<?, ?it/s]

In [66]:
def huggingface_eval(df):
    """Add Hugging Face ``evaluate`` measurements for every answer in ``df``.

    Adds three columns to ``df`` in place and returns the same frame:

    * ``honesty`` – ``honest_score_per_group`` from the HONEST measurement,
      computed with the answer submitted once for each of the groups
      "male" and "female".
    * ``toxicity`` – the ``toxicity`` field of the toxicity measurement.
    * ``language polarity`` – the ``regard`` field of the regard measurement.

    Args:
        df: DataFrame with an ``answer`` column; any index is accepted.

    Returns:
        ``df`` with the three measurement columns filled.
    """
    # Imported locally under an alias: the notebook-level name `evaluate` is ragas.evaluate.
    import evaluate as hf_evaluate

    honest = hf_evaluate.load('honest', 'en')
    toxicity = hf_evaluate.load("toxicity", module_type="measurement")
    regard = hf_evaluate.load("regard")

    groups = ["male", "female"]
    honesty_scores = []
    toxicity_scores = []
    regard_scores = []

    # Collect results positionally and assign against df.index afterwards, so reads and
    # writes stay aligned even when the index is not 0..n-1.
    for answer in df["answer"]:
        # NOTE(review): each completion is passed as a one-element list containing the whole
        # answer; HONEST may expect a list of words per completion — confirm.
        honest_result = honest.compute(predictions=[[answer], [answer]], groups=groups)
        honesty_scores.append(honest_result['honest_score_per_group'])

        toxic_results = toxicity.compute(predictions=[answer])
        toxicity_scores.append(toxic_results['toxicity'])

        lp_results = regard.compute(data=[answer])
        regard_scores.append(lp_results['regard'])

    df["honesty"] = pd.Series(honesty_scores, index=df.index, dtype=object)
    df["toxicity"] = pd.Series(toxicity_scores, index=df.index, dtype=object)
    df["language polarity"] = pd.Series(regard_scores, index=df.index, dtype=object)

    return df

In [67]:
# Append the honesty, toxicity and language polarity columns to the scored frame.
df = huggingface_eval(df)

Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint
