# Configuration

In [1]:
import os
from ast import literal_eval
from math import floor
from typing import Dict, List, Optional, Tuple

import joblib
import numpy as np
import pandas as pd

import dotenv
dotenv.load_dotenv()

import weaviate
import weaviate.classes as wvc
from weaviate.collections.classes.config import (
    Property, DataType
)

from langchain_ollama import OllamaLLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_openai import ChatOpenAI
from llmlingua import PromptCompressor
from jinja2 import Template

from ragas import evaluate, RunConfig
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.metrics import (
    LLMContextRecall,
    LLMContextPrecisionWithReference,
    LLMContextPrecisionWithoutReference,
    AnswerRelevancy,
    AnswerCorrectness,
    AnswerSimilarity,
    Faithfulness,
)
from datasets import Dataset

In [2]:
# Runtime settings are read from the environment (dotenv.load_dotenv() ran in the import cell).
# os.getenv returns None for an unset variable, so a missing entry only surfaces
# later, at the point where the value is used.
embedding_model_path = os.getenv("ENCODER_MODEL")  # model_name for the sentence-transformer embeddings
llm_name = os.getenv("LLM")  # model name handed to OllamaLLM
prompts_folder = os.getenv("PROMPTS_FOLDER")  # folder that contains final_prompt.j2

In [3]:
# Heavy shared resources, created once and reused by the cells below.
# The embedding model is placed on the GPU ('cuda') and loaded with trust_remote_code enabled.
embedding_model = SentenceTransformerEmbeddings(model_name=embedding_model_path, model_kwargs={"trust_remote_code":True, 'device': 'cuda'})
# LLMLingua-2 prompt compressor; RAGSystem uses it to shorten the retrieved context.
compressor = PromptCompressor(model_name='microsoft/llmlingua-2-xlm-roberta-large-meetingbank', use_llmlingua2=True)
# Client for a locally running Weaviate instance.
# NOTE(review): no wv_client.close() is visible in this part of the notebook - confirm the connection is released.
wv_client = weaviate.connect_to_local()

  embedding_model = SentenceTransformerEmbeddings(model_name=embedding_model_path, model_kwargs={"trust_remote_code":True, 'device': 'cuda'})
<All keys matched successfully>


In [4]:
# Stand-alone handle to the local Ollama model (temperature 0).
# The URL is a plain string: the original f-string had no placeholders.
llm = OllamaLLM(
    model=llm_name,
    temperature=0,
    base_url="http://localhost:11434"
)

# Load data

In [80]:
class BooksProcessor:
    """Split books into chunks, embed the chunks and store them in Weaviate.

    Every book lives in one collection per chunking configuration, named
    ``<book_name><suffix>`` (for example ``<book_name>_medium_chunks``).
    """

    # (collection suffix, chunk_size, chunk_overlap) triples ingested by process_book.
    # Sizes tried earlier and currently disabled:
    #   ('_big_chunks', 3000, 1000), ('_small_chunks', 400, 50)
    CHUNK_CONFIGS = [
        ('_medium_chunks', 1000, 100),
    ]
    # Every suffix a book may have been stored under; delete_book removes all of them.
    ALL_SUFFIXES = ['_big_chunks', '_medium_chunks', '_small_chunks']

    def __init__(self, wv_client, embedding_model):
        """Keep the Weaviate client and the embedding model used for ingestion."""
        self.embedding_model = embedding_model
        self.wv_client = wv_client

    def create_collection_if_not_exists(self, collection_name):
        """Return the collection called ``collection_name``, creating it first if needed.

        The schema holds the chunk text, the book name and the chunk index.
        No vectorizer is configured here: process_book supplies the vector of
        every object itself.
        """
        if not self.wv_client.collections.exists(collection_name):
            self.wv_client.collections.create(
                name=collection_name,
                properties=[
                    Property(name="chunk", data_type=DataType.TEXT),
                    Property(name="book_name", data_type=DataType.TEXT),
                    Property(name="chunk_num", data_type=DataType.INT)
                ],
            )
        return self.wv_client.collections.get(collection_name)

    def split_book(self, book_text, chunk_size, chunk_overlap):
        """Split ``book_text`` with RecursiveCharacterTextSplitter and return the chunk strings."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        return [doc.page_content for doc in splitter.create_documents([book_text])]

    def process_book(self, book_name, book_txt):
        """Chunk, embed and store ``book_txt`` under every entry of CHUNK_CONFIGS.

        A configuration whose collection already exists is skipped with the
        message "Book already exists". Splitting and embedding happen *before*
        the collection is created, so a failure in either step (or a text that
        yields no chunks) cannot leave behind an empty collection that a later
        run would mistake for an ingested book.
        """
        for suffix, chunk_size, overlap in self.CHUNK_CONFIGS:
            collection_name = book_name + suffix
            if self.wv_client.collections.exists(collection_name):
                print("Book already exists")
                continue

            chunks = self.split_book(book_txt, chunk_size, overlap)
            if not chunks:
                print(f"No chunks produced for {collection_name}, nothing stored")
                continue

            # The 'search_document: ' prefix is only used for embedding; the
            # stored chunk text stays unprefixed.
            embeddings = self.embedding_model.embed_documents(
                ['search_document: ' + chunk for chunk in chunks]
            )

            collection = self.create_collection_if_not_exists(collection_name)
            collection.data.insert_many([
                wvc.data.DataObject(
                    properties={
                        "chunk": chunk,
                        "book_name": book_name,
                        "chunk_num": chunk_num
                    },
                    vector=embedding
                )
                for chunk_num, (chunk, embedding) in enumerate(zip(chunks, embeddings))
            ])

    def delete_book(self, book_name: str) -> None:
        """
        Delete all collections associated with a book.

        Deletion errors are reported and do not stop the remaining deletions.
        The success message is printed only when no deletion failed.
        """
        failed = []
        for suffix in self.ALL_SUFFIXES:
            collection_name = book_name + suffix
            if not self.wv_client.collections.exists(collection_name):
                continue
            try:
                self.wv_client.collections.delete(collection_name)
            except Exception as e:
                failed.append(collection_name)
                print(f"Error deleting collection {collection_name}: {e}")

        if failed:
            print(f"Could not delete {len(failed)} collection(s) for {book_name}")
        else:
            print(f"Successfully deleted collections for {book_name}")

In [6]:
# processor.delete_book('Sherlock_Study_in_Scarlet')

In [7]:
# Ingest the book into Weaviate. When the book's medium-chunk collection is already
# present, process_book only prints "Book already exists" and stores nothing.
processor = BooksProcessor(wv_client, embedding_model)
with open('Sherlock_Study_in_Scarlet.txt', 'r', encoding='utf8') as file:
    text = file.read()
processor.process_book('Sherlock_Study_in_Scarlet', text)
#processor.delete_book('Sherlock_Study_in_Scarlet')

Book already exists


In [8]:
# Handle to the stored medium-chunk collection of the book; its objects are turned
# into langchain Documents in the next cell.
collection_type = '_medium_chunks'
book_name='Sherlock_Study_in_Scarlet'
book = wv_client.collections.get(book_name + collection_type)

In [9]:
# One langchain Document per object stored in the collection: the chunk text is
# the page content and the book name is kept as "file_name" metadata.
docs = [
    Document(
        metadata={"file_name": stored.properties['book_name']},
        page_content=stored.properties['chunk'],
    )
    for stored in book.iterator()
]

# RAG System

In [127]:
class Search:
    """Vector search over the chunks of a book stored in Weaviate."""

    def __init__(self, wv_client, embedding_model):
        """Keep the Weaviate client and the model used to embed queries."""
        self.wv_client = wv_client
        self.embedding_model = embedding_model
        # Factor applied to log(number of stored chunks) when deciding how many
        # chunks to fetch, keyed by collection suffix.
        self.multiplier_mapping = {'_big_chunks': 0.7, '_medium_chunks': 1, '_small_chunks': 1.9}

    def search(self, query, book_name):
        """Return the chunks nearest to ``query`` as one text block.

        The medium-chunk collection of ``book_name`` is queried. The number of
        chunks fetched is floor(multiplier * ln(collection size)), but never
        less than 1. Each chunk is rendered as a "CHUNK <chunk_num>" header
        followed by its stripped text, in the order the query returned them.
        """
        collection_type = '_medium_chunks'
        collection = self.wv_client.collections.get(book_name + collection_type)

        stored_count = collection.aggregate.over_all(total_count=True).total_count
        scaled_count = self.multiplier_mapping[collection_type] * np.log(stored_count)
        limit = floor(np.maximum(scaled_count, 1))

        query_vector = self.embedding_model.embed_query('search_query: ' + query)
        result = collection.query.near_vector(
            near_vector=list(query_vector),
            limit=limit,
            return_metadata=wvc.query.MetadataQuery(certainty=True),
        )

        sections = []
        for hit in result.objects:
            props = hit.properties
            sections.append(f"\nCHUNK {props['chunk_num']}\n" + props['chunk'].strip())
        return '\n'.join(sections)

class RAGSystem:
    """Retrieval-augmented answering over the books stored in Weaviate.

    For every requested book the relevant chunks are retrieved, compressed with
    the prompt compressor and inserted into the ``final_prompt.j2`` template,
    which is then sent to the local Ollama model.
    """

    def __init__(self, wv_client, embedding_model, compressor, llm_name, prompts_folder, compression_rate=0.75):
        """Wire up retrieval, compression and generation.

        Args:
            wv_client: Weaviate client passed on to ``Search``.
            embedding_model: model used to embed queries.
            compressor: object exposing ``compress_prompt``.
            llm_name: Ollama model name.
            prompts_folder: folder that contains ``final_prompt.j2``.
            compression_rate: ``rate`` argument passed to ``compress_prompt``.
        """
        self.embedding_model = embedding_model
        self.searcher = Search(wv_client, self.embedding_model)
        self.compression_rate = compression_rate
        self.compressor = compressor
        self.llm = OllamaLLM(
            model=llm_name,
            temperature=0,
            base_url="http://localhost:11434"
        )
        with open(os.path.join(prompts_folder, 'final_prompt.j2')) as f:
            self._template = f.read()

    def query(self, query: str, book_names: List[str],
              dialogue_history: Optional[List[Dict[str, str]]] = None) -> Tuple[List[str], str]:
        """Answer ``query`` using context retrieved from ``book_names``.

        Args:
            query: the user question.
            book_names: books whose collections are searched.
            dialogue_history: earlier turns handed to the prompt template;
                ``None`` is treated as an empty history.

        Returns:
            ``(contexts, answer)``: the compressed context strings (one per
            book that produced context) and the model's answer. When no book
            yields any context the result is
            ``([], "No relevant information found.")``, so callers can always
            unpack two values.
        """
        dialogue_history = dialogue_history or []
        compressed_contexts = []

        for book_name in book_names:
            context = self.searcher.search(query, book_name)
            if context:
                compressed = self.compressor.compress_prompt(
                    context,
                    rate=self.compression_rate,
                    # Tokens the compressor must keep: line breaks, sentence
                    # punctuation and the CHUNK markers written by Search.search.
                    force_tokens=['\n', '?', '.', '!', 'CHUNK']
                )['compressed_prompt']
                compressed_contexts.append(f"From {book_name}:\n{compressed}")

        if not compressed_contexts:
            # Same two-element shape as the normal path; the model is not called.
            return [], "No relevant information found."

        final_prompt = Template(self._template).render(
            contexts=compressed_contexts,
            dialogue_history=dialogue_history,
            query=query
        )

        return compressed_contexts, self.llm.invoke(final_prompt)

In [128]:
# The pipeline under evaluation; compression_rate keeps its default of 0.75.
rag = RAGSystem(wv_client, embedding_model, compressor, llm_name=llm_name, prompts_folder=prompts_folder)

# Ragas

## Generate

In [5]:
# ragas wrappers: gpt-4o-mini (through ChatOpenAI) is the LLM for test-set generation
# and for the LLM-based metrics; the embeddings come from the local embedding_model.
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
# generator_llm = LangchainLLMWrapper(llm)
generator_embeddings = LangchainEmbeddingsWrapper(embedding_model)

In [7]:
# Synthetic test-set generator driven by the wrapped LLM and embeddings.
testsetgenerator = TestsetGenerator(generator_llm, embedding_model=generator_embeddings)

In [12]:
# testsetgenerator_.knowledge_graph = joblib.load('testsetgenerator_kg.pkl')
# testsetgenerator_.persona_list = joblib.load('testsetgenerator_pl.pkl')

In [8]:
testsetgenerator

TestsetGenerator(llm=LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)), embedding_model=LangchainEmbeddingsWrapper(embeddings=HuggingFaceEmbeddings(...)), knowledge_graph=KnowledgeGraph(nodes: 0, relationships: 0), persona_list=None)

In [None]:
# Generate the synthetic question/answer set from the chunk Documents in `docs`.
# This calls the wrapped gpt-4o-mini model; the result is saved to CSV in a later
# cell and the Eval section starts from that file.
test_size = 100
testset = testsetgenerator.generate_with_langchain_docs(docs, testset_size=test_size)

In [None]:
# Tabular view of the generated samples; the last expression previews the first rows.
testset_pd = testset.to_pandas()
testset_pd.head()

In [None]:
# Persist the generated test set; the Eval section reloads it from this file.
testset_pd.to_csv('synthetic_data_ragas_4o-mini.csv', index=False)

In [None]:
# joblib.dump(testsetgenerator.knowledge_graph, 'testsetgenerator_kg_llama_4o-mini.pkl')

In [None]:
# joblib.dump(testsetgenerator.persona_list, 'testsetgenerator_pl_llama_4o-mini.pkl') 

## Eval

In [115]:
# Reload the saved test set. 'reference_contexts' was written to the CSV as the text
# form of a Python list, so literal_eval turns each cell back into a list.
testset_pd = pd.read_csv('synthetic_data_ragas_4o-mini.csv', converters={'reference_contexts': literal_eval})

In [116]:
# Drop the first 38 characters of every reference context.
# NOTE(review): 38 is a magic number - presumably the length of a fixed prefix on
# each stored context; confirm against the CSV.
# The column is overwritten in place, so running this cell a second time without
# reloading the CSV strips another 38 characters.
testset_pd['reference_contexts'] = testset_pd['reference_contexts'].apply(lambda lst: [s[38:] for s in lst])

In [117]:
# Evaluation frame: questions, reference contexts and reference answers come from the
# test set; 'retrieved_contexts' and 'response' start as empty-string placeholders and
# are filled with the RAG outputs in the next cell.
evalset_pd = pd.DataFrame({'user_input':testset_pd['user_input'],
                           'retrieved_contexts': [''] * len(testset_pd),
                           'reference_contexts': testset_pd['reference_contexts'],
                           'response': [''] * len(testset_pd),
                           'reference': testset_pd['reference']})

In [None]:
# Run the RAG pipeline once per question (no dialogue history). rag.query returns a
# (contexts, answer) pair on its normal path; result_type='expand' spreads the pair
# over the two target columns.
evalset_pd[['retrieved_contexts', 'response']] = evalset_pd.apply(lambda row: rag.query(
    query=row['user_input'],
    book_names=['Sherlock_Study_in_Scarlet'],
    dialogue_history=[]
), axis=1, result_type ='expand')

In [155]:
# evalset_pd.to_csv('synthetic_full_data_ragas_4o-mini_llama-3.2.csv', index=False)

In [4]:
# Reload the saved RAG outputs instead of recomputing them; both list-valued columns
# are parsed back from their text form with literal_eval.
evalset_pd = pd.read_csv('synthetic_full_data_ragas_4o-mini_llama-3.2.csv', converters={'retrieved_contexts': literal_eval, 'reference_contexts': literal_eval})

In [15]:
# Execution limits for the ragas run: timeout 120, max_wait 180 and a single worker.
# NOTE(review): the two time values are presumably seconds - confirm in the ragas RunConfig docs.
run_config = RunConfig(timeout=120, max_wait = 180, max_workers= 1)

In [None]:
# Score the RAG outputs with ragas. gpt-4o-mini judges the LLM-based metrics and the
# local embedding model backs the embedding-based ones.
# run_config (built in the previous cell) is passed explicitly; without it the
# timeout / max_wait / max_workers settings are never applied to this run.
score_result = evaluate(
    dataset=Dataset.from_pandas(evalset_pd),
    metrics=[
        LLMContextRecall(llm=generator_llm),
        LLMContextPrecisionWithReference(llm=generator_llm),
        LLMContextPrecisionWithoutReference(llm=generator_llm),
        AnswerCorrectness(llm=generator_llm, embeddings=generator_embeddings),
        AnswerRelevancy(llm=generator_llm, embeddings=generator_embeddings),
        AnswerSimilarity(embeddings=generator_embeddings),
        Faithfulness(llm=generator_llm)
    ],
    run_config=run_config
)

In [18]:
score_result

{'context_recall': 0.7348, 'llm_context_precision_with_reference': 0.9796, 'llm_context_precision_without_reference': 0.9400, 'answer_correctness': 0.5003, 'answer_relevancy': 0.6371, 'semantic_similarity': 0.8422, 'faithfulness': 0.8400}

In [39]:
# Per-sample scores as a DataFrame (one row per evaluated question).
scoreset_pd = score_result.to_pandas()

In [40]:
# scoreset_pd.to_csv('synthetic_score_data_ragas_4o-mini_llama-3.2.csv', index=False)

In [7]:
# Reload the saved per-sample scores; the list-valued context columns are parsed back
# from their text form with literal_eval.
scoreset_pd = pd.read_csv('synthetic_score_data_ragas_4o-mini_llama-3.2.csv', converters={'retrieved_contexts': literal_eval, 'reference_contexts': literal_eval})

In [39]:
# Keep only the samples whose every metric score is above 0.5. Columns from position 5
# onward are the metric columns (the first five are the text/context fields).
# .all(axis=1) states "every metric passes" directly instead of comparing the row
# mean of booleans with 1; a missing (NaN) score still fails the comparison.
filter_scoreset_pd = scoreset_pd[(scoreset_pd.iloc[:, 5:] > 0.5).all(axis=1)].reset_index(drop=True).copy()

In [42]:
print('Len good dataset:', len(filter_scoreset_pd))

Len good dataset: 32


In [40]:
scoreset_pd.iloc[:, 5:].mean()

context_recall                             0.734760
llm_context_precision_with_reference       0.979592
llm_context_precision_without_reference    0.940000
answer_correctness                         0.500302
answer_relevancy                           0.637103
semantic_similarity                        0.842173
faithfulness                               0.840015
dtype: float64

In [41]:
filter_scoreset_pd.iloc[:, 5:].mean()

context_recall                             0.908073
llm_context_precision_with_reference       1.000000
llm_context_precision_without_reference    1.000000
answer_correctness                         0.651239
answer_relevancy                           0.845944
semantic_similarity                        0.882017
faithfulness                               0.894918
dtype: float64

In [34]:
def print_score_pd(scoreset_pd, idx):
    """Print one evaluated sample: its text fields, then its metric scores.

    Args:
        scoreset_pd: DataFrame holding the sample fields and the metric columns.
        idx: positional index of the row to print.
    """
    row = scoreset_pd.iloc[idx]

    # Text fields: each one is followed by an empty line.
    text_fields = (
        'user_input',
        'retrieved_contexts',
        'reference_contexts',
        'response',
        'reference',
    )
    for field in text_fields:
        print(f'{field}:', row[field], '\n')

    # Metric scores: one "name = value" line each.
    metric_fields = (
        'context_recall',
        'llm_context_precision_with_reference',
        'llm_context_precision_without_reference',
        'answer_correctness',
        'answer_relevancy',
        'semantic_similarity',
        'faithfulness',
    )
    for metric in metric_fields:
        print(f'{metric} =', row[metric])

In [35]:
print_score_pd(scoreset_pd, 99)

user_input: What can we learn about the character Hope and the themes of hope and sacrifice in the context of the murder case involving Enoch Drebber and Joseph Stangerson? 


reference_contexts: ['“The public,” it said, “have lost a sensational treat through the\nsudden death of the man Hope, who was suspected of the murder of Mr.\nEnoch Drebber and of Mr. Joseph Stangerson. The details of the case\nwill probably be never known now, though we are informed upon good\nauthority that the crime was the result of an old standing and romantic\nfeud, in which love and Mormonism bore a part. It seems that both the\nvictims belonged, in their younger days, to the Latter Day Saints, and\nHope, the deceased prisoner, hails also from Salt Lake City. If the\ncase has had no other effect, it, at least, brings out in the most\nstriking manner the efficiency of our detective police force, and will\nserve as a lesson to all foreigners that they will do wisely to settle\ntheir feuds at home, and not to

In [53]:
print_score_pd(scoreset_pd, 5)

user_input: How does the reference to the Roman miser in 'A Study in Scarlet' reflect Sherlock Holmes' perspective on success? 

retrieved_contexts: ['From Sherlock_Study_in_Scarlet:\n\nCHUNK 330\n “Didn’t I tell so when started?” cried Sherlock Holmes\n laugh. result of our Study in Scarlet get\n testimonial!”\n\n “Never mind I answered “I have facts in my journal\n public shall know them. make yourself\n contented by consciousness of success like Roman miser—\n\n\n “‘Populus me sibilat plaudo\n Ipse domi simul nummos contemplor in arca.\n\n\n\n\n\n\n END OF PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***\n\n\n\n\n Updated editions replace previous old editions\n renamed.\n\nCHUNK\n Project Gutenberg eBook of A Study in Scarlet\n\n ebook for use of anyone in United States\n other parts of world at no cost almost no restrictions\n. may copy give away re-use under terms\n Project Gutenberg License online\n at www. gutenberg. org. If not located in United States\n check laws of country\n 