In [57]:
%cd VALDOM_LLM_ENGINEERING
!git pull
%cd ..
# https://github.com/NirAndria/VALDOM_LLM_ENGINEERING.git


/users/formation/irtn7ndrmn/llm_engineering/VALDOM_LLM_ENGINEERING
Already up to date.
/users/formation/irtn7ndrmn/llm_engineering


In [58]:
import os 
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import ollama
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from smolagents import tool
import pandas as pd 
from smolagents import CodeAgent, LiteLLMModel, ToolCallingAgent, DuckDuckGoSearchTool
from langchain_core.documents import Document
from typing import List, Optional, Any
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from abc import abstractmethod, ABC


oc = ollama.Client("http://localhost:11435")
model = "deepseek-r1:32b"


In [59]:
doc_list = []
path_file = "VALDOM_LLM_ENGINEERING/data_RAG/processed_table/"
def generate_doc(path_file):
    for file_name in os.listdir(path_file):
        print(file_name)
        doc_list.append(TextLoader(path_file + file_name).load()[0])
        
generate_doc(path_file)

Age of MPs.txt
Age of MPs_Perc.txt
Age of newly elected MPs.txt
Education.txt
Ethnicity.txt
Ethnicity_Perc.txt
Female MPs.txt
Female MPs_Perc.txt
Men and Women.txt
New MPs.txt
New MPs_Perc.txt
New MPs_Tot.txt
Occupations 1997-2015.txt
Occupations 1997-2015_Perc.txt
Occupations 2017.txt
Length of Service.txt
Occupations 2019_.txt


In [60]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],  # Prioritize keeping paragraphs together, then sentences, then words.
    chunk_size=500,  # Aim for chunks of around 256 characters
    chunk_overlap=16,  # Overlap chunks by 16 characters to preserve context
)

# Apply the splitter
doc_list = text_splitter.split_documents(doc_list)

In [61]:
from abc import abstractmethod, ABC
from langchain_core.documents import Document
from typing import List, Optional, Any

class RAGInterface(ABC):
    """
    Abstract class defining a generic RAG system. 
    
    This class ensures that all RAG implementations follow a common structure.
    """

    def __init__(self, name: str, knowledge_db: Optional[Any] = None):
        self.name = name  # Identifier for the RAG system
        self.knowledge_db = knowledge_db  # Storage backend (e.g., a vector database)

    @abstractmethod
    def retrieve(self, query: str) -> List[Document]:
        """
        Retrieve relevant contexts from the knowledge_db based on the query.

        Args:
            query (str): The user query.

        Returns:
            List[Document]: Retrieved document chunks.
        """
        pass

    @abstractmethod
    def generate(self, query: str, retrieved_contexts: List[Document]) -> str:
        """
        Generate a response based on the query and retrieved contexts.

        Args:
            query (str): The user query.
            retrieved_contexts (List[Document]): Relevant document chunks.

        Returns:
            str: The generated response.
        """
        pass


In [62]:
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings

embedding = OllamaEmbeddings(model="mistral")

In [63]:
query_list = []
question_path = "VALDOM_LLM_ENGINEERING/questions/"
for file in os.listdir(question_path):
    if ".txt" in file:
        with open(question_path + file,'r') as f:
            query_list.append(f.read())

qualitative_dataset =  pd.DataFrame({'user_input': query_list})
qualitative_dataset.head()

Unnamed: 0,user_input
0,"From 1997 to two years ago, how has the percen..."
1,What are the possible scenarios for forming a ...
2,Analyzing the attendance of UK Prime Ministers...
3,Can you identify any significant trends in the...


In [64]:
from langchain_core.prompts import PromptTemplate
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
import pandas as pd

# 📝 Define a Structured Prompt for Generation
prompt_template_v0 = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
Question: {query} 
Context: {retrieved_contexts} 
Answer:
"""


# Create a prompt template from the string
gen_prompt_v0 = PromptTemplate.from_template(prompt_template_v0)

# 🔎 Define BM25-Based RAG System
def run_rag_on_df(rag: RAGInterface, eval_df: pd.DataFrame) -> pd.DataFrame:
    """
    Runs a RAG system on a dataset and store responses.
    
    Args:
        rag (RAGInterface): The RAG system to evaluate.
        eval_df (pd.DataFrame): DataFrame containing user queries and references.
    
    Returns:
        pd.DataFrame: The updated DataFrame including retrieved contexts and generated responses.
    """
    rag_df = eval_df.copy()

    # Retrieve relevant contexts for each query
    rag_df["retrieved_contexts"] = rag_df["user_input"].apply(rag.retrieve)

    # Generate answers using the retrieved contexts
    rag_df["response"] = rag_df.apply(
        lambda row: rag.generate(row["user_input"], row["retrieved_contexts"]), axis=1
    )

    # Convert retrieved documents to their text content
    rag_df["retrieved_contexts"] = rag_df["retrieved_contexts"].apply(
        lambda rcontexts: [rcontext.page_content for rcontext in rcontexts]
    )

    # Track the RAG implementation used
    rag_df["rag_name"] = rag.name

    return rag_df


# Retrival

In [65]:
from langchain_core.prompts import PromptTemplate
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode

# Create a prompt template from the string
gen_prompt_v0 = PromptTemplate.from_template(prompt_template_v0)

# 🔎 Define BM25-Based RAG System
class BM25V0RAG(RAGInterface):
    """
    Sparse Retrieval RAG using BM25 and Qdrant.
    
    - Stores text chunks in Qdrant using BM25 sparse embeddings.
    - Retrieves the top-k relevant chunks based on keyword matching.
    - Uses a language model to generate answers from retrieved contexts.
    """

    def __init__(self, generation_model: OllamaLLM):
        # Initialize BM25 sparse embeddings
        sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25", cache_dir='.')

        # Store documents in Qdrant using sparse retrieval
        knowledge_db = QdrantVectorStore.from_documents(
            doc_list,  # Pre-processed text chunks
            embedding=OllamaEmbeddings(model="mistral"),  # Not used in this mode, but required
            sparse_embedding=sparse_embeddings,  # BM25 embeddings
            location=":memory:",  # Store in-memory (can be changed to persistent storage)
            collection_name="the_miser_sparse_v0",
            retrieval_mode=RetrievalMode.SPARSE,  # Use only sparse retrieval
        )

        # Define model name dynamically
        self.name = f"bm25_v0_{generation_model.model}"
        super().__init__(name=self.name, knowledge_db=knowledge_db)

        # Initialize the LLM and retriever
        self.llm = generation_model
        self.retriever = self.knowledge_db.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}  # Retrieve top 5 matches
        )
        self.gen_prompt = gen_prompt_v0  # Use the structured prompt
    
    def retrieve(self, query: str) -> List[Document]:
        """
        Retrieves relevant documents using BM25 sparse retrieval.

        Args:
            query (str): The user query.

        Returns:
            List[Document]: The most relevant document chunks.
        """
        retrieved_contexts = self.retriever.invoke(query)
        return retrieved_contexts
    
    def generate(self, query: str, retrieved_contexts: List[Document]) -> str:
        """
        Generates a response using the retrieved contexts.

        Args:
            query (str): The user query.
            retrieved_contexts (List[Document]): Retrieved document chunks.

        Returns:
            str: The generated answer.
        """
        # Format retrieved contexts into a single string
        
        format_retrieved_contexts = "\n".join([rc.page_content for rc in retrieved_contexts])
        # Format the query with the retrieved contexts
        augmented_query = self.gen_prompt.format(
            name=self.name,
            query=query,
            retrieved_contexts=format_retrieved_contexts
        )

        # Generate the final response
        response = self.llm.invoke(augmented_query)
        return response

In [66]:
from abc import abstractmethod, ABC

# Initialize the generation model
generation_llm = OllamaLLM(model="deepseek-r1:32b")

# Initialize BM25 RAG system
bm25_v0 = BM25V0RAG(generation_model=generation_llm)
#bm25_v0_qualitative_df = run_rag_on_df(rag=bm25_v0, eval_df=qualitative_dataset)

In [67]:
rag = bm25_v0

# Agent

In [68]:
@tool
def get_current_date() -> datetime.date:
    """
    This is a funciton to retrieve the current date. This tool takes no arguments.
    
    Args:

    Returns:
        A dictionary with today's date as a string in ISO format. It should look like this {'curren_date': AAAA.MM.DD}
    """
    print(f"Agent 'date_retriever' is executing the get_current_date tool.")
    try:
        current_date = datetime.now().date()  # Get the current date
        return current_date  # Return the date object directly
    except Exception as e:
        print(f"Error getting the current date: {str(e)}")
        return None 


In [69]:
@tool
def get_answer_from_rag(query: str) -> str:
    """Answer to a query given the documents given to a RAG
    
    Args:
        query: Query to be answered
        
    Returns:
        The answer of the query.
    """
    try:
        print(1)
        context = rag.retrieve(query)
    except Exception as e:
        print(f"Error : {str(e)}")
        return None
    
    try:
        print(2)
        print(f"query is: {query}")
        print(f"context is: {context}")
        answer = rag.generate(query, context)
        print(3)
    except Exception as e:
        print(f"Error : {str(e)}")
        return None 
    
    return answer 



In [70]:
eng_model = LiteLLMModel(
    model_id="ollama/deepseek-r1:32b", 
    api_base="http://localhost:11434/api/generate", 
    num_ctx=8192
)

date_agent = CodeAgent(
    tools=[get_current_date],#get_current_date,],
    model=eng_model,
    max_steps=10,
    name="date_agent",
    description="It is an agent who can get the current date.",
    additional_authorized_imports=["numpy", "datetime"],
)

RAG_agent = CodeAgent(
    tools=[get_answer_from_rag,],
    model=eng_model,
    max_steps=5,
    name="rag_agent",
    description="It is an agent who answer to any questions given the documents about the Parliament, the House of Commons passed in the RAG.",
)

In [71]:
date_agent.prompt_templates['managed_agent'] = {'task': "You're a helpful agent named '{{name}}' and your main goal is to get the current date.\nYou have been submitted this task by your manager.\n---\nTask:\n{{task}}\n---\nFirst thing you have to do is to check if one of the tools that you have can solve partially or totally the task. Provide short answer.\n\nYour final_answer WILL HAVE to contain these parts:\n### 1. Task outcome (short version):\n### 2. Additional context (if relevant):\n\nPut all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.\nAnd even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.",
 'report': "Here is the final answer from your managed agent '{{name}}':\n{{final_answer}}"}

RAG_agent.prompt_templates['managed_agent']['task'] = "You're a helpful agent named '{{name}}' and your main goal is to answer to a question using a RAG. The first thing you have to do is to use the tool named 'get_current_date', its purpose id to retrieve the information from the RAG. If you use an another method, you have to specify it. Don't use internet to get an answer, you have to use the RAG and nothing else.\nYou have been submitted this task by your manager.\n---\nTask:\n{{task}}\n---\nYou're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.\n\nYour final_answer WILL HAVE to contain these parts:\n### 1. Task outcome (short version):\n### 2. Task outcome (extremely detailed version):\n### 3. Additional context (if relevant):\n\nPut all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.\nAnd even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback."


In [72]:
manager_agent = CodeAgent(
    tools=[],
    model=eng_model,
    managed_agents=[date_agent,RAG_agent],
    additional_authorized_imports=["numpy", "pandas","datetime"], # these imports are useful for handling computations and table based data
)

In [73]:
qualitative_dataset.at[0, 'user_input'] = "what was the percentage of female members evolved across the House of Commons since 28 years ago ?"

In [74]:
agent_output = manager_agent.run(qualitative_dataset.at[0, 'user_input'])
print("Final output:")
print(agent_output)

1
2
query is: What is the current percentage of female members in the UK House of Commons?
context is: [Document(metadata={'source': 'VALDOM_LLM_ENGINEERING/data_RAG/processed_table/Female MPs_Perc.txt', '_id': 'f99a79987f8c4d1dbd44c489d05c3c9b', '_collection_name': 'the_miser_sparse_v0'}, page_content='row info : 2005 | LAB | Percentage: 0.276056338028169 | CON | Percentage: 0.0858585858585859 | LD | Percentage: 0.161290322580645 | SNP | Percentage: 0.0 | Other | Percentage: 0.12 | Total | Percentage: 0.196923076923077 | \nrow info : 2010 | LAB | Percentage: 0.313953488372093 | CON | Percentage: 0.160130718954248 | LD | Percentage: 0.12280701754386 | SNP | Percentage: 0.166666666666667 | Other | Percentage: 0.217391304347826 | Total | Percentage: 0.22 |'), Document(metadata={'source': 'VALDOM_LLM_ENGINEERING/data_RAG/processed_table/Female MPs_Perc.txt', '_id': '7e3fc63107d7456abc68b44df90ea418', '_collection_name': 'the_miser_sparse_v0'}, page_content='row info : 1979 | LAB | Percent

1
2
query is: What was the percentage of female members in the UK House of Commons in 1997?
context is: [Document(metadata={'source': 'VALDOM_LLM_ENGINEERING/data_RAG/processed_table/Female MPs_Perc.txt', '_id': '2af42a1c3c3045ff94976dee14e46a0f', '_collection_name': 'the_miser_sparse_v0'}, page_content='row info : 1997 | LAB | Percentage: 0.241626794258373 | CON | Percentage: 0.0787878787878788 | LD | Percentage: 0.0652173913043478 | SNP | Percentage: 0.333333333333333 | Other | Percentage: 0.0416666666666667 | Total | Percentage: 0.184615384615385 | \nrow info : 2001 | LAB | Percentage: 0.230582524271845 | CON | Percentage: 0.0843373493975904 | LD | Percentage: 0.0961538461538462 | SNP | Percentage: 0.2 | Other | Percentage: 0.125 | Total | Percentage: 0.181538461538462 |'), Document(metadata={'source': 'VALDOM_LLM_ENGINEERING/data_RAG/processed_table/Age of newly elected MPs.txt', '_id': 'f0ac2b0c5ffe45f0a939a816a38e22b2', '_collection_name': 'the_miser_sparse_v0'}, page_content='ro

1
2
query is: What is the current percentage of female members in the UK House of Commons?
context is: [Document(metadata={'source': 'VALDOM_LLM_ENGINEERING/data_RAG/processed_table/Female MPs_Perc.txt', '_id': 'f99a79987f8c4d1dbd44c489d05c3c9b', '_collection_name': 'the_miser_sparse_v0'}, page_content='row info : 2005 | LAB | Percentage: 0.276056338028169 | CON | Percentage: 0.0858585858585859 | LD | Percentage: 0.161290322580645 | SNP | Percentage: 0.0 | Other | Percentage: 0.12 | Total | Percentage: 0.196923076923077 | \nrow info : 2010 | LAB | Percentage: 0.313953488372093 | CON | Percentage: 0.160130718954248 | LD | Percentage: 0.12280701754386 | SNP | Percentage: 0.166666666666667 | Other | Percentage: 0.217391304347826 | Total | Percentage: 0.22 |'), Document(metadata={'source': 'VALDOM_LLM_ENGINEERING/data_RAG/processed_table/Female MPs_Perc.txt', '_id': '7e3fc63107d7456abc68b44df90ea418', '_collection_name': 'the_miser_sparse_v0'}, page_content='row info : 1979 | LAB | Percent


KeyboardInterrupt



In [None]:
date_agent(task="get_current_year")