# Needed Packages :

In [None]:
# %pip install uv
# %uv pip install --upgrade langchain langchain-community langchain-chroma
# %uv pip install -qU langchain-groq
# %uv pip install langchain_openai
# %uv pip install --upgrade langchain_huggingface
# %uv pip install --upgrade unstructured openpyxl
# %uv pip install nltk
# %uv pip install --upgrade --quiet langchain sentence_transformers
# %uv pip install xlrd
# %uv pip install xformers
# %uv pip install pdf2image

* NOTE:
    The `embedding model (Jina Embeddings V3)` and the `LLM (Llama 3.2 90b)` are used through API services offered by Jina AI and Groq, respectively.
    However, both are open-source and can be downloaded and used locally.

---
---
---

# Imports:

In [1]:
import glob
import os
from collections.abc import Iterator

import nltk
from dotenv import load_dotenv
from langchain.embeddings import CacheBackedEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.storage import LocalFileStore
from langchain_chroma import Chroma
from langchain_community.cache import SQLiteCache
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import JinaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.globals import set_llm_cache
from langchain_core.messages import trim_messages, AIMessage, HumanMessage
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pdf2image import convert_from_path

## Imports for the PDF to text conversion using surya-ocr
from PIL import Image
# from surya.ocr import run_ocr
# from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
# from surya.model.recognition.model import load_model as load_rec_model
# from surya.model.recognition.processor import load_processor as load_rec_processor

In [2]:
# ! pip install uv

# # Core LangChain
# ! uv pip install langchain

# # For Ollama integration (embeddings, models)
# ! uv pip install langchain-ollama

# # For Chroma vector database
# ! uv pip install langchain-chroma chromadb

# # For community loaders (Arxiv, PDF, text, etc.)
# ! uv pip install langchain-community

# # For text splitting utilities
# ! uv pip install langchain-text-splitters

# # Core types, prompts, and documents (usually installed with langchain-core)
# ! uv pip install langchain-core

# # Typing extensions (for TypedDict, List, etc.)
# ! uv pip install typing-extensions

# # LangGraph (for StateGraph)
# ! uv pip install langgraph


In [None]:
from langchain.chat_models import init_chat_model
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import ArxivLoader, TextLoader
from langchain_community.document_loaders.pdf import BasePDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from typing_extensions import TypedDict, List
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langgraph.graph import StateGraph, START

In [None]:
# Optional local chat model via Ollama (currently disabled).
# llm = init_chat_model(model="gemma3:1b", model_provider="ollama")
# Module-level Ollama embedding client for the "nomic-embed-text:latest" model.
# NOTE(review): store_embeddings() builds its own OllamaEmbeddings instance, so
# this global does not appear to be used by the functions visible in this notebook.
embedding = OllamaEmbeddings(model="nomic-embed-text:latest")

In [None]:
# Register a SQLite-backed LLM cache stored in the local file ".langchain.db"
set_llm_cache(SQLiteCache(database_path=".langchain.db"))


# Load environment variables from a .env file; override=True is passed so that
# values from the file take precedence over variables already set in the process.
load_dotenv(override=True)

# API keys. Indexing os.environ raises KeyError when the variable is missing,
# so this cell fails immediately if GROQ_API_KEY is not configured.
GROQ_API_KEY = os.environ['GROQ_API_KEY']
# JINA_API_KEY = os.environ['JINA_API_KEY']

# Needed downloads for nltk (only needs to be done once)
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Data Indexing:

## 1- Data Loading

### For Excel 

In [6]:
def load_excel_files(file_paths: "str | list") -> list:
    """
    Load one or more Excel files and return a list of Langchain Documents.

    The loader is created per file, so both a single path (which is how
    call_RAG invokes this function) and a list of paths are supported.

    Parameters:
        file_paths (str | os.PathLike | list): A single Excel file path, or a
            list/tuple of Excel file paths.

    Returns:
        list: The Langchain Documents loaded from every file, in input order.
            An empty list is returned when no paths are given.
    """
    # Wrap a lone path so that both call styles share the same loop below.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    docs = []
    for path in file_paths:
        loader = UnstructuredExcelLoader(path, mode='elements')
        docs.extend(loader.load())
    return docs

### For txt

In [7]:
def load_text_files(file_path:str):
    """
    Read a text file (decoded as UTF-8) through Langchain's TextLoader.

    Parameters:
        file_path (str): Location of the text file on disk

    Returns:
        Whatever TextLoader.load() produces for the file; callers in this
        notebook iterate over it as a collection of Langchain Documents.
    """
    text_loader = TextLoader(encoding='utf-8', file_path=file_path)
    documents = text_loader.load()
    return documents

## 2-Data Splitting

In [8]:
# split the doc into smaller chunks i.e. chunk_size=512
def split_documents(docs: list, chunk_size=512, chunk_overlap=128) -> list:
    """
    Splits the provided documents into smaller chunks with specified size and overlap.

    After splitting, every metadata value that is a list is flattened into a single
    comma-separated string (presumably because the vector store only accepts scalar
    metadata values -- TODO confirm).

    Parameters:
        docs (list): List of documents to be split.
        chunk_size (int, optional): The number of characters in each chunk (default is 512).
        chunk_overlap (int, optional): The number of overlapping characters between chunks (default is 128).

    Returns:
        list: A list of split document chunks with corrected metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(docs)

    for chunk in chunks:
        # Only existing keys are reassigned, so the dict size does not change
        # while it is being iterated.
        for key, value in chunk.metadata.items():
            if isinstance(value, list):
                # str() every item first: ','.join() raises TypeError as soon as
                # the list holds anything that is not a string (numbers, dicts, ...).
                chunk.metadata[key] = ','.join(str(item) for item in value)
    return chunks

## 3-Data Embedding and data storing

In [None]:
def store_embeddings(chunks: list, CHROMA_PATH="vec_db"):
    """
    Embed the given chunks and load them into a Chroma database.

    Embeddings come from the Ollama "nomic-embed-text:latest" model, wrapped in a
    CacheBackedEmbeddings layer that uses a local file store under "./emb_cache/"
    (with query_embedding_cache switched on).

    Parameters:
        chunks (list of Document): documents to embed and store
        CHROMA_PATH (str, optional): persist directory handed to Chroma (default is "vec_db")

    Returns:
        Chroma: the database object returned by Chroma.from_documents
    """
    cache_dir = LocalFileStore("./emb_cache/")
    ollama_embedder = OllamaEmbeddings(model="nomic-embed-text:latest")
    embedder_with_cache = CacheBackedEmbeddings.from_bytes_store(
        ollama_embedder,
        cache_dir,
        query_embedding_cache=True,
    )

    # Vectorise the chunks and hand back the resulting store in one step
    return Chroma.from_documents(
        documents=chunks,
        embedding=embedder_with_cache,
        persist_directory=CHROMA_PATH,
    )

# Data Retrieval and Generation

## 1-Retrieval

In [None]:
def retrieve_documents(db_chroma, chunks, query: str, k=50) -> list:
    """
    Retrieve context for a query with a hybrid (BM25 + Chroma) ensemble retriever.

    A BM25 retriever is built from `chunks` on every call and combined with the
    Chroma retriever using weights 0.6 (BM25) and 0.4 (Chroma). Both underlying
    retrievers are configured with the same `k`.

    Parameters:
        db_chroma (Chroma): database of embeddings
        chunks (list): List of document chunks used to build the BM25 index.
        query (str): user query as text
        k (int, optional): number of documents each underlying retriever is asked for (default is 50)

    Returns:
        list: the documents returned by the ensemble retriever. No relevance
            scores are included, because the ensemble retriever does not
            return them.
    """
    # Keyword-based retriever over the raw chunks
    bm25_retriever = BM25Retriever.from_documents(chunks)
    bm25_retriever.k = k

    # Vector-similarity retriever backed by the Chroma database
    chroma_retriever = db_chroma.as_retriever(search_kwargs={"k": k})

    # Weighted combination of both retrievers
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, chroma_retriever], weights=[0.6, 0.4]
    )

    return ensemble_retriever.invoke(query)

## 2-Generation

In [None]:
def generate_answer(docs_chroma: list, query: str, past_messages: str) -> Iterator[str]:
    """
    Stream an answer to the user query based on the retrieved context information.

    This is a generator function: calling it only creates the generator. The
    prompt is built and the LLM is contacted once the caller starts iterating.

    Parameters:
        docs_chroma (list of Document): retrieved context information
        query (str): user query
        past_messages (str): past questions and answers as text; an empty string
            (or None) means there is no history to include

    Yields:
        str: the content of each chunk streamed back by the model
    """
    # Each document contributes its text followed by the file it came from.
    # str() guards against a non-string `source` (e.g. a Path object).
    context_text = "\n\n".join(
        doc.page_content + "\nFile: " + str(doc.metadata.get('source', 'Unknown'))
        for doc in docs_chroma
    )

    # Truthiness check so that None is treated like the empty string
    if past_messages:
        context_text = "Past questions and answers:\n\n" + past_messages + "\n\nNew Context for question:\n\n" + context_text

    # The backslashes are doubled so the prompt contains the literal text \[...\]
    # without relying on invalid escape sequences (a SyntaxWarning on recent Pythons).
    PROMPT_TEMPLATE = """
    You are a helpful data analyst for a company.
    Your goal is to provide correct and accurate answers based on the context provided.
    Provide direct concise answers.
    Don't make up information.
    Mention The source that you based your answer on.
    Make sure your answer is in correct Markdown format.
    If you are going to include LaTeX equations in you answers, **use $$..$$ instead of \\[...\\]** and make sure it is correct LaTeX.
    
    Context: {context}
    Question: {question}
    
    Answer:
    """

    # load retrieved context and user query in the prompt template
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)

    # call LLM model to generate the answer based on the given context and query
    model = ChatGroq(model="compound-beta", api_key=GROQ_API_KEY)

    # Relay the streamed response chunk by chunk
    for chunk in model.stream(prompt):
        yield chunk.content

In [12]:
def extract_text_from_pdf(pdf_path: str, langs=None) -> str:
    """
    Extracts text from a PDF using Surya OCR and returns it as a single string.

    NOTE(review): this function calls run_ocr and the load_* model/processor
    helpers, whose Surya imports are commented out in this notebook's import
    cell; they have to be enabled before this function can run.

    Parameters:
        pdf_path (str): Path to the PDF file.
        langs (list, optional): List of language codes for OCR (e.g., ["en"] for English).
            Defaults to ["en"] when omitted or None.

    Returns:
        str: Extracted text from the PDF, one OCR text line per line.
    """
    # None stands in for the default so no list object is shared between calls
    if langs is None:
        langs = ["en"]

    # Load Surya models and processors
    det_processor, det_model = load_det_processor(), load_det_model()
    rec_model, rec_processor = load_rec_model(), load_rec_processor()

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    # Collect the pieces in a list and join once at the end instead of
    # growing a string with += inside the loops.
    text_lines = []
    for image in images:
        # OCR is run one page image at a time
        predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)

        for page in predictions:
            text_lines.extend(line.text + "\n" for line in page.text_lines)

    return "".join(text_lines)

# Function to process a single PDF and save its extracted text

In [13]:
def save_extracted_text_from_pdfs(pdf_path:str, output_dir="temp_results", langs=None) -> str:
    """
    Processes the PDF file at the given path and saves the extracted text to a text file.
    If the text file already exists, it skips processing that PDF.

    The output path mirrors the PDF path with its first directory component
    removed (e.g. "Data/reports/a.pdf" -> "<output_dir>/reports/a.txt"). A path
    that has no directory component keeps its file name ("a.pdf" ->
    "<output_dir>/a.txt") instead of collapsing to "<output_dir>/.txt".

    Parameters:
        pdf_path (str): Path to the PDF file.
        output_dir (str): Directory to save the extracted text files.
        langs (list, optional): List of language codes for OCR (e.g., ["en"] for English).
            Defaults to ["en"] when omitted or None.

    Returns:
        str: Path to the saved (or already existing) text file
    """
    # None stands in for the default so no list object is shared between calls
    if langs is None:
        langs = ["en"]

    # Define the output text file path
    stem = os.path.splitext(pdf_path)[0]
    if os.sep != "/":
        # Normalise native separators (e.g. backslashes on Windows) so the
        # split below sees every path component.
        stem = stem.replace(os.sep, "/")
    parts = stem.split("/")
    # Drop the leading directory only when there is one to drop; otherwise
    # every bare file name would map to the same "<output_dir>/.txt".
    relative_parts = parts[1:] if len(parts) > 1 else parts
    text_file_path = os.path.join(output_dir, "/".join(relative_parts) + ".txt")
    print(text_file_path)

    # Reuse a previously extracted text file instead of running OCR again
    if os.path.exists(text_file_path):
        print(f"Skipping {pdf_path} as output file already exists.")
        return text_file_path

    print(f"Processing: {pdf_path}")

    # Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path, langs=langs)

    # Create output directory if it does not exist
    os.makedirs(os.path.dirname(text_file_path), exist_ok=True)

    # Save the extracted text
    with open(text_file_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    print(f"Saved extracted text to: {text_file_path}")
    return text_file_path

In [None]:
def call_RAG(query:str, past_messages=None, chat_name=False) -> list:
    """
    Processes a query and past messages, retrieves relevant documents, and generates an answer.

    Pipeline: safety check of the query -> load every file under "Data/" ->
    split into chunks -> embed and store -> hybrid retrieval -> streamed answer.

    Parameters:
        query (str): The user query to process and answer.
        past_messages (list, optional): A list of past conversation messages, each being a
            dictionary with keys "human" and/or "ai". Defaults to no history.
        chat_name (bool): Flag indicating whether to generate a chat name based on the query.

    Returns:
        list: `[status, payload]` or `[status, payload, name]`.
            - status 0: payload is the generator returned by generate_answer();
              iterate over it to receive the answer chunks. When chat_name is
              True and name generation succeeds, a third element holds the name.
            - status 1: payload is an error message string (unsafe query, failed
              safety check, unsupported file, or nothing to index).
    """
    # None replaces a mutable `[]` default that would be shared between calls
    if past_messages is None:
        past_messages = []

    # --- 1. Security guard check ---
    # Runs first so that nothing else happens for a query flagged as unsafe.
    try:
        print("Checking query safety...")
        llm_guard = ChatGroq(model="meta-llama/llama-prompt-guard-2-86m", api_key=GROQ_API_KEY)
        guard_response = llm_guard.invoke(query)
        score = float(guard_response.content)

        # If the score is > 0.5, the query is considered unsafe.
        if score > 0.5:
            print(f"Query flagged as unsafe with score: {score}. Halting process.")
            return [1, "The query is considered unsafe and will not be processed."]

        print("Query is safe, proceeding with RAG pipeline.")
    except Exception as e:
        # Fail closed: a guard failure or a non-numeric reply blocks the query
        print(f"An error occurred during the safety check: {e}")
        return [1, "Could not verify the safety of the query."]

    # --- 2. RAG pipeline (only reached when the query passed the safety check) ---
    print(f"Processing query: {query}")

    # Get all files in Data and all nested subfolders
    files = [f for f in glob.glob("Data/**/*", recursive=True) if os.path.isfile(f)]

    # Convert the chat history into message objects, trim it to the token
    # budget, and flatten it to plain text for the prompt.
    history_text = ""
    if past_messages:
        messages = []
        for message in past_messages:
            for role, text in message.items():
                if role == "human":
                    messages.append(HumanMessage(text))
                elif role == 'ai':
                    messages.append(AIMessage(text))

        trimmed = trim_messages(
            messages,
            strategy="last",
            token_counter=ChatGroq(model="llama-3.3-70b-versatile", api_key=GROQ_API_KEY),
            max_tokens=5196,
            start_on="human",
            end_on=("human", "tool"),
            include_system=True,
            allow_partial=True,
        )
        history_text = "\n".join(t.content for t in trimmed)

    # Load files. Extensions are matched including the dot so that a name
    # that merely ends in the letters "txt"/"xls"/"pdf" is not misclassified.
    docs = []
    for file in files:
        lowered = file.lower()
        if lowered.endswith((".xlsx", ".xls")):
            docs_loaded = load_excel_files(file)
        elif lowered.endswith(".txt"):
            docs_loaded = load_text_files(file)
        elif lowered.endswith(".pdf"):
            # PDFs are OCR'd to a text file first, then loaded like any text file
            docs_loaded = load_text_files(save_extracted_text_from_pdfs(file))
        else:
            print(f"Unsupported file type: {file}. Supported types are: .xlsx, .xls, .txt, .pdf")
            return [1, "Unsupported file"]

        # Tag every document with the name of the file it originated from
        for doc in docs_loaded:
            doc.metadata['filename'] = os.path.basename(file)
        docs.extend(docs_loaded)

    # Split and store documents
    chunks = split_documents(docs, chunk_size=2048, chunk_overlap=512)
    if not chunks:
        # Nothing to index: report it through the status code instead of
        # letting the vector store / BM25 index fail on empty input.
        print("No documents were loaded from the Data folder.")
        return [1, "No documents available to answer the query."]
    db_chroma = store_embeddings(chunks)

    # Use the ensemble retriever for hybrid search
    docs_retrieved = retrieve_documents(db_chroma, chunks, query, k=20)
    response_generator = generate_answer(docs_retrieved, query, history_text)

    if chat_name:
        try:
            print("Generating chat name...")
            llm = ChatGroq(model="llama-3.1-8b-instant", api_key=GROQ_API_KEY)
            name_prompt = f"Give me a sentence as a name for this chat if the first question is \"{query}\". Return only the name and nothing else. Limit the name to 15 characters max. Make it readable and understandable."
            answer = llm.invoke(name_prompt)
            name_content = answer.content.strip()
            # Three elements on success with a chat name
            return [0, response_generator, name_content]
        except Exception as e:
            # Best effort: the answer is still returned when naming fails
            print(f"Could not generate chat name: {e}. Proceeding without it.")
            return [0, response_generator]

    # Two elements on success without a chat name
    return [0, response_generator]

# Testing the RAG pipeline

In [None]:
# lst = call_RAG("how much was the profit for ELM in 2022 and 2023? add them together and return the total profit in a single number")
# print(lst)

[0, '### ELM Company Profit Calculation\nTo calculate the total profit for ELM in 2022 and 2023, we need to find the relevant information in the provided context.\n\n#### Profit for 2023\nThe profit for the nine months period ended September 30, 2023, is mentioned in the file `Q3.txt`:\n- NET PROFIT for the nine months period ended September 30, 2023: 1,029,419,971 Saudi Riyals\n\n#### Profit for 2024 and other years\nThere is no information available about the profit for the full year 2022 or 2023. However, the profit for the nine months period ended September 30, 2022, is not directly available, but we have information for 2024:\n- NET PROFIT for the three months period ended September 30, 2024: 498,241,621 Saudi Riyals\n- NET PROFIT for the nine months period ended September 30, 2024: 1,329,249,673 Saudi Riyals\n\n#### Profit for 2022\nWe do not have direct information about the profit for the full year 2022.\n\n#### Total Profit Calculation\nSince we do not have the profit informat

In [None]:
# --- Notebook Test Cell ---

# Step 1: run the pipeline end to end. A chat name is requested as well so
# that the three-element return path gets exercised.
print("--- Calling RAG Pipeline ---")
result_list = call_RAG(
    "how much was the profit for ELM in 2022 and 2023? add them together and return the total profit in a single number",
    chat_name=True,
)
print("--- RAG Pipeline Call Finished ---\n")

# Step 2: the first element of the returned list is the status flag.
status_flag = result_list[0]

# Step 3: branch on the flag; anything other than 0 is reported as an error.
if status_flag != 0:
    print("--- Error Occurred ---")

    # In the error case, index 1 holds the error message string.
    error_message = result_list[1]
    print(f"Error Message: {error_message}")
    print("-"*22)

else:
    print("--- Streaming Response (Status: Success) ---")

    # In the success case, index 1 holds the generator; iterating over it
    # yields the answer piece by piece. Printing with end="" and flush=True
    # shows the pieces as one continuous stream in the cell output.
    response_generator = result_list[1]
    for chunk in response_generator:
        print(chunk, end="", flush=True)

    # Finish the streamed text with a newline and a separator line.
    print("\n" + "-"*41)

    # A third element, when present, is the generated chat name.
    if len(result_list) > 2:
        chat_name = result_list[2]
        print(f"Chat Name Generated: {chat_name}")
        print("-"*41)

### ELM Company Profit Calculation
To calculate the total profit for ELM in 2022 and 2023, we need to find the relevant information in the provided context.

#### Profit for 2023
The profit for the nine months period ended September 30, 2023, is mentioned in the file `Q3.txt`:
- NET PROFIT for the nine months period ended September 30, 2023: 1,029,419,971 Saudi Riyals

#### Profit for 2024 and other years
There is no information available about the profit for the full year 2022 or 2023. However, the profit for the nine months period ended September 30, 2022, is not directly available, but we have information for 2024:
- NET PROFIT for the three months period ended September 30, 2024: 498,241,621 Saudi Riyals
- NET PROFIT for the nine months period ended September 30, 2024: 1,329,249,673 Saudi Riyals

#### Profit for 2022
We do not have direct information about the profit for the full year 2022.

#### Total Profit Calculation
Since we do not have the profit information for the full year