In [62]:
import os
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
from langchain.docstore.document import Document
from agentic_chunker import AgenticChunker
from cerebras.cloud.sdk import Cerebras
from langsmith import Client
import json
# from agentic_chunker import AgenticChunker
from langchain.docstore.document import Document
from dotenv import load_dotenv
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import torch

# Load environment variables (python-dotenv)
load_dotenv()

# Pick the torch device: CUDA when torch reports a usable GPU, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [63]:
# Set up the Cerebras client; stop immediately when no API key is configured
cerebras_api_key = os.environ.get("CEREBRAS_API_KEY")
if cerebras_api_key is None or cerebras_api_key == "":
    raise ValueError("CEREBRAS_API_KEY not found in environment variables")

client = Cerebras(api_key=cerebras_api_key)
model = "llama-4-scout-17b-16e-instruct"

def cerebras_invoke(prompt: str) -> str:
    """Send `prompt` as a single user message to the Cerebras chat API.

    Uses the module-level `client` and `model`.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or "" when anything in the call or the response handling
        raises (the error is printed, not re-raised).
    """
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(
            messages=chat_messages,
            model=model,
        )
        reply_text = completion.choices[0].message.content
        return reply_text.strip()
    except Exception as exc:
        # Best-effort: callers receive an empty string instead of an exception.
        print(f"[Error] Cerebras API invocation failed: {exc}")
        return ""

In [64]:
# Fetch the "wfh/proposal-indexing" prompt through the LangSmith client
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
obj = Client(
    api_key=LANGSMITH_API_KEY,
).pull_prompt(
    "wfh/proposal-indexing",
    include_model=True,
)

In [65]:
class Sentences(BaseModel):
    """Container model holding a list of sentence strings.

    Used by `get_propositions` to wrap the propositions extracted from one text.
    """
    # One entry per extracted proposition / sentence.
    sentences: List[str]

def get_propositions(text, prompt):
    """Ask the LLM to break `text` into propositions.

    Args:
        text: Source text to decompose.
        prompt: Object exposing ``format(input=...)`` that renders the
            proposition-extraction prompt as a string.

    Returns:
        A tuple ``(result, response)`` where ``result`` is
        ``{"proposition": [Sentences(...)]}`` and ``response`` is the raw
        LLM reply (``""`` when the call failed).
    """
    formatted_prompt = prompt.format(input=text) + "\n\nOnly provide the list of propositions as output. Do not include any explanations, formatting, or additional text."
    response = cerebras_invoke(formatted_prompt)

    # The model may answer with a JSON array of strings; splitting that on
    # newlines would produce fragments such as '[' or '"...",'. Prefer real
    # JSON parsing when the whole reply is a JSON list of strings.
    try:
        parsed = json.loads(response)
    except (TypeError, ValueError):
        parsed = None

    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        candidates = parsed
    else:
        # Fallback: one proposition per line (splitlines also handles \r\n).
        candidates = response.splitlines()

    # Drop blank entries so an empty/failed reply yields [] rather than [""].
    propositions = [item.strip() for item in candidates if item.strip()]
    return {"proposition": [Sentences(sentences=propositions)]}, response


In [66]:
# Load JSON data.
# The encoding is set explicitly: without it open() falls back to the platform
# locale encoding, which can fail or mis-decode non-ASCII text in UTF-8 JSON.
with open("islamic_etiquette_knowledge_base.json", "r", encoding="utf-8") as f1, \
        open("Quran_app_Documentation.json", "r", encoding="utf-8") as f2:
    etiquette_data = json.load(f1)  # NOTE(review): loaded but not used in the cells visible here
    quran_app_data = json.load(f2)

# Use only Quran app data as per the query (first three entries)
combined_documents = quran_app_data[:3]

# One entry per source document: its metadata plus its non-blank propositions
proposition_arrays = []

for json_obj in combined_documents:
    propositions, response = get_propositions(json_obj['text'], obj)
    extracted = propositions['proposition'][0].sentences
    proposition_arrays.append({
        'metadata': {
            'url': json_obj['url'],
            'title': json_obj['title'],
        },
        # keep only entries that contain something other than whitespace
        'propositions': [prop for prop in extracted if prop.strip()],
    })

# Flat view: every proposition paired with the metadata of its document
flat_propositions_with_metadata = [
    {'proposition': prop, 'metadata': doc['metadata']}
    for doc in proposition_arrays
    for prop in doc['propositions']
]

# Plain list of proposition strings, in document order
propositions_list = [
    entry['proposition'] for entry in flat_propositions_with_metadata
]

In [67]:
# Sanity check: total number of propositions collected across the processed documents
print(len(propositions_list))

In [68]:
import os
import json
import uuid
import numpy as np
import torch
import faiss
from dotenv import load_dotenv
from typing import Optional
from rich import print
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from cerebras.cloud.sdk import Cerebras

# Same call as in the first cell; repeated here next to this cell's own imports.
load_dotenv()
class AgenticChunker:
    """Group propositions into topical chunks, using an LLM as the judge.

    Each chunk is a dict stored in ``self.chunks`` under its id, with keys
    ``chunk_id``, ``propositions``, ``title``, ``summary`` and ``chunk_index``.
    For every incoming proposition the LLM is asked whether it belongs to an
    existing chunk; if not, a new chunk is created.

    Attributes:
        chunks: Mapping of chunk id -> chunk dict.
        id_truncate_limit: Number of leading UUID characters used as chunk id.
        generate_new_metadata_ind: When True, summary and title are refreshed
            each time a proposition is added to an existing chunk.
        print_logging: When True, progress and debug messages are printed.
    """

    def __init__(self, cerebras_api_key: Optional[str] = None):
        """Create the chunker and its Cerebras client.

        Args:
            cerebras_api_key: API key; when None, ``CEREBRAS_API_KEY`` is read
                from the environment.

        Raises:
            ValueError: If no API key is passed and none is in the environment.
        """
        self.chunks = {}
        self.id_truncate_limit = 5
        self.generate_new_metadata_ind = True
        self.print_logging = True

        if cerebras_api_key is None:
            cerebras_api_key = os.getenv("CEREBRAS_API_KEY")
        if cerebras_api_key is None:
            raise ValueError("CEREBRAS_API_KEY not provided or found in environment variables")

        self.client = Cerebras(api_key=cerebras_api_key)
        self.model = "llama-4-scout-17b-16e-instruct"

    def _llm_invoke(self, prompt: str) -> str:
        """Send ``prompt`` as one user message; return the stripped reply or "" on any error."""
        try:
            response = self.client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.model,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Best-effort: callers treat "" as "no usable answer".
            print(f"[Error] LLM invocation failed: {e}")
            return ""

    def add_propositions(self, propositions: list):
        """Add each proposition in order via ``add_proposition``."""
        for proposition in propositions:
            self.add_proposition(proposition)

    def add_proposition(self, proposition: str):
        """Place ``proposition`` into a matching chunk, or start a new chunk."""
        if self.print_logging:
            print(f"\nAdding: '{proposition}'")
        if len(self.chunks) == 0:
            if self.print_logging:
                print("No chunks, creating a new one")
            self._create_new_chunk(proposition)
            return

        chunk_id = self._find_relevant_chunk(proposition)
        if chunk_id:
            if self.print_logging:
                print(f"Chunk Found ({self.chunks[chunk_id]['chunk_id']}), adding to: {self.chunks[chunk_id]['title']}")
            self.add_proposition_to_chunk(chunk_id, proposition)
        else:
            if self.print_logging:
                print("No chunks found")
            self._create_new_chunk(proposition)

    def add_proposition_to_chunk(self, chunk_id: str, proposition: str):
        """Append ``proposition`` to chunk ``chunk_id`` and optionally refresh its metadata.

        Raises:
            KeyError: If ``chunk_id`` is not a known chunk.
        """
        chunk = self.chunks[chunk_id]
        chunk['propositions'].append(proposition)
        if self.generate_new_metadata_ind:
            # _llm_invoke returns "" on failure; keep the previous value in
            # that case instead of wiping the chunk's summary/title.
            new_summary = self._update_chunk_summary(chunk)
            if new_summary:
                chunk['summary'] = new_summary
            new_title = self._update_chunk_title(chunk)
            if new_title:
                chunk['title'] = new_title

    def _update_chunk_summary(self, chunk: dict) -> str:
        """Ask the LLM for a refreshed one-sentence summary of ``chunk``."""
        prompt = (
            "You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.\n"
            "A new proposition was just added to one of your chunks. Generate a very brief 1-sentence summary which will inform viewers what a chunk group is about.\n"
            "A good summary will say what the chunk is about, and give any clarifying instructions on what to add to the chunk.\n"
            "Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food. Or month, generalize it to 'date and times'.\n"
            "Example:\n"
            "Input: Proposition: Greg likes to eat pizza\n"
            "Output: This chunk contains information about the types of food Greg likes to eat.\n"
            "Only respond with the chunk new summary, nothing else.\n"
            f"Chunk's propositions:\n" + "\n".join(chunk['propositions']) +
            f"\n\nCurrent chunk summary:\n{chunk['summary']}"
        )
        return self._llm_invoke(prompt)

    def _update_chunk_title(self, chunk: dict) -> str:
        """Ask the LLM for a refreshed short title for ``chunk``."""
        prompt = (
            "You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.\n"
            "A new proposition was just added to one of your chunks, you should generate a very brief updated chunk title which will inform viewers what a chunk group is about.\n"
            "A good title will say what the chunk is about.\n"
            "You will be given a group of propositions which are in the chunk, chunk summary and the chunk title.\n"
            "Your title should anticipate generalization. If you get a proposition about apples, generalize it to food. Or month, generalize it to \"date and times\".\n"
            "Example:\n"
            "Input: Summary: This chunk is about dates and times that the author talks about\n"
            "Output: Date & Times\n"
            "Only respond with the new chunk title, nothing else.\n"
            f"Chunk's propositions:\n" + "\n".join(chunk['propositions']) +
            f"\n\nChunk summary:\n{chunk['summary']}\n\nCurrent chunk title:\n{chunk['title']}"
        )
        return self._llm_invoke(prompt)

    def _get_new_chunk_summary(self, proposition: str) -> str:
        """Ask the LLM for the summary of a new chunk seeded by ``proposition``."""
        prompt = (
            "You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.\n"
            "You should generate a very brief 1-sentence summary which will inform viewers what a chunk group is about.\n"
            "A good summary will say what the chunk is about, and give any clarifying instructions on what to add to the chunk.\n"
            "You will be given a proposition which will go into a new chunk. This new chunk needs a summary.\n"
            "Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food. Or month, generalize it to \"date and times\".\n"
            "Example:\n"
            "Input: Proposition: Greg likes to eat pizza\n"
            "Output: This chunk contains information about the types of food Greg likes to eat.\n"
            "Only respond with the new chunk summary, nothing else.\n"
            f"Determine the summary of the new chunk that this proposition will go into:\n{proposition}"
        )
        return self._llm_invoke(prompt)

    def _get_new_chunk_title(self, summary: str) -> str:
        """Ask the LLM for a short title matching ``summary``."""
        prompt = (
            "You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.\n"
            "You should generate a very brief few word chunk title which will inform viewers what a chunk group is about.\n"
            "A good chunk title is brief but encompasses what the chunk is about.\n"
            "You will be given a summary of a chunk which needs a title.\n"
            "Your titles should anticipate generalization. If you get a proposition about apples, generalize it to food. Or month, generalize it to \"date and times\".\n"
            "Example:\n"
            "Input: Summary: This chunk is about dates and times that the author talks about\n"
            "Output: Date & Times\n"
            "Only respond with the new chunk title, nothing else.\n"
            f"Determine the title of the chunk that this summary belongs to:\n{summary}"
        )
        return self._llm_invoke(prompt)

    def _new_chunk_id(self) -> str:
        """Return a chunk id that is not already a key of ``self.chunks``."""
        # A truncated UUID can collide with an existing id, which would
        # silently overwrite that chunk; retry a few times before giving up.
        for _ in range(10):
            candidate = str(uuid.uuid4())[:self.id_truncate_limit]
            if candidate and candidate not in self.chunks:
                return candidate
        # Fall back to the untruncated UUID rather than looping forever
        # (e.g. when id_truncate_limit is 0).
        return str(uuid.uuid4())

    def _create_new_chunk(self, proposition: str):
        """Create a chunk containing only ``proposition``, with LLM-made summary and title."""
        new_chunk_id = self._new_chunk_id()
        new_chunk_summary = self._get_new_chunk_summary(proposition)
        new_chunk_title = self._get_new_chunk_title(new_chunk_summary)
        self.chunks[new_chunk_id] = {
            'chunk_id': new_chunk_id,
            'propositions': [proposition],
            'title': new_chunk_title,
            'summary': new_chunk_summary,
            # evaluated before insertion, so indices are 0-based creation order
            'chunk_index': len(self.chunks)
        }
        if self.print_logging:
            print(f"Created new chunk ({new_chunk_id}): {new_chunk_title}")

    def get_chunk_outline(self) -> str:
        """Return a text outline (id, title, summary) of all chunks, in insertion order."""
        return "".join(
            f"""Chunk ({chunk['chunk_id']}): {chunk['title']}\nSummary: {chunk['summary']}\n\n"""
            for chunk in self.chunks.values()
        )

    def _find_relevant_chunk(self, proposition: str) -> Optional[str]:
        """Ask the LLM which existing chunk ``proposition`` belongs to.

        Returns:
            The id of an existing chunk named in the LLM reply, or None when
            the reply names no known chunk id.
        """
        current_chunk_outline = self.get_chunk_outline()
        prompt = (
            "Determine whether or not the 'Proposition' should belong to any of the existing chunks.\n"
            "A proposition should belong to a chunk if their meaning, direction, or intention are similar.\n"
            "The goal is to group similar propositions and chunks.\n"
            "If you think a proposition should be joined with a chunk, return the chunk id.\n"
            "If you do not think an item should be joined with an existing chunk, just return 'No chunks'.\n"
            "Example:\n"
            "Input:\n"
            "    - Proposition: 'Greg really likes hamburgers'\n"
            "    - Current Chunks:\n"
            "        - Chunk ID: 2n4l3d\n"
            "        - Chunk Name: Places in San Francisco\n"
            "        - Chunk Summary: Overview of the things to do with San Francisco Places\n"
            "        - Chunk ID: 93833k\n"
            "        - Chunk Name: Food Greg likes\n"
            "        - Chunk Summary: Lists of the food and dishes that Greg likes\n"
            "Output: 93833k\n"
            f"Current Chunks:\n--Start of current chunks--\n{current_chunk_outline}\n--End of current chunks--\n"
            # Trailing newline added: previously the instruction sentence was
            # glued directly onto the end of the proposition text.
            f"Determine if the following statement should belong to one of the chunks outlined:\n{proposition}\n"
            f"Do not write anything else. Only return the chunk id if you think it should belong to a chunk, or 'No chunks relevant to the proposition' if it should not.\n"
        )
        if self.print_logging:
            print(f"\n[DEBUG] LLM Prompt:\n{prompt}\n")
        chunk_found = self._llm_invoke(prompt).strip()
        if self.print_logging:
            print(f"[DEBUG] Chunk Found: {chunk_found}")

        # Exact reply is the common case.
        if chunk_found in self.chunks:
            return chunk_found

        # Models often decorate the id ("Chunk ID: abcde", "`abcde`", "abcde.").
        # Split the reply into id-like tokens and accept the first known id.
        tokens = "".join(
            ch if (ch.isalnum() or ch == "-") else " " for ch in chunk_found
        ).split()
        for token in tokens:
            if token in self.chunks:
                return token
        return None

    def get_chunks(self, get_type: str = 'dict') -> "dict | list":
        """Return the chunks in the requested representation.

        Args:
            get_type: ``'dict'`` for the internal id -> chunk mapping (the
                live object, not a copy) or ``'list_of_strings'`` for one
                space-joined string of propositions per chunk.

        Raises:
            ValueError: If ``get_type`` is neither of the supported values
                (previously this silently returned None).
        """
        if get_type == 'dict':
            return self.chunks
        if get_type == 'list_of_strings':
            return [" ".join(chunk['propositions']) for chunk in self.chunks.values()]
        raise ValueError(
            f"Unsupported get_type {get_type!r}; expected 'dict' or 'list_of_strings'"
        )

    def pretty_print_chunks(self):
        """Print every chunk's index, id, summary and propositions."""
        print(f"\nYou have {len(self.chunks)} chunks\n")
        for chunk_id, chunk in self.chunks.items():
            print(f"Chunk #{chunk['chunk_index']}")
            print(f"Chunk ID: {chunk_id}")
            print(f"Summary: {chunk['summary']}")
            print(f"Propositions:")
            for prop in chunk['propositions']:
                print(f"    - {prop}")
            print("\n")

    def pretty_print_chunk_outline(self):
        """Print the outline produced by ``get_chunk_outline``."""
        print("Chunk Outline\n")
        print(self.get_chunk_outline())

In [69]:
# Initialize AgenticChunker and add propositions
# (no key is passed, so the constructor reads CEREBRAS_API_KEY from the environment;
# every proposition triggers one or more LLM calls)
ac = AgenticChunker()
ac.add_propositions(propositions_list)

In [70]:
# Get chunks as a dictionary (chunk id -> chunk dict; this is the chunker's own dict, not a copy)
chunks_dict = ac.get_chunks(get_type='dict')
print(len(chunks_dict))
print(chunks_dict)
# Earlier experiments kept for reference (disabled):
# print(ac.pretty_print_chunks())
# chunks = ac.get_chunks(get_type='list_of_strings')
# print(chunks)
# documents = [Document(page_content=chunk, metadata={"source": "local"}) for chunk in chunks]

In [71]:
# Prints the full chunk dictionary again (same dump as the last print of the previous cell).
print(chunks_dict)

In [72]:
import re

def _chunk_to_document(chunk):
    """Convert one chunk dict into a Document suitable for embedding."""
    joined_text = " ".join(chunk['propositions'])
    # Trim brackets, quotes, commas and whitespace from both ends of the text.
    cleaned_text = re.sub(r'^[\[\]"\',\s]+|[\[\]"\',\s]+$', '', joined_text)
    if 'metadata' in chunk:
        chunk_metadata = chunk['metadata']
    else:
        chunk_metadata = {
            "chunk_id": chunk['chunk_id'],
            "title": chunk['title'],
            "summary": chunk['summary'],
            "chunk_index": chunk['chunk_index']
        }
    return Document(page_content=cleaned_text, metadata=chunk_metadata)


# One Document per chunk, in the dictionary's iteration order
documents = [_chunk_to_document(chunk) for chunk in chunks_dict.values()]

print(documents)

Embedding

In [73]:
# Index construction
def build_index(documents: list):
    """Build the sparse, dense and Chroma indexes over ``documents``.

    Args:
        documents: Non-empty list of objects exposing ``page_content`` (str)
            and ``metadata`` (dict).

    Returns:
        Tuple ``(bm25, index, dense_embeddings, all_chunks, metadata, chroma_db)``:
        the BM25 index, the FAISS HNSW index, the embedding matrix, the chunk
        texts, their metadata dicts (same order as the texts) and the Chroma
        store persisted under ``./chroma_agentic``.

    Raises:
        ValueError: If ``documents`` is empty. Without this guard the failure
            surfaces later as an obscure error inside the indexing libraries.
    """
    if not documents:
        raise ValueError("build_index requires at least one document")

    model_name = 'sentence-transformers/all-mpnet-base-v2'
    all_chunks = [doc.page_content for doc in documents]
    metadata = [doc.metadata for doc in documents]

    # Sparse (BM25) — whitespace tokenisation; retrieve_context splits the query the same way
    tokenized_corpus = [chunk.split(" ") for chunk in all_chunks]
    bm25 = BM25Okapi(tokenized_corpus)

    # Dense Embeddings (unit-normalised by the encoder)
    embedding_model = SentenceTransformer(model_name, device=device)
    dense_embeddings = embedding_model.encode(all_chunks, convert_to_numpy=True, normalize_embeddings=True)
    dim = dense_embeddings.shape[1]

    # FAISS (HNSW)
    index = faiss.IndexHNSWFlat(dim, 32)
    index.hnsw.efConstruction = 40
    # In-place normalisation; the encoder was already asked to normalise, so this is a safety net.
    faiss.normalize_L2(dense_embeddings)
    index.add(dense_embeddings)

    # Chroma
    # NOTE(review): the store is persisted to disk; re-running this function
    # presumably adds the same texts to the existing collection again — confirm
    # and clear ./chroma_agentic between runs if duplicates appear.
    chroma_db = Chroma.from_texts(
        texts=all_chunks,
        embedding=HuggingFaceEmbeddings(model_name=model_name),
        metadatas=metadata,
        persist_directory="./chroma_agentic"
    )

    return bm25, index, dense_embeddings, all_chunks, metadata, chroma_db

In [74]:
# Hybrid retrieval with Chroma
def retrieve_context(query: str, bm25, faiss_index, corpus: List[str], metadata: List[dict], chroma_db, top_k: int = 50, rerank_k: int = 15) -> tuple:
    """Retrieve candidate chunks with BM25, FAISS and Chroma, then rerank them.

    Args:
        query: User query text.
        bm25: BM25 index built over ``corpus`` (whitespace-tokenised).
        faiss_index: FAISS index whose row i corresponds to ``corpus[i]``.
        corpus: Chunk texts.
        metadata: Metadata dicts aligned with ``corpus``.
        chroma_db: Chroma store supporting ``similarity_search_with_score``.
        top_k: Number of hits requested from each retriever.
        rerank_k: Number of reranked results to return.

    Returns:
        ``(contexts, docs)``: the top reranked chunk texts and their metadata
        dicts, best first. Both are empty when no candidate was found.
    """
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
    embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

    # BM25 retrieval
    tokenized_query = query.split(" ")
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_idx = np.argsort(bm25_scores)[::-1][:top_k]

    # FAISS retrieval
    query_emb = embedding_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
    query_matrix = query_emb.reshape(1, -1)
    faiss.normalize_L2(query_matrix)
    _, dense_top_idx = faiss_index.search(query_matrix, top_k)

    # Chroma retrieval — map returned texts back to corpus positions with a
    # dict (first occurrence wins, like list.index) instead of scanning the
    # list for every hit. Texts not in the corpus (e.g. stale entries in the
    # persisted store) are ignored.
    chroma_results = chroma_db.similarity_search_with_score(query, k=top_k)
    first_position = {}
    for position, text in enumerate(corpus):
        first_position.setdefault(text, position)
    chroma_top_idx = [
        first_position[doc.page_content]
        for doc, _ in chroma_results
        if doc.page_content in first_position
    ]

    # Combine and deduplicate.
    # FAISS pads its result with -1 when fewer than top_k vectors are found;
    # a bare `i < len(corpus)` check lets -1 through and corpus[-1] would be
    # added as a duplicate candidate, so the lower bound is checked too.
    raw_indices = set(bm25_top_idx) | set(dense_top_idx[0]) | set(chroma_top_idx)
    candidate_indices = sorted({int(i) for i in raw_indices if 0 <= int(i) < len(corpus)})
    candidates = [(i, corpus[i], metadata[i]) for i in candidate_indices]

    print(f"BM25 Top Indices: {bm25_top_idx}")
    print(f"FAISS Top Indices: {dense_top_idx[0]}")
    print(f"Chroma Top Indices: {chroma_top_idx}")
    print(f"Combined Candidate Indices: {candidate_indices}")

    if not candidates:
        # Nothing to rerank; avoid calling the cross-encoder on an empty batch.
        print("Reranked Contexts: []")
        return [], []

    # Rerank
    pairs = [[query, chunk] for _, chunk, _ in candidates]
    scores = cross_encoder.predict(pairs)
    reranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)[:rerank_k]

    contexts = [chunk for _, (_, chunk, _) in reranked]
    docs = [meta for _, (_, _, meta) in reranked]

    print(f"Reranked Contexts: {contexts}")
    return contexts, docs


# # Hybrid retrieval
# # def retrieve_context(query: str, bm25, faiss_index, corpus: list, metadata: list, top_k: int = 50, rerank_k: int = 10) -> tuple:
# #     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
# #     embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# #     tokenized_query = query.split(" ")
# #     bm25_scores = bm25.get_scores(tokenized_query)
# #     bm25_top_idx = np.argsort(bm25_scores)[::-1][:top_k]

# #     query_emb = embedding_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
# #     faiss.normalize_L2(query_emb.reshape(1, -1))
# #     _, dense_top_idx = faiss_index.search(query_emb.reshape(1, -1), top_k)

# #     candidate_indices = set(bm25_top_idx) | set(dense_top_idx[0])
# #     candidates = [(i, corpus[i], metadata[i]) for i in candidate_indices]

# #     pairs = [[query, chunk] for _, chunk, _ in candidates]
# #     scores = cross_encoder.predict(pairs)
# #     reranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)[:rerank_k]

# #     contexts = [chunk for _, (_, chunk, _) in reranked]
# #     docs = [meta for _, (_, _, meta) in reranked]
# #     return contexts, docs


In [75]:
def generate_response(query: str, bm25, faiss_index, corpus: list, metadata: list, chroma_db) -> str:
    """Assemble a prompt from the chunks retrieved for `query`.

    Retrieval is delegated to `retrieve_context`. Each retrieved chunk is
    rendered as "<label>:\\n<chunk text>", where the label is the metadata's
    'source' entry if that key exists, otherwise its 'title' (or "" when
    neither is present). Note that no LLM is called here: the returned string
    is the prompt text itself.
    """
    contexts, docs = retrieve_context(query, bm25, faiss_index, corpus, metadata, chroma_db)

    sections = []
    for chunk_text, chunk_meta in zip(contexts, docs):
        if 'source' in chunk_meta:
            label = chunk_meta['source']
        else:
            label = chunk_meta.get('title', '')
        sections.append(f"{label}:\n{chunk_text}")

    combined_context = "\n\n".join(sections)
    return f"Retrieved Chunks:\n{combined_context}\n\nQuery: {query}"


In [76]:
# Build index: BM25, FAISS and Chroma over the chunk documents.
# `all_chunks` and `metadata` are the texts / metadata dicts in index order.
bm25, index, dense_embeddings, all_chunks, metadata, chroma_db = build_index(documents)

In [77]:
# Run retrieval for a sample user review and assemble the prompt
review_text = "Quran app audio is not working properly. I can't understand how to use the audio feature clearly. Also searching is not working"
response = generate_response(review_text, bm25, index, all_chunks, metadata, chroma_db)

# Print the result. generate_response returns the assembled prompt text
# (retrieved chunks + query); no text wrapping is applied here.
print("Generated Response:\n", response)