In [149]:
import os
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
from langchain.docstore.document import Document
from agentic_chunker import AgenticChunker
from cerebras.cloud.sdk import Cerebras
from langsmith import Client
import json
from agentic_chunker import AgenticChunker
from langchain.docstore.document import Document
from dotenv import load_dotenv
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Load environment variables via python-dotenv before any os.getenv() lookup;
# later cells read CEREBRAS_API_KEY and LANGSMITH_API_KEY from the environment.
load_dotenv()

True

In [150]:
# Set up the Cerebras SDK client; fail fast when the API key is not configured.
cerebras_api_key = os.environ.get("CEREBRAS_API_KEY")
if not cerebras_api_key:
    raise ValueError("CEREBRAS_API_KEY not found in environment variables")

# Model identifier sent with every chat-completion request issued through `client`.
model = "llama-4-scout-17b-16e-instruct"
client = Cerebras(api_key=cerebras_api_key)

# Function to invoke Cerebras API
def cerebras_invoke(prompt: str) -> str:
    """Send ``prompt`` as a single user message to the Cerebras chat API.

    Relies on the module-level ``client`` and ``model``.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``""`` when the request fails or the reply carries no
        text. Failures are printed, never raised (best-effort), so callers
        must treat an empty string as "no answer".
    """
    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )
        content = response.choices[0].message.content
        if content is None:
            # An empty reply is not a transport/API failure; report it as what
            # it is instead of letting ``None.strip()`` raise an AttributeError
            # that would be logged as a failed invocation.
            print("[Error] Cerebras API returned a message with no text content")
            return ""
        return content.strip()
    except Exception as e:
        # Deliberately broad: any SDK/network/shape error degrades to "".
        print(f"[Error] Cerebras API invocation failed: {e}")
        return ""

In [151]:
# Fetch the "wfh/proposal-indexing" prompt through the LangSmith client,
# authenticating with the key taken from the environment.
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
obj = Client(api_key=LANGSMITH_API_KEY).pull_prompt(
    "wfh/proposal-indexing",
    include_model=True,
)

In [152]:
class Sentences(BaseModel):
    """Pydantic container for a list of proposition strings.

    ``get_propositions`` wraps the lines of one LLM response in this model.
    """
    # One entry per proposition line taken from the model's response.
    sentences: List[str]

def get_propositions(text, prompt):
    """Ask the LLM to break ``text`` down into standalone propositions.

    Args:
        text: Source passage to decompose.
        prompt: Prompt object whose ``format(input=...)`` returns the prompt
            string for ``text``.

    Returns:
        A ``(result, response)`` tuple where ``result`` is
        ``{"proposition": [Sentences(sentences=[...])]}`` and ``response`` is
        the raw, unmodified model output. Each proposition is one non-blank
        line of the response with surrounding whitespace removed, so an empty
        response (e.g. after a failed API call) yields an empty list rather
        than a single empty string.
    """
    formatted_prompt = prompt.format(input=text) + "\n\nOnly provide the list of propositions as output. Do not include any explanations, formatting, or additional text."
    response = cerebras_invoke(formatted_prompt)
    # Strip every line (this also removes a trailing '\r' from CRLF output)
    # and drop the blank ones so callers never receive empty propositions.
    stripped_lines = (line.strip() for line in response.split('\n'))
    propositions = [line for line in stripped_lines if line]
    return {"proposition": [Sentences(sentences=propositions)]}, response


In [None]:
# Load JSON data
# The encoding is passed explicitly so non-ASCII text is not decoded with the
# platform's locale default (assumes both files are UTF-8 encoded JSON).
with open("islamic_etiquette_knowledge_base.json", "r", encoding="utf-8") as f1, \
        open("Quran_app_Documentation.json", "r", encoding="utf-8") as f2:
    etiquette_data = json.load(f1)
    quran_app_data = json.load(f2)

# Use only Quran app data as per the query
# NOTE(review): only the first two documents are processed; every document
# costs one LLM call via get_propositions, so widen the slice for a full run.
combined_documents = quran_app_data[:2]

# List to hold all proposition arrays with metadata
proposition_arrays = []

# Process each JSON object (each is expected to carry 'text', 'url' and 'title')
for json_obj in combined_documents:
    text = json_obj['text']
    propositions, response = get_propositions(text, obj)

    # Create an array entry for this document's propositions; blank lines are
    # dropped and the kept ones are stored without surrounding whitespace.
    document_propositions = {
        'metadata': {
            'url': json_obj['url'],
            'title': json_obj['title']
        },
        'propositions': [
            prop.strip()
            for prop in propositions['proposition'][0].sentences
            if prop.strip()
        ]
    }
    proposition_arrays.append(document_propositions)

# If you need a flat list of all propositions with their metadata:
flat_propositions_with_metadata = [
    {'proposition': prop, 'metadata': doc['metadata']}
    for doc in proposition_arrays
    for prop in doc['propositions']
]

# And if you just need a simple list of all propositions:
propositions_list = [prop for doc in proposition_arrays for prop in doc['propositions']]

In [164]:
# Sanity check: show the url/title metadata stored for the first processed document.
print(proposition_arrays[0]["metadata"])

In [None]:
# Initialize AgenticChunker and add propositions
ac = AgenticChunker()
ac.add_propositions(propositions_list)

# Get chunks as a dictionary
chunks_dict = ac.get_chunks(get_type='dict')

# Lookup from proposition text to the metadata of every document it came from.
# The same sentence can be extracted from more than one document, so each key
# maps to a list of metadata dicts rather than a single one.
prop_to_metadata = {}
for entry in flat_propositions_with_metadata:
    prop_to_metadata.setdefault(entry['proposition'], []).append(entry['metadata'])

# Create Document objects for each chunk
documents = []
for chunk in chunks_dict.values():
    chunk_propositions = chunk['propositions']
    # Collect unique sources for this chunk. A dict keyed by (url, title)
    # de-duplicates while preserving first-seen order, so the resulting list
    # is the same on every run (a set would not guarantee that).
    unique_sources = {}
    for prop in chunk_propositions:
        # A proposition missing from the lookup simply contributes no source.
        for source_meta in prop_to_metadata.get(prop, []):
            unique_sources[(source_meta['url'], source_meta['title'])] = None
    # Convert unique sources to a list of dictionaries
    sources = [{'url': url, 'title': title} for url, title in unique_sources]
    # Join propositions into a single string for the document content
    chunk_content = " ".join(chunk_propositions)
    # Create metadata with chunk details and sources
    metadata = {
        'chunk_title': chunk['title'],
        'chunk_summary': chunk['summary'],
        'sources': sources
    }
    # Create and append the Document object
    doc = Document(page_content=chunk_content, metadata=metadata)
    documents.append(doc)

In [3]:
# # Agentic Chunking
# print("#### Proposition-Based Chunking ####")

# # https://arxiv.org/pdf/2312.06648.pdf
# import os
# from langchain.output_parsers.openai_tools import JsonOutputToolsParser
# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.runnables import RunnableLambda
# from langchain.chains import create_extraction_chain
# from typing import Optional, List
# from langchain.chains import create_extraction_chain_pydantic
# from pydantic import BaseModel
# from langchain import hub
# from langsmith import Client
# from cerebras.cloud.sdk import Cerebras
# from agentic_chunker import AgenticChunker
# import os
# import json
# import uuid
# import numpy as np
# import torch
# import faiss
# from dotenv import load_dotenv
# from typing import Optional
# from rich import print
# from sentence_transformers import SentenceTransformer, CrossEncoder
# from rank_bm25 import BM25Okapi
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain.docstore.document import Document
# from cerebras.cloud.sdk import Cerebras

# # Load environment variables
# load_dotenv()

# # Initialize device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")


In [None]:
# LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
# client = Client(api_key=LANGSMITH_API_KEY)
# prompt = client.pull_prompt("wfh/proposal-indexing", include_model=True)

In [None]:
# #  Proposition extraction (simplified from Retriver.ipynb)
# def extract_propositions(text: str, chunker: AgenticChunker, prompt: ChatPromptTemplate) -> list:
#     formatted_prompt = prompt.format(input=text)
#     response = chunker._llm_invoke(formatted_prompt)
#     propositions = [line.strip() for line in response.split("\n") if line.strip()]
#     return propositions


In [None]:
# # Agentic chunking with metadata
# def perform_agentic_chunking_with_metadata(documents_json: list, prompt: ChatPromptTemplate) -> list:
#     ac = AgenticChunker()
#     chunked_docs = []

#     for doc in documents_json:
#         text = doc.get("text", "")
#         title = doc.get("title", "Unknown Title")
#         url = doc.get("url", "Unknown URL")

#         try:
#             propositions = extract_propositions(text, ac, prompt)
#             ac.add_propositions(propositions)

#             agentic_chunks = ac.get_chunks(get_type="list_of_strings")
#             for chunk_text in agentic_chunks:
#                 chunked_docs.append(Document(
#                     page_content=chunk_text,
#                     metadata={
#                         "title": title,
#                         "url": url,
#                         "source": "agentic"
#                     }
#                 ))

#             ac = AgenticChunker()  # Reset after each document

#         except Exception as e:
#             print(f"[Warning] Skipped document: {title}, error: {e}")
#             continue

#     return chunked_docs

In [None]:
# with open("islamic_etiquette_knowledge_base.json", "r") as f1, open("Quran_app_Documentation.json", "r") as f2:
#     etiquette_data = json.load(f1)
#     quran_app_data = json.load(f2)

# # combined_documents = etiquette_data + quran_app_data
# combined_documents = quran_app_data

# # Perform agentic chunking
# docs = perform_agentic_chunking_with_metadata(combined_documents, prompt)
# print(docs[:2])

# # if not docs:
# #     raise ValueError("No chunked documents found. Please check your chunking process and input data.")

# # # Build indices
# # bm25, faiss_index, embeddings, corpus, metadata, chroma = build_index(docs)

# # # Example query
# # query = "The Quran app is good but not helpful in understanding the Quranic verses."
# # prompt = generate_response(query, bm25, faiss_index, corpus, metadata, chroma)
# # print("\nGenerated Prompt:\n")
# # print(prompt)

In [None]:
# # Index construction
# def build_index(documents: list):
#     all_chunks = [doc.page_content for doc in documents]
#     metadata = [doc.metadata for doc in documents]

#     # Sparse (BM25)
#     tokenized_corpus = [chunk.split(" ") for chunk in all_chunks]
#     bm25 = BM25Okapi(tokenized_corpus)

#     # Dense Embeddings
#     embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)
#     dense_embeddings = embedding_model.encode(all_chunks, convert_to_numpy=True, normalize_embeddings=True)
#     dim = dense_embeddings.shape[1]

#     # FAISS (HNSW)
#     index = faiss.IndexHNSWFlat(dim, 32)
#     index.hnsw.efConstruction = 40
#     faiss.normalize_L2(dense_embeddings)
#     index.add(dense_embeddings)

#     # Chroma
#     chroma_db = Chroma.from_texts(
#         texts=all_chunks,
#         embedding=HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'),
#         metadatas=metadata,
#         persist_directory="./chroma_agentic"
#     )

#     return bm25, index, dense_embeddings, all_chunks, metadata, chroma_db

In [None]:
# # Hybrid retrieval
# # def retrieve_context(query: str, bm25, faiss_index, corpus: list, metadata: list, top_k: int = 50, rerank_k: int = 10) -> tuple:
# #     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
# #     embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

# #     tokenized_query = query.split(" ")
# #     bm25_scores = bm25.get_scores(tokenized_query)
# #     bm25_top_idx = np.argsort(bm25_scores)[::-1][:top_k]

# #     query_emb = embedding_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
# #     faiss.normalize_L2(query_emb.reshape(1, -1))
# #     _, dense_top_idx = faiss_index.search(query_emb.reshape(1, -1), top_k)

# #     candidate_indices = set(bm25_top_idx) | set(dense_top_idx[0])
# #     candidates = [(i, corpus[i], metadata[i]) for i in candidate_indices]

# #     pairs = [[query, chunk] for _, chunk, _ in candidates]
# #     scores = cross_encoder.predict(pairs)
# #     reranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)[:rerank_k]

# #     contexts = [chunk for _, (_, chunk, _) in reranked]
# #     docs = [meta for _, (_, _, meta) in reranked]
# #     return contexts, docs

# # Hybrid retrieval with Chroma
# def retrieve_context(query: str, bm25, faiss_index, corpus: List[str], metadata: List[dict], chroma_db, top_k: int = 50, rerank_k: int = 15) -> tuple:
#     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)
#     embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

#     # BM25 retrieval
#     tokenized_query = query.split(" ")
#     bm25_scores = bm25.get_scores(tokenized_query)
#     bm25_top_idx = np.argsort(bm25_scores)[::-1][:top_k]

#     # FAISS retrieval
#     query_emb = embedding_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
#     faiss.normalize_L2(query_emb.reshape(1, -1))
#     _, dense_top_idx = faiss_index.search(query_emb.reshape(1, -1), top_k)

#     # Chroma retrieval
#     chroma_results = chroma_db.similarity_search_with_score(query, k=top_k)
#     chroma_top_idx = [corpus.index(doc.page_content) for doc, _ in chroma_results if doc.page_content in corpus]

#     # Combine and deduplicate
#     candidate_indices = set(bm25_top_idx) | set(dense_top_idx[0]) | set(chroma_top_idx)
#     candidates = [(i, corpus[i], metadata[i]) for i in candidate_indices if i < len(corpus)]

#     # Rerank
#     pairs = [[query, chunk] for _, chunk, _ in candidates]
#     scores = cross_encoder.predict(pairs)
#     reranked = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)[:rerank_k]

#     contexts = [chunk for _, (_, chunk, _) in reranked]
#     docs = [meta for _, (_, _, meta) in reranked]
#     return contexts, docs

# # Generate response (prompt only)
# def generate_response(query: str, bm25, faiss_index, corpus: list, metadata: list) -> str:
#     contexts, docs = retrieve_context(query, bm25, faiss_index, corpus, metadata)
#     combined_context = "\n\n".join([f"{doc['source']}:\n{ctx}" for ctx, doc in zip(contexts, docs)])
#     prompt = f"Retrieved Chunks:\n{combined_context}\n\nQuery: {query}"
#     return prompt


In [None]:
# for i, doc in enumerate(docs):
#     print(f"--- Document {i} ---")
#     print(f"Content: {doc.page_content}")
#     print(f"Metadata: {doc.metadata}")
#     print()