In [None]:
#semantic-router

In [None]:
import pdfplumber

with pdfplumber.open('test.pdf') as pdf:
    pdf_text = " ".join(page.extract_text() or "" for page in pdf.pages)

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-Embedding-v0")

In [None]:
text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=0.5,
    min_chunk_size=100
)

In [None]:
documents = text_splitter.create_documents([pdf_text])

In [None]:
print(documents)

In [None]:
from langchain.vectorstores import FAISS

In [None]:
vectorStore = FAISS.from_documents(documents, embeddings)

In [None]:
vectorStore.similarity_search("Chapter", k=20)

In [None]:
response = vectorStore.similarity_search("Chapter", k=20)

In [None]:
add_fun = lambda y:y+10
print(add_fun(5))

In [None]:
from langchain.docstore import InMemoryDocstore
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema.runnable import RunnableMap

In [None]:
template = '''
    You are a service assitant for general public
    These are context for a question
    {context}
    Reply to them based on question :{question}
'''
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()
query = "How many chapters are there?"

In [None]:
def log_input(x):
    print(x)
    return x

def parse_vector(x):
    contents = vectorStore.similarity_search(x, k=20)
    # contents = [content.page_content for content in contents]
    return contents

In [None]:
chain = RunnableMap({
    "context": lambda x:vectorStore.similarity_search(x['question'], k=20),
    "question": lambda x:x['question']
}) | prompt | log_input| llm | output_parser

In [None]:
chain.invoke({"question": query})

In [None]:
chain.invoke({"question": "List all the chapter"})


In [None]:
chain.invoke({"question": "Summarize chapter 7"})

In [None]:
import PyPDF2

with open('test.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    metadata = reader.metadata
    print(metadata)


In [None]:
vectorStoreTest = FAISS.from_texts(["Chapter", "Heading"], embeddings)

In [None]:
vectorStoreTest.similarity_search_with_relevance_scores("Chapter")

In [None]:
pdf_text

In [None]:
import pdfplumber
with pdfplumber.open("test.pdf") as pdf:
    for page in pdf.pages:
        for idx in range(len(page.chars)):
            if len(page.chars) < idx+1:
                if  page.chars[idx+1] == "":
                    print(page.chars[idx]['size'], page.chars[idx]['text'], end="")
            else:
                print(page.chars[idx]['text'], end="")

In [None]:
print("#"*25)
print("Agentic Chunking")
print("#"*25)

In [60]:
import pdfplumber
from langchain import hub
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from typing import Optional, List
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatMessagePromptTemplate
from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers.openai_tools import JsonOutputToolsParser

with pdfplumber.open("test.pdf") as pdf:
    text = " ".join(page.extract_text() or "" for page in pdf.pages)
        
    
load_dotenv()

True

In [None]:
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            '''Decompose the "Content" into clear, simple propositions that are interpretable without additional context.

1. Identify and extract hierarchical labels (e.g., titles, chapters, sections) from the text to infer metadata.
2. When chapter changes create new instance of Chunk
3. Split compound sentences into simple sentences while preserving the original phrasing when possible.
4. If a named entity has descriptive information, separate the description into a distinct proposition.
5. Decontextualize propositions by:
   - Adding necessary modifiers to nouns or sentences.
   - Replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the referenced entity.
6. Return the results as a JSON object with two fields:
   - **metadata**: A list of inferred hierarchical labels.
   - **sentences**: A list of decontextualized propositions.

Example:

Input:
Title: Eostre. 
Section: Theories and interpretations, Connection to Easter Hares. 
Content: The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in 1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in other parts of Germany until the 18th century. Scholar Richard Sermon writes that "hares were frequently seen in gardens in spring, and thus may have served as a convenient explanation for the origin of the colored eggs hidden there for children."

Output:

  "metadata": ["Eostre", "Theories and interpretations", "Connection to Easter Hares"], Note: when passing metadata standardize it
  "sentences": [
    "The earliest evidence for the Easter Hare was recorded in south-west Germany in 1678 by Georg Franck von Franckenau.",
    "Georg Franck von Franckenau was a professor of medicine.",
    "The evidence for the Easter Hare remained unknown in other parts of Germany until the 18th century.",
    "Richard Sermon was a scholar.",
    "Richard Sermon hypothesizes that hares frequently seen in gardens during spring may explain the origin of the colored eggs hidden there for children."
  ]


Here is the text:
'''
        ),
        ("human", "{text}"),
    ]
)



In [95]:
class Chunks(BaseModel):
    metadata: List[str] = Field(default=None,
                                description="A hierarchical list of contextual labels (e.g., titles, chapters, sections) for this specific chunk of content.")
    sentences: List[str] = Field(default=None, description="A list of simple, decontextualized sentences belonging to this chunk (e.g., a chapter or section).")
    
class Data(BaseModel):
    contents: List[Chunks] = Field(default=[], description="A collection of content chunks, where each chunk corresponds to a chapter or section.")


In [5]:
with pdfplumber.open('test.pdf') as pdf:
        pdf_text = " ".join(page.extract_text() or "" for page in pdf.pages)

In [137]:
google_llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")
structured_google_llm = google_llm.with_structured_output(schema=Data)
chunkers = int(len(pdf_text)/5000)
prepostion = []
# start = 
prompt = prompt_template.invoke({"text": pdf_text[:20000]})
new_prepostion = structured_google_llm.invoke(prompt)

In [132]:
new_prepostion

Data(contents=[Chunks(metadata=['The Electronic Transactions Act, 2063 (2008)', 'Date of Authentication and Publication', '22 Mansir 2063 ( december 8, 2006)', 'Act number 27 of the year 2063', 'An Act promulgated for Electronic Transactions', 'Preamble'], sentences=['It is expedient to make legal provisions for authentication and regularization of the recognition, validity, integrity and reliability of generation, production, processing, storage, communication and transmission system of electronic records.', 'The legal provisions are to be made by making the transactions to be carried out by means of electronic data exchange or by any other means of electronic communications.', 'The transactions should be reliable and secured.', 'It is expedient for controlling the acts of unauthorized use of electronic records or of making alteration in such records through the illegal manner.', 'Be it enacted by the House of Representatives in the First Year of the issuance of the Proclamation of th

In [138]:
with open("test.txt", "w") as f:
    f.write(str(new_prepostion.contents))

In [93]:
pdf_text[:4000]

'The Electronic Transactions Act, 2063 (2008)\nDate of Authentication and Publication\n22 Mansir 2063 ( december 8, 2006)\nAct number 27 of the year 2063\nAn Act promulgated for Electronic Transactions\nPreamble:\nWHEREAS, it is expedient to make, legal provisions for authentication and\nregularization of the recognition, validity, integrity and reliability of generation,\nproduction, processing, storage, communication and transmission system of electronic\nrecords by making the transactions to be carried out by means of electronic data\nexchange or by any other means of electronic communications, reliable and secured;\nAnd where as, for controlling the acts of unauthorized use of electronic records or of\nmaking alteration in such records through the illegal manner,\nNow, therefore, be it enacted by the House of Representatives in the First Year of the\nissuance of the Proclamation of the House of Representatives, 2063(2007) .\nChapter - 1\nPreliminary\n1. Short Title, Extension and C

In [118]:
from langchain_core.documents import Document

documents = []

for content in new_prepostion.contents:
    temp = ""
    for sentence in content.metadata:
        # temp += sentence
        if len(content.metadata) > 0:
            print("Error")
    documents += Document(metadata=content.metadata, page_content="Place holder")

Error
Error


ValidationError: 1 validation error for Document
metadata
  Input should be a valid dictionary [type=dict_type, input_value=['The Electronic Transact...cation and Publication'], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/dict_type