In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import os
from langchain_community.vectorstores import FAISS,Chroma
from langchain_core.prompts import PromptTemplate
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.output_parsers import StructuredOutputParser,ResponseSchema
from langchain_core.messages import HumanMessage
from langchain_core.documents import Document
from langchain_core.runnables import RunnableParallel,RunnableLambda,RunnablePassthrough

In [None]:
import json
from typing import List

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title


In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the Google GenAI API key comes from — confirm.
load_dotenv()

In [None]:
# Show the kernel's current working directory.
%pwd

In [None]:
# Move the working directory up one level.
# NOTE(review): not idempotent — every re-run of this cell climbs one more directory.
os.chdir("../")

In [None]:
import shutil
# Print the resolved path of the `tesseract` executable, or None when it is not on PATH.
# NOTE(review): presumably a prerequisite check for the OCR used by the hi_res strategy — confirm.
print(shutil.which("tesseract"))

In [None]:
# %pip install "unstructured[all-docs]" pillow lxml 

In [None]:
# %pip install unstructured==0.18.21
# %pip install unstructured-client==0.42.4
# %pip install unstructured.pytesseract==0.3.15
# %pip install unstructured_inference==1.1.2

In [None]:
# Input PDF and the directory name used for extracted images.
# NOTE(review): `filepath` is an absolute, machine-specific path; `output_dir` is relative
# and therefore resolved against the current working directory.
filepath = "D:/Cite-What-You-Type/pdfs/22-25 Clustering, K-means, DBSCAN.pdf"
output_dir ="images"

## Partition-1 : atomic elements

In [None]:
def extract_pdf_elements(filepath: str,output_dir:str) -> list:
    """Partition a PDF into elements using the ``hi_res`` strategy.

    Args:
        filepath: Path of the PDF, passed to ``partition_pdf`` as ``filename``.
        output_dir: Passed through as ``extract_image_block_output_dir``.

    Returns:
        The value returned by ``partition_pdf``; its length is printed first.
    """
    partition_options = {
        "filename": filepath,
        "strategy": "hi_res",
        "infer_table_structure": True,
        "extract_image_block_types": ['Image'],
        "extract_image_block_to_payload": True,
        "extract_image_block_output_dir": output_dir,
    }
    extracted = partition_pdf(**partition_options)
    print("Total elements extracted -> ",len(extracted))
    return extracted

# Partition the configured PDF; `elements` feeds every later cell.
elements = extract_pdf_elements(filepath,output_dir)

In [None]:
# Distinct element classes present in the partition result.
set([str(type(el))for el in elements])

In [None]:
# Inspect a single element as a dict.
# NOTE(review): index 180 is hard-coded; a result with fewer elements raises IndexError.
elements[180].to_dict()

In [None]:
# Split out the elements whose `category` is "Table" or "Image".
tables = [elem for elem in elements if elem.category == "Table"]
images = [elem for elem in elements if elem.category == "Image"]

In [None]:
import base64

In [None]:
# Inspect the second table element as a dict.
# NOTE(review): hard-coded index; raises IndexError when fewer than two tables were found.
tables[1].to_dict()

In [None]:
def get_image(output_dir, image_list):
    """Decode base64 image payloads and write them into ``output_dir``.

    Each file is named ``image-<i>.png`` where ``<i>`` is the element's position
    in ``image_list``. Elements whose ``to_dict()['metadata']`` has no (or an
    empty) ``image_base64`` entry are skipped instead of raising ``KeyError``.

    Args:
        output_dir: Target directory; created if it does not exist.
        image_list: Iterable of objects exposing ``to_dict()``.

    Returns:
        List of the file paths that were written, in input order.
    """
    os.makedirs(output_dir, exist_ok=True)
    written_paths = []
    for i, img_obj in enumerate(image_list):
        metadata = img_obj.to_dict().get('metadata') or {}
        bs64_str = metadata.get('image_base64')
        if not bs64_str:
            # Nothing to decode for this element; keep going with the rest.
            continue

        image_data = base64.b64decode(bs64_str)
        # NOTE(review): the extension is always .png; the payload's real
        # encoding is not inspected here — confirm it matches.
        path = os.path.join(output_dir, f"image-{i}.png")

        with open(path, "wb") as f:
            f.write(image_data)
        written_paths.append(path)

    return written_paths

# Write every extracted image payload into `output_dir`.
get_image(output_dir,images)

In [None]:
from IPython.display import HTML

In [None]:
def display_table(tables):
    """Render each table's ``metadata.text_as_html`` as HTML cell output.

    Iterates the tables directly, so any iterable works (not only
    subscriptable sequences). Tables without an HTML rendering are skipped
    because there is nothing to display for them.

    Args:
        tables: Iterable of objects exposing ``metadata.text_as_html``.
    """
    for table_obj in tables:
        table_html = table_obj.metadata.text_as_html
        if not table_html:
            continue
        display(HTML(table_html))

# Render all extracted tables inline.
display_table(tables)

## Chunks

In [None]:
def create_chunks_by_title(elements, max_characters=500, combine_text_under_n_chars=100):
    """Group partitioned elements into chunks with ``chunk_by_title``.

    Args:
        elements: Elements to chunk (the output of the partition step).
        max_characters: Forwarded to ``chunk_by_title``; defaults to the
            previously hard-coded 500.
        combine_text_under_n_chars: Forwarded to ``chunk_by_title``; defaults
            to the previously hard-coded 100.

    Returns:
        The chunks returned by ``chunk_by_title``; their count is printed first.
    """
    chunks = chunk_by_title(
        elements,
        max_characters=max_characters,
        combine_text_under_n_chars=combine_text_under_n_chars,
    )

    print("Chunks created: ", len(chunks))
    return chunks

# Chunk the partitioned elements; `chunks` is the input of the summarisation step.
chunks = create_chunks_by_title(elements)

In [None]:
# Text of the second original element inside chunk 16.
# NOTE(review): both indices are hard-coded and only valid for this particular PDF.
chunks[16].metadata.orig_elements[1].text

In [None]:
# Same element as above, shown as a dict.
# NOTE(review): both indices are hard-coded and only valid for this particular PDF.
chunks[16].metadata.orig_elements[1].to_dict()

In [None]:
def seperate_content_types(chunk):
    """Split a chunk into its text, table HTML and base64 image payloads.

    Walks ``chunk.metadata.orig_elements`` (when present) and classifies each
    element by its class name:

    * ``Table``  -> ``metadata.text_as_html``, falling back to ``element.text``
      when the HTML is missing *or* ``None``.
    * ``Image``  -> ``metadata.image_base64``, only when it is non-empty.

    Args:
        chunk: Object exposing ``text`` and, optionally,
            ``metadata.orig_elements``.

    Returns:
        dict with keys ``text`` (str), ``tables`` (list), ``images`` (list) and
        ``types`` (de-duplicated list in first-seen order, always starting
        with ``'text'``).
    """
    content_data = {
        "text": chunk.text,
        "tables": [],
        "images": [],
        "types": ['text'],
    }

    chunk_metadata = getattr(chunk, "metadata", None)
    # The attribute can exist but hold None; treat that as "no elements".
    orig_elements = getattr(chunk_metadata, "orig_elements", None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, "metadata", None)

        if element_type == "Table":
            content_data['types'].append('Table')
            # `or` (not a getattr default) so a None rendering also falls back.
            table_html = getattr(element_metadata, 'text_as_html', None) or element.text
            content_data['tables'].append(table_html)

        elif element_type == "Image":
            image_base64 = getattr(element_metadata, "image_base64", None)
            if image_base64:
                content_data['types'].append("Image")
                content_data['images'].append(image_base64)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    content_data['types'] = list(dict.fromkeys(content_data['types']))
    return content_data
                

In [None]:
def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Ask Gemini for a retrieval-oriented description of one chunk.

    The prompt contains the chunk text, every table (numbered from 1) and a
    fixed task description; each image is attached as a base64 ``image_url``
    part after stripping surrounding whitespace.

    Args:
        text: Raw text of the chunk.
        tables: Table renderings to embed in the prompt.
        images: Base64-encoded image payloads.

    Returns:
        ``response.content`` from the model. If anything raises, the error is
        printed and ``text`` is returned unchanged.
    """
    try:
        model = ChatGoogleGenerativeAI(model='gemini-2.5-pro', temperature=0.3)

        # Prompt pieces are collected in order and joined once at the end.
        sections = [
            "\n"
            "        You are an AI assistant creating a searchable description for document retrieval.\n"
            "\n"
            "        --- CONTENT TO ANALYZE ---\n"
            "        \n"
            "        TEXT CONTENT:\n"
            f"        {text}\n"
            "        "
        ]

        if tables:
            sections.append("\nTABLES:\n")
            sections.extend(
                f"Table {number}:\n{table_html}\n\n"
                for number, table_html in enumerate(tables, start=1)
            )

        sections.append(
            " \n"
            "        --- YOUR TASK ---\n"
            "        Generate a comprehensive, searchable description of the content above. \n"
            "        Focus on creating metadata that will help a search engine find this document.\n"
            "        \n"
            "        Cover these 5 points:\n"
            "        1. Key facts, exact numbers, and data points (from text and tables)\n"
            "        2. Main topics and concepts discussed\n"
            '        3. Questions this content could answer (e.g., "What is the revenue for Q3?")\n'
            "        4. Visual Content Analysis (describe charts, diagrams, and patterns in the attached images)\n"
            "        5. Alternative keywords or synonyms users might search for.\n"
            "\n"
            "        Prioritize findability over brevity.\n"
            "        \n"
            "        SEARCHABLE DESCRIPTION: \n"
            "        "
        )

        message_content = [{'type': 'text', 'text': "".join(sections)}]
        message_content.extend(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded.strip()}"}}
            for encoded in images
        )

        reply = model.invoke([HumanMessage(content=message_content)])
        return reply.content

    except Exception as e:
        print(f"AI Summarization Failed: {e}")
        return text

In [None]:
def summarize_chunks(chunks):
    """Turn chunks into ``Document`` objects, summarising rich ones with AI.

    A chunk that contains tables or images is sent to
    ``create_ai_enhanced_summary``; if that returns something falsy or raises,
    the chunk's raw text is used instead. Plain-text chunks keep their raw
    text. Every document stores the raw text, table HTML and base64 images as
    a JSON string under ``metadata['original_content']``.

    Args:
        chunks: Sized iterable of chunks accepted by ``seperate_content_types``.

    Returns:
        List of ``Document`` objects, one per chunk, in input order.
    """
    print("...Processing Chunk ...")

    total_chunk = len(chunks)
    langchain_document = []

    for current_chunk, chunk in enumerate(chunks, start=1):
        print(f"Processed Chunk {current_chunk}/{total_chunk}")

        content_data = seperate_content_types(chunk)

        print(f'Types Found: ', content_data['types'])
        table_blocks = content_data['tables']
        image_blocks = content_data['images']
        print(f"Tables: {len(table_blocks)} , Image: {len(image_blocks)}")

        raw_text = content_data['text']

        if not (table_blocks or image_blocks):
            print("No tables or Image Found")
            enhanced_cnt = raw_text
        else:
            print("Creating Summary...")
            try:
                enhanced_cnt = create_ai_enhanced_summary(raw_text, table_blocks, image_blocks)

                if not enhanced_cnt:
                    enhanced_cnt = raw_text
                else:
                    print("Successfully Summarized")
                    print(f"Preview:{enhanced_cnt[:100]}...")
            except Exception as e:
                print(f"AI Summary Failed: {e}")
                enhanced_cnt = raw_text

        # Keep the untouched source material next to the (possibly AI-written) text.
        original_payload = json.dumps({
            'raw_text': raw_text,
            "table_html": table_blocks,
            "image_base64": image_blocks,
        })
        langchain_document.append(
            Document(page_content=enhanced_cnt, metadata={"original_content": original_payload})
        )

    print(f"Processed {len(langchain_document)} chunks")
    return langchain_document


In [None]:
# Build the documents; chunks with tables or images trigger a model call inside summarize_chunks.
processed_chunks = summarize_chunks(chunks)

In [None]:
# Display every processed document.
# NOTE(review): the output includes the embedded base64 images and can be very large.
processed_chunks

In [None]:
def export_chunks_to_json(chunks,filename ="Chunks_exported.json"):
    """Dump processed documents to a UTF-8 JSON file and return the records.

    Each record has a 1-based ``chunk_id``, the document's ``page_content``
    and the decoded ``original_content`` JSON from its metadata (an empty
    dict when that key is missing).

    Args:
        chunks: Iterable of documents exposing ``page_content`` and ``metadata``.
        filename: Destination path of the JSON file.

    Returns:
        The list of exported records.
    """
    export = [
        {
            "chunk_id": position,
            'enhanced_content': doc.page_content,
            'metadata': {
                'original_content': json.loads(doc.metadata.get("original_content","{}")),
            },
        }
        for position, doc in enumerate(chunks, start=1)
    ]

    with open(filename, 'w', encoding='utf-8') as handle:
        json.dump(export, handle, indent=6, ensure_ascii=False)

    print(f'Exported {len(export)} chunks to {filename}')
    return export

In [None]:
# Write the processed documents to the default "Chunks_exported.json" in the working directory.
json_data = export_chunks_to_json(processed_chunks)

In [None]:
def create_vector_store(documents,persist_directory="FAISS/pdfsVector"):
    """Embed ``documents`` into a FAISS index and save it to disk.

    ``persist_directory`` and ``collection_metadata`` are Chroma arguments;
    FAISS passes unknown keyword arguments to its constructor, which rejects
    them. The index is therefore built with FAISS' own options and written
    with ``save_local``.

    Args:
        documents: Documents to embed and index.
        persist_directory: Folder for ``save_local``; pass a falsy value to
            keep the index in memory only.

    Returns:
        The populated FAISS vector store.
    """
    embedding_model = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

    # L2 distance on unit-length vectors ranks results the same way cosine
    # similarity does, which is what the old 'hnsw:space': 'cosine' setting
    # asked for. The flag is not stored with the index, so pass
    # normalize_L2=True again when reloading with FAISS.load_local.
    vectorstore = FAISS.from_documents(
        documents=documents,
        embedding=embedding_model,
        normalize_L2=True,
    )

    if persist_directory:
        vectorstore.save_local(persist_directory)

    print('finished')
    return vectorstore


In [None]:
# Build the vector store from the processed documents; `db` backs the retriever below.
db=create_vector_store(processed_chunks)

In [None]:
def gen_final_ans(chunks, query):
    """Answer ``query`` with Gemini, using retrieved ``chunks`` as context.

    For every chunk, the JSON stored under ``metadata['original_content']`` is
    decoded; its raw text and table HTML are appended to the prompt and its
    base64 images are attached as ``image_url`` parts. The keys read here
    (``raw_text``, ``table_html``, ``image_base64``) are the ones
    ``summarize_chunks`` writes; the previously used ``tables_html`` and
    ``images_base64`` are still accepted as a fallback. Chunks without usable
    metadata contribute their ``page_content`` instead.

    Args:
        chunks: Iterable of documents exposing ``page_content`` and a
            ``metadata`` mapping.
        query: The user question.

    Returns:
        ``response.content`` from the model, or a fixed apology string when
        anything raises (the error is printed).
    """
    try:
        llm = ChatGoogleGenerativeAI(model='gemini-2.5-pro', temperature=0.1)

        prompt_text = f"""
        Based on the following document context, please answer this question: {query}
        
        CONTENT_TO_ANALYZE:
        """
        all_images_base64 = []
        for i, chunk in enumerate(chunks):
            prompt_text += f"\n--- Document Fragment {i+1} ---\n"

            if hasattr(chunk, "metadata") and "original_content" in chunk.metadata:
                try:
                    original_data = json.loads(chunk.metadata['original_content'])
                    if not isinstance(original_data, dict):
                        raise ValueError("original_content is not a JSON object")
                except (TypeError, ValueError):
                    # Undecodable or unexpected payload: fall back to the summary text.
                    # (json.JSONDecodeError is a ValueError subclass.)
                    prompt_text += f"Text:\n{chunk.page_content}\n\n"
                    continue

                raw_text = original_data.get("raw_text", "")
                if raw_text:
                    prompt_text += f"Text:\n{raw_text}\n\n"

                # Must match the keys written by summarize_chunks.
                tables_html = original_data.get("table_html") or original_data.get("tables_html") or []
                if tables_html:
                    prompt_text += 'TABLES:\n'
                    for j, table in enumerate(tables_html):
                        prompt_text += f"Table {j+1}:\n{table}\n\n"

                imgs = original_data.get("image_base64") or original_data.get("images_base64") or []
                all_images_base64.extend(img for img in imgs if img)
            else:
                prompt_text += f"Text:\n{chunk.page_content}\n\n"

        prompt_text += """ 
        INSTRUCTIONS:
        Provide a clear, comprehensive answer using the text, tables, and images provided above. 
        If the documents don't contain sufficient information to answer the question, state: "I don't have enough information to answer the question."
        
        ANSWER:"""

        message_content = [{'type': 'text', 'text': prompt_text}]

        for img_b64 in all_images_base64:
            message_content.append({"type": "image_url","image_url": {"url": f"data:image/jpeg;base64,{img_b64.strip()}"}})

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content

    except Exception as e:
        print(f"Answer gen failed: {e}")
        return 'Sorry, I encountered an error generating the answer.'

In [None]:
# One-off question answered manually: retrieve, then generate.
query="what are the steps for DBSCAN Clustering"

# Retriever over the vector store returning the top 3 matches per query.
retriver = db.as_retriever(search_kwargs={"k":3})

chunk_query=retriver.invoke(query)

final_ans = gen_final_ans(chunk_query,query)

In [None]:
# Show the generated answer.
print(final_ans)

In [None]:
# Fan the incoming question out into two keys: "query" receives it unchanged,
# "context" receives the retriever's result for it.
parallel_chain = RunnableParallel({
    "query":RunnablePassthrough(),
    "context": retriver 
})

In [None]:
# Inspect what the parallel step produces for a sample question.
parallel_chain.invoke("what are the steps for DBSCAN Clustering")

In [None]:
# Full pipeline: the parallel step's "context" and "query" outputs are passed to gen_final_ans.
chain = parallel_chain | RunnableLambda(lambda inputs: gen_final_ans(inputs['context'], inputs['query']))

# NOTE(review): rebinds `query` with the same literal used in the earlier retrieval cell.
query = "what are the steps for DBSCAN Clustering"
final_answer = chain.invoke(query)

print(final_answer)