In [None]:
import os
os.chdir("../")
%pwd

In [None]:
import base64
from io import BytesIO
from PIL import Image

In [None]:
file_path="pdfs/22-25 Clustering, K-means, DBSCAN.pdf"

In [None]:
from pinecone import Pinecone
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from pinecone_text.sparse import BM25Encoder
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_core.documents import Document
from Multi_Modal.SeperationAndSummarization import summarize_chunks
from langchain_community.document_loaders import UnstructuredWordDocumentLoader,UnstructuredExcelLoader,UnstructuredPowerPointLoader
from unstructured.partition.docx import partition_docx
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.pdf import partition_pdf
from  unstructured.chunking.title import chunk_by_title
from IPython.display import HTML

In [None]:
load_dotenv()

In [None]:
def extract_pdf_elements(filepath: str) -> list:
    """Partition the PDF at ``filepath`` into unstructured elements.

    The file is handed to ``partition_pdf`` with the ``hi_res`` strategy,
    table-structure inference enabled, and ``Image`` blocks extracted into
    the element payload.

    Args:
        filepath: Path of the PDF file to partition.

    Returns:
        Whatever ``partition_pdf`` returns for the file (its length is
        printed before returning).
    """
    partition_options = {
        "strategy": "hi_res",
        "infer_table_structure": True,
        "extract_image_block_types": ['Image'],
        "extract_image_block_to_payload": True,
    }
    extracted = partition_pdf(filename=filepath, **partition_options)
    print("Total elements extracted -> ", len(extracted))
    return extracted


In [None]:
def create_chunks_by_title(elements):
    """Group partitioned elements into chunks with ``chunk_by_title``.

    Args:
        elements: Elements to chunk (as produced by ``extract_pdf_elements``).

    Returns:
        The chunks returned by ``chunk_by_title`` when called with
        ``max_characters=1500`` and ``combine_text_under_n_chars=200``.
    """
    title_chunks = chunk_by_title(
        elements, max_characters=1500, combine_text_under_n_chars=200
    )
    chunk_total = len(title_chunks)
    print("Chunks created: ", chunk_total)
    return title_chunks

In [None]:
def get_chunks(filepath:str):
    """Run the full PDF pipeline: partition the file, then chunk by title.

    Args:
        filepath: Path of the PDF file to process.

    Returns:
        The chunks produced by ``create_chunks_by_title`` from the elements
        returned by ``extract_pdf_elements``.
    """
    return create_chunks_by_title(extract_pdf_elements(filepath))

In [None]:
chunk = get_chunks(file_path)

In [None]:
summarized_chunk = summarize_chunks(chunk)

In [None]:
text1=chunk[10].to_dict()
text1

In [None]:
for elem in chunk[10].metadata.orig_elements:
    print(type(elem).__name__)


In [None]:
def seperate_content_types(chunk):
    """Split a chunk into its text, table and image payloads.

    The chunk's ``metadata.orig_elements`` are scanned by class name:
    ``Table`` elements contribute their HTML rendering (falling back to the
    plain text when no HTML is available) and ``Image`` elements contribute
    their base64 payload.

    Args:
        chunk: Object exposing ``text`` and, optionally,
            ``metadata.orig_elements``.

    Returns:
        dict with keys:
            ``text``   - the chunk text,
            ``tables`` - list of table HTML/text strings,
            ``images`` - list of base64 image strings,
            ``types``  - de-duplicated content kinds found, in first-seen
                         order (always starts with ``'text'``).
    """
    content_data = {
        "text": chunk.text,
        "tables": [],
        "images": [],
        "types": ['text'],
    }

    chunk_metadata = getattr(chunk, "metadata", None)
    # ``orig_elements`` may be absent or set to None; treat both as "no elements".
    orig_elements = getattr(chunk_metadata, "orig_elements", None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, "metadata", None)

        if element_type == "Table":
            content_data['types'].append('Table')
            # The attribute can exist but hold None, in which case a plain
            # getattr default would never kick in; fall back explicitly.
            table_html = getattr(element_metadata, 'text_as_html', None) or element.text
            content_data['tables'].append(table_html)

        elif element_type == "Image":
            image_b64 = getattr(element_metadata, "image_base64", None)
            # Only record images that actually carry a payload.
            if image_b64:
                content_data['types'].append("Image")
                content_data['images'].append(image_b64)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    content_data['types'] = list(dict.fromkeys(content_data['types']))
    return content_data

In [None]:
sep=seperate_content_types(chunk[10])
sep

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

In [None]:
model.eval()

In [None]:
def base_64_to_pil_image(image):
    """Decode a base64-encoded image into an RGB PIL image.

    Args:
        image: Base64 data accepted by ``base64.b64decode``.

    Returns:
        The image opened with ``Image.open`` and converted to ``"RGB"``.
    """
    decoded = BytesIO(base64.b64decode(image))
    return Image.open(decoded).convert("RGB")


In [None]:
def caption_from_element(element:list) ->list[str]:
    """Generate one caption per base64 image using the notebook's BLIP model.

    Relies on the module-level ``processor`` and ``model`` objects and on
    ``base_64_to_pil_image`` defined earlier in the notebook.

    Args:
        element: Iterable of base64-encoded images.

    Returns:
        List of caption strings, one per input image, in input order.
    """
    captions = []
    for image_b64 in element:
        pil_img = base_64_to_pil_image(image_b64)
        inputs = processor(images=pil_img, return_tensors="pt")
        out = model.generate(**inputs, max_new_tokens=64)

        # The keyword is ``skip_special_tokens``; the previous misspelling was
        # not the tokenizer's option, so special tokens stayed in the caption.
        captions.append(processor.decode(out[0], skip_special_tokens=True))
    return captions


In [None]:
caption_from_element(sep["images"])

In [None]:
summarized_chunk

In [None]:
bm25 = BM25Encoder.default()

In [None]:
pc=Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

In [None]:
pc.list_indexes().names()

In [None]:
# Create the "hybrid-demo" index only if it is not already listed, so this
# cell can be re-run without attempting to create a duplicate index.
index_name = "hybrid-demo"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Presumably the output size of gemini-embedding-001; a later cell
        # prints len(emb_model.embed_query(...)) to check this. TODO confirm.
        dimension=3072,
        # NOTE(review): Pinecone sparse-dense (hybrid) queries are expected to
        # need the dotproduct metric; confirm against the current Pinecone docs.
        metric="dotproduct", 
        spec={"serverless": {"cloud": "gcp", "region": "us-central1"}}
    )

In [None]:
hasattr(summarized_chunk[0],"page_content")

In [None]:
summarized_chunk[0].page_content

In [None]:
def get_text(document) ->list:
    """Collect ``page_content`` from every item that has that attribute.

    Args:
        document: Iterable of objects; items without a ``page_content``
            attribute are skipped.

    Returns:
        List of the ``page_content`` values, in input order.
    """
    return [item.page_content for item in document if hasattr(item, "page_content")]
            

In [None]:
text_list = get_text(summarized_chunk)

In [None]:
index=pc.Index(index_name)

In [None]:
emb_model=GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [None]:
bm25.fit(text_list)

In [None]:
bm25.dump("bm25_enc.json")

In [None]:
retreiver = PineconeHybridSearchRetriever(embeddings=emb_model,sparse_encoder=bm25,index=index)

In [None]:
values=bm25.encode_documents(text_list[0])['values']

In [None]:
len(values)

In [None]:
len(emb_model.embed_query(text_list[0]))

In [None]:
# Upsert every summary text into the Pinecone index in small batches.
batch_size = 2
vectors_to_upsert = []

for i, text in enumerate(text_list):
    # Each record carries a dense embedding, a BM25 sparse encoding, and the
    # text itself under the "text" metadata key.
    vector = {
        "id": f"doc{i}",
        "values": emb_model.embed_query(text),
        "sparse_values": bm25.encode_documents(text),
        "metadata": {"text": text}
    }
    vectors_to_upsert.append(vector)
    
    # Send the pending records as soon as a full batch has accumulated.
    if len(vectors_to_upsert) >= batch_size:
        index.upsert(vectors_to_upsert)
        vectors_to_upsert = []

# Flush whatever is left when len(text_list) is not a multiple of batch_size.
if vectors_to_upsert:
    index.upsert(vectors_to_upsert)


In [None]:
retreiver = PineconeHybridSearchRetriever(embeddings=emb_model,sparse_encoder=bm25,index=index,text_key="text" )

In [None]:
retreiver.invoke("member invoved in the project")

In [None]:
dense_index_name="dense-for-hybrid-search"
sparse_index_name="sparse-for-hybrid-search"

In [None]:
import json
import uuid
from typing import List, Optional, Tuple

from google.cloud import storage
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore, PineconeSparseVectorStore
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder

In [None]:
PROJECT_ID = os.environ['PROJECT_ID']
REGION = os.environ['REGION']
BUCKET_NAME=os.environ['BUCKET_NAME']
INDEX_DISPLAY_NAME=os.environ['INDEX_DISPLAY_NAME']

In [None]:
summarized_chunk[0].page_content

In [None]:
summarized_chunk[0].metadata

In [None]:
isinstance(summarized_chunk[0].metadata,dict)

In [None]:
k=1

In [None]:
k

In [None]:
def upload_files(
    document: List[Document],
    index_name: str,
    bucket_name: str,
    project_id: str,
    namespace: Optional[str] = None,
) -> Tuple[BM25Encoder, PineconeHybridSearchRetriever]:
    """Index documents in Pinecone (dense + BM25 sparse) and store payloads in GCS.

    For every document with usable text, a JSON payload (raw text, summary,
    tables, images) is written to ``gs://<bucket_name>/docstore_<k>/<id>.json``
    and a hybrid vector pointing at that object is upserted into the Pinecone
    index. The index is created with the embedding dimension if it does not
    exist yet.

    Side effect: reads the module-level counter ``k`` to pick the docstore
    prefix and increments it before returning.

    Args:
        document: Documents to index. The text is taken from ``page_content``
            or, failing that, from ``metadata['original_content']``.
            ``metadata['original_content']`` must be a JSON string; documents
            for which it is missing or invalid are reported and skipped.
        index_name: Name of the Pinecone index to use or create.
        bucket_name: Name of the GCS bucket receiving the JSON payloads.
        project_id: Google Cloud project used for the storage client.
        namespace: Pinecone namespace used for BOTH the upserts and the
            returned retriever. ``None`` (default) means the index's default
            namespace, which is where vectors were written before this
            parameter existed.

    Returns:
        ``(bm25, retriever)``: the BM25 encoder fitted on the extracted texts
        and a hybrid retriever bound to the index and ``namespace``.

    Raises:
        ValueError: If no text could be extracted or the probe embedding fails.
        Exception: Whatever the GCS client raises on initialisation.
    """
    global k

    try:
        client = storage.Client(project=project_id)
        bucket = client.bucket(bucket_name)
        print("Logging Completed in Gcloud")
    except Exception as e:
        print(f"Warning: Could not initialize GCS client: {e}")
        raise  # bare raise keeps the original traceback

    # ---- 1. Collect the text to index for each document -------------------
    valid_texts = []
    valid_docs_original = []

    for doc in document:
        try:
            content = ""
            if hasattr(doc, "page_content") and doc.page_content:
                content = doc.page_content
            elif hasattr(doc, "metadata") and "original_content" in doc.metadata:
                content = doc.metadata['original_content']

            if content:
                valid_texts.append(content)
                valid_docs_original.append(doc)
            else:
                print(f"Skipping document with no content: {doc.metadata.get('source', 'unknown')}")

        except Exception as e:
            print(f"Error extracting text from doc: {e}")

    print(f"Extracted {len(valid_texts)} valid texts")

    if not valid_texts:
        raise ValueError("No valid texts extracted from documents.")

    # ---- 2. Dense model (probe once to learn the dimension) and BM25 ------
    emb_model = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

    try:
        sample_vec = emb_model.embed_query(valid_texts[0])
        dim = len(sample_vec)
        print("Created Dense Vectors, Dimension of each vector is: ", dim)
    except Exception as e:
        raise ValueError(f"Failed to generate valid embedding: {e}")

    bm25 = BM25Encoder.default()
    bm25.fit(valid_texts)

    # ---- 3. Make sure the Pinecone index exists ----------------------------
    print("Initializing Pinecone...")
    pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
    existing_indexes = [i["name"] for i in pc.list_indexes()]
    print(index_name)
    if index_name not in existing_indexes:
        print(f"Creating new Pinecone index: {index_name}")
        pc.create_index(
            name=index_name,
            dimension=dim,
            metric="dotproduct",
            spec=ServerlessSpec(cloud="gcp", region="us-central1")
        )
    else:
        print(f"Using existing Pinecone index: {index_name}")

    index = pc.Index(index_name)

    # ---- 4. Upload payloads to GCS and upsert vectors in batches ----------
    print("Processing documents and upserting to Pinecone...")

    vectors_to_upsert = []
    batch_size = 2

    for i, doc in enumerate(valid_docs_original):
        doc_id = uuid.uuid4().hex[:15]
        blob_path = f"docstore_{k}/{doc_id}.json"
        try:
            metadata_dict = doc.metadata if isinstance(doc.metadata, dict) else {}
            # A missing/invalid 'original_content' raises here and the
            # document is skipped by the handler below.
            orig_cnt = json.loads(metadata_dict['original_content'])

            bucket_content = {
                "id": doc_id,
                "raw_text": orig_cnt.get('raw_text', ""),
                "summ_text": valid_texts[i],
                'table_as_html': orig_cnt.get('table_as_html', {}),
                'image_base64': orig_cnt.get('image_base64', {})
            }
            bucket.blob(blob_path).upload_from_string(json.dumps(bucket_content))

            vectors_to_upsert.append({
                "id": doc_id,
                "values": emb_model.embed_query(valid_texts[i]),
                "sparse_values": bm25.encode_documents(valid_texts[i]),
                "metadata": {
                    "text": valid_texts[i],
                    "gcs_uri": f"gs://{bucket_name}/{blob_path}"
                }
            })

            if len(vectors_to_upsert) >= batch_size:
                index.upsert(vectors=vectors_to_upsert, namespace=namespace)
                vectors_to_upsert = []
                print(f"Upserted batch ending at {i}")

        except Exception as e:
            # Best effort: report the failure and move on to the next document.
            print(f"Error processing doc {i}: {e}")

    # Flush the trailing partial batch; without this the last document(s) are
    # never indexed whenever the count is not a multiple of batch_size.
    if vectors_to_upsert:
        try:
            index.upsert(vectors=vectors_to_upsert, namespace=namespace)
            print(f"Upserted final batch of {len(vectors_to_upsert)} vectors")
        except Exception as e:
            print(f"Error upserting final batch: {e}")

    # The retriever must read from the same namespace the vectors were
    # written to, otherwise every query comes back empty.
    retriever = PineconeHybridSearchRetriever(
        embeddings=emb_model,
        sparse_encoder=bm25,
        index=index,
        text_key="text",
        namespace=namespace
    )

    k = k + 1
    return bm25, retriever


In [None]:
index_name = "user-"+str(uuid.uuid4().hex[:15])

In [None]:
index_name

In [None]:
res=upload_files(summarized_chunk,index_name,BUCKET_NAME,PROJECT_ID)

In [None]:
bm25,retriver = res

In [None]:
retriver.alpha =0.6
retriver.top_k=8
retriver.namespace="my-namespace"

In [None]:
retriver.invoke("what is the slides about")

In [None]:
pc1=Pinecone()

In [None]:
index_name

In [None]:
index_obj=pc1.Index(index_name)

In [None]:
emb_model = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [None]:
query="what is the slides about"


In [None]:
sparse_vec=bm25.encode_queries(query)
dense_vec=emb_model.embed_query(query)

In [None]:
# Hybrid query: the dense embedding and the BM25 sparse encoding of `query`
# are sent together; metadata is requested so each match carries its stored
# "text" and "gcs_uri" fields.
# NOTE(review): no namespace is passed here, so this presumably searches the
# index's default namespace; confirm it matches where the vectors were upserted.
results=index_obj.query(
    vector=dense_vec,
    sparse_vector=sparse_vec,
    top_k=10,
    include_metadata=True
)

In [None]:
results

In [None]:
# Build the rerank payload: one dict per match with the text to score and a
# two-item [vector id, gcs_uri] list that gen_answer later uses to fetch the
# full payload. A comprehension is used so no loop temporaries (in particular
# a global named `id`, which would shadow the builtin) leak into the kernel.
doc_rerank = [
    {
        "text": match.metadata["text"],
        "metadata": [match.id, match.metadata["gcs_uri"]],
    }
    for match in results.matches
]


In [None]:
doc_rerank[0]['text']

In [None]:
# Rerank the hybrid-search hits against the query and keep the best five.
# rank_fields=['text'] names the field of each doc_rerank entry to score.
# return_documents=True presumably returns each document alongside its score,
# which the next cells rely on when reading dt['document']; TODO confirm.
reranked=pc1.inference.rerank(
    model="cohere-rerank-3.5",
    query=query,
    documents=doc_rerank,
    top_n=5,
    rank_fields=['text'],
    return_documents=True
)

In [None]:
reranked.data

In [None]:
final_docs=[dt['document'] for dt in reranked.data]

In [None]:
for doc in final_docs:
    print(doc)

In [None]:
hasattr(final_docs[0],'metadata')

In [None]:
def gen_answer(final_docs,bucket_name,PROJECT_ID):
    """Fetch the stored payload for each reranked document from GCS.

    Every item of ``final_docs`` is expected to carry a ``metadata`` entry of
    the form ``[vector_id, gcs_uri]``. The JSON object behind ``gcs_uri`` is
    downloaded and wrapped in a ``Document`` whose ``page_content`` is the
    stored raw text and whose metadata holds the id, tables, images and URI.

    Items without usable metadata, without a URI, or whose blob does not exist
    are skipped. Errors from the storage client propagate to the caller.

    Args:
        final_docs: Iterable of reranked documents; mappings (``.get``) and
            attribute-style objects are both accepted.
        bucket_name: Fallback bucket, used only when a URI carries no bucket
            component; normally the bucket is parsed from each ``gcs_uri``.
        PROJECT_ID: Google Cloud project used for the storage client.

    Returns:
        List of ``Document`` objects, in the order of ``final_docs``.
    """
    client = storage.Client(project=PROJECT_ID)
    docs = []

    for ranked_doc in final_docs:
        # Support mapping-like results (dicts, objects exposing .get) as well
        # as plain attribute access.
        getter = getattr(ranked_doc, "get", None)
        if callable(getter):
            metadata = getter("metadata")
        else:
            metadata = getattr(ranked_doc, "metadata", None)

        # Metadata must be the [id, gcs_uri] pair; anything else cannot be
        # resolved to a stored payload.
        if not isinstance(metadata, (list, tuple)) or len(metadata) < 2:
            continue

        doc_id, gcs_uri = metadata[0], metadata[1]
        if not gcs_uri:
            # Without a URI there is nothing to download.
            continue

        path = gcs_uri[len("gs://"):] if gcs_uri.startswith("gs://") else gcs_uri
        bucket_name_gcs, _, blob_name = path.partition("/")
        if not blob_name:
            continue

        blob = client.bucket(bucket_name_gcs or bucket_name).blob(blob_name)
        if not blob.exists():
            continue

        data = json.loads(blob.download_as_text())

        docs.append(
            Document(
                page_content=data.get("raw_text", ""),
                metadata={
                    "id": doc_id,
                    "tables": data.get("table_as_html", []),
                    "images": data.get("image_base64", []),
                    "gcs_uri": gcs_uri,
                },
            )
        )

    return docs
            

In [None]:
desc=gen_answer(final_docs,BUCKET_NAME,PROJECT_ID)

In [None]:
desc[3].page_content

In [None]:
from langchain_core.messages import HumanMessage

In [None]:
def gen_final_ans(results,query):
    """Ask Gemini to answer ``query`` from the retrieved text, tables and images.

    A single multimodal message is built: one text part containing every
    fragment's text and tables, followed by one ``image_url`` part per image
    (sent as a ``data:image/jpeg;base64,...`` URL).

    Args:
        results: Iterable of retrieved documents. ``page_content`` is used as
            the fragment text; ``metadata['tables']`` and
            ``metadata['images']`` (if present) supply tables and base64
            images.
        query: The user's question.

    Returns:
        The model's response content, or a fixed apology string if anything
        fails (the error is printed).
    """
    try:
        llm = ChatGoogleGenerativeAI(model='gemini-2.5-pro', temperature=0.1)

        prompt_text = f"""
        Based on the following document context, please answer this question: {query}
        
        CONTENT_TO_ANALYZE:
        """
        all_images_base64 = []
        for i, chunk in enumerate(results):
            prompt_text += f"\n--- Document Fragment {i+1} ---\n"

            raw_text = getattr(chunk, "page_content", None)
            if raw_text:
                prompt_text += f"Text:\n{raw_text}\n\n"

            # Previously guarded by hasattr(chunk, "metdata") (typo), which is
            # never true, so tables and images were silently left out.
            metadata = getattr(chunk, "metadata", None) or {}
            for j, cnt in enumerate(metadata.get("tables") or [], start=1):
                prompt_text += f"Table {j}:\n{cnt}\n\n"
            all_images_base64.extend(metadata.get("images") or [])

        prompt_text += """ 
            INSTRUCTIONS:
            Provide a clear, comprehensive answer using the text, tables, and images provided above. 
            If the documents don't contain sufficient information to answer the question, state: "I don't have enough information to answer the question."
            
            ANSWER:"""
        message_content = [{'type': 'text', 'text': prompt_text}]

        for img_b64 in all_images_base64:
            message_content.append({"type": "image_url","image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}})

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content

    except Exception as e:
        # Deliberate best effort: report the failure and return a fixed reply.
        print(f"Answer gen failed: {e}")
        return 'Sorry, I encountered an error generating the answer.'


In [None]:
ans=gen_final_ans(desc,query)

In [None]:
print(ans)

In [None]:
bm25.dump("bm25_1enc.json")