In [None]:
%pip install --upgrade --user google-cloud-aiplatform>=1.29.0 google-cloud-storage

In [None]:
import os

In [None]:
# Show the current working directory.
%pwd

In [None]:
# Move the kernel's working directory up one level.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
os.chdir("../")

In [None]:
from google.cloud import aiplatform
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import os
import json
from typing import List
from google.cloud import storage
import uuid
from langchain_google_vertexai import VectorSearchVectorStore
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from Multi_Modal.chunking import get_chunks
from Multi_Modal.SeperationAndSummarization import summarize_chunks
from conversion import convert_to_pdf
from pinecone import Pinecone,ServerlessSpec

In [None]:
# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

In [None]:
# Required deployment settings. Indexing os.environ directly means a missing
# variable raises KeyError here instead of failing later inside a cloud call.
PROJECT_ID = os.environ["PROJECT_ID"]
REGION = os.environ["REGION"]
BUCKET_NAME = os.environ["BUCKET_NAME"]
INDEX_DISPLAY_NAME = os.environ["INDEX_DISPLAY_NAME"]

In [None]:
# Point the Vertex AI SDK at the configured project and region.
aiplatform.init(location=REGION, project=PROJECT_ID)

# Embedding client shared by the ingest and search helpers defined in this notebook.
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [None]:
def upload_vector_to_gcs(documents, bucket_name=BUCKET_NAME):
    """Embed documents and stage payloads + vectors in GCS for Vertex AI Vector Search.

    For each document the text to embed is the ``raw_text`` field of the JSON held in
    ``metadata["original_content"]``; when that is missing or empty, ``page_content``
    is used instead. Each usable document gets a fresh UUID; its payload (raw text,
    tables, images) is written to ``docstore/<id>.json`` and its ``{"id", "embedding"}``
    record is added to a JSON-lines file at ``init_vectors_<uuid>/vectors.json``.

    Documents that cannot be parsed are logged and skipped entirely, so text, payload
    and vector always belong to the same document.

    Args:
        documents: Sequence of objects exposing ``page_content`` and ``metadata``.
        bucket_name: Name of the target GCS bucket.

    Returns:
        The ``gs://<bucket>/init_vectors_<uuid>/`` folder URI containing ``vectors.json``.

    Raises:
        ValueError: If the embedding model returns a different number of vectors
            than texts submitted.
    """
    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(bucket_name)
    print("Completed Login")

    # One (parsed original_content, text to embed) pair per usable document. Keeping
    # them in a single list means a skipped document can never shift later indices.
    prepared = []
    for i, doc in enumerate(documents):
        try:
            content = {}
            if hasattr(doc, "metadata") and "original_content" in doc.metadata:
                content_src = doc.metadata.get("original_content")
                if content_src:
                    content = json.loads(content_src) if isinstance(content_src, str) else content_src
                    if not isinstance(content, dict):
                        raise TypeError(
                            f"original_content must decode to a JSON object, got {type(content).__name__}"
                        )

            raw_text = content.get("raw_text", "") or doc.page_content
            prepared.append((content, raw_text))
        except Exception as e:
            # Best effort: report and drop this document, keep processing the rest.
            print(f"Error parsing doc {i}: {e}")

    texts = [text for _, text in prepared]
    print(f"Generating embeddings for {len(texts)} documents...")
    vectors = embeddings.embed_documents(texts) if texts else []
    if len(vectors) != len(texts):
        raise ValueError(
            f"Embedding model returned {len(vectors)} vectors for {len(texts)} texts."
        )
    if vectors:
        print(f"!!! ACTUAL VECTOR DIMENSION: {len(vectors[0])} !!!")

    vertex_content = []
    for (content, text), vector in zip(prepared, vectors):
        docs_id = str(uuid.uuid4())

        bucket_content = {
            'id': docs_id,
            'raw_text': content.get('raw_text', text),
            'table_as_html': content.get('table_html', []),
            'image_base64': content.get('image_base64', [])
        }
        bucket.blob(f"docstore/{docs_id}.json").upload_from_string(json.dumps(bucket_content))

        vertex_content.append(json.dumps({"id": docs_id, "embedding": vector}))

    # One JSON record per line, in a fresh folder so each run is imported in isolation.
    vector_data = "\n".join(vertex_content)
    unique_folder = f"init_vectors_{uuid.uuid4()}"
    bucket.blob(f"{unique_folder}/vectors.json").upload_from_string(vector_data)

    print(f"Success! Metadata in gs://{bucket_name}/docstore/")
    print(f"Vectors ready in gs://{bucket_name}/{unique_folder}/")

    gcs_uri = f"gs://{bucket_name}/{unique_folder}/"
    return gcs_uri


In [None]:
# Sanity check that the first document exposes page_content.
# NOTE(review): `docs` is only assigned by the summarization cell further down, so this
# cell raises NameError on a fresh top-to-bottom run.
hasattr(docs[0],"page_content")

In [None]:
# Inspect the original_content payload stored on the first document.
# NOTE(review): `docs` is only assigned by the summarization cell further down, so this
# cell raises NameError on a fresh top-to-bottom run.
docs[0].metadata["original_content"]

In [None]:
def upload_files(documents: List[Document], index_name: str, bucket_name: str = BUCKET_NAME,
                 cloud: str = "gcp", region: str = "us-central1"):
    """Ingest documents into a hybrid store: payloads in GCS, vectors in Pinecone.

    For every document a payload (raw text, tables, images) is written to
    ``docstore/<id>.json`` in the bucket, and a slim ``Document`` carrying
    ``page_content`` plus ``id`` / ``gcs_uri`` / ``source`` metadata is upserted into
    the Pinecone index through LangChain. The index is created first when it does
    not exist yet.

    The corpus is embedded only once, by ``PineconeVectorStore.from_documents``. When
    the index has to be created, a single text is embedded beforehand just to learn
    the vector dimension.

    Args:
        documents: Documents to ingest; ``page_content`` is the text that gets embedded.
        index_name: Name of the Pinecone index to create or reuse.
        bucket_name: GCS bucket receiving the ``docstore/`` payloads.
        cloud: Serverless cloud provider used if the index is created.
        region: Serverless region used if the index is created.

    Returns:
        The ``PineconeVectorStore`` wrapping the populated index.

    Raises:
        ValueError: If ``documents`` is empty or the embedding model returns nothing.
    """
    documents = list(documents)
    if not documents:
        raise ValueError("No documents supplied; nothing to ingest.")

    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(bucket_name)
    pc = Pinecone()

    print("Login successful to GCP & Pinecone")

    # Missing or empty page_content becomes "" so positions stay aligned with `documents`.
    texts = [getattr(doc, "page_content", "") or "" for doc in documents]

    print("Initializing index in Pinecone")
    existing_indexes = [i["name"] for i in pc.list_indexes()]
    if index_name not in existing_indexes:
        # Embed one (preferably non-empty) text to discover the dimension instead of
        # embedding the whole corpus a second time.
        probe_text = next((t for t in texts if t), texts[0])
        probe = embeddings.embed_documents([probe_text])
        if not probe:
            raise ValueError("No embeddings generated; check your documents/embeddings model.")
        dimension = len(probe[0])
        print(f"!!! ACTUAL VECTOR DIMENSION: {dimension} !!!")

        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud=cloud, region=region)
        )
        print(f"Created Pinecone index: {index_name}")
    else:
        print(f"Pinecone index '{index_name}' already exists")

    pinecone_docs: List[Document] = []

    for doc, text in zip(documents, texts):
        doc_id = uuid.uuid4().hex[:15]
        metadata = getattr(doc, "metadata", None) or {}

        # original_content may be a JSON string, an already-decoded dict, None, or
        # malformed; anything that is not a dict degrades to an empty payload.
        original_content = metadata.get("original_content") or {}
        try:
            content = json.loads(original_content) if isinstance(original_content, str) else original_content
        except (TypeError, ValueError):
            content = {}
        if not isinstance(content, dict):
            content = {}

        bucket_content = {
            "id": doc_id,
            "raw_text": content.get("raw_text", text),
            "table_as_html": content.get("table_html", []),
            "image_base64": content.get("image_base64", []),
        }

        metadata_blob = bucket.blob(f"docstore/{doc_id}.json")
        metadata_blob.upload_from_string(json.dumps(bucket_content))

        pinecone_docs.append(
            Document(
                page_content=text,
                metadata={
                    "id": doc_id,
                    "gcs_uri": f"gs://{bucket_name}/docstore/{doc_id}.json",
                    # `or` also covers an explicit None value for the source key.
                    "source": metadata.get("source") or "unknown",
                },
            )
        )

    print("Upserting documents into Pinecone via LangChain...")
    vector_store = PineconeVectorStore.from_documents(
        documents=pinecone_docs,
        embedding=embeddings,
        index_name=index_name,
    )

    print("Ingestion complete: GCS + Pinecone hybrid store is ready.")
    return vector_store


In [None]:
# Scratch check of a 15-character hex id (`.hex` is already a str, so no str() wrapper is needed).
uuid.uuid1().hex[:15]

In [None]:
def create_and_deploy_index(gcs_uri, dimensions=3072, deployed_index_id="soumya_deployed_v1"):
    """Create a Tree-AH Vector Search index from staged vectors and deploy it.

    Runs three steps in order: create the index from the records under ``gcs_uri``,
    create a public index endpoint, and deploy the index to that endpoint. A failure
    in any step is printed together with its traceback and then re-raised.

    Args:
        gcs_uri: ``gs://`` folder holding the vector records; passed to the index as
            ``contents_delta_uri``.
        dimensions: Vector size the index is created with. It should equal the
            embedding length reported by the upload step (TODO confirm for the
            embedding model in use). Defaults to the previously hard-coded 3072.
        deployed_index_id: Id under which the index is deployed on the endpoint.
            Defaults to the previously hard-coded value.

    Returns:
        The ``MatchingEngineIndexEndpoint`` the index was deployed to.
    """
    import traceback
    print("Creating Index (takes time)...")

    approximate_neighbors_count = 150
    leaf_node_embedding_count = 100
    leaf_nodes_to_search_percent = 10
    distance_measure_type = "DOT_PRODUCT_DISTANCE"

    print("Index params:", {
        'dimensions': dimensions,
        'approximate_neighbors_count': approximate_neighbors_count,
        'leaf_node_embedding_count': leaf_node_embedding_count,
        'leaf_nodes_to_search_percent': leaf_nodes_to_search_percent,
        'distance_measure_type': distance_measure_type,
        'contents_delta_uri': gcs_uri,
    })

    try:
        my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
            display_name=INDEX_DISPLAY_NAME,
            contents_delta_uri=gcs_uri,
            dimensions=dimensions,
            approximate_neighbors_count=approximate_neighbors_count,
            leaf_node_embedding_count=leaf_node_embedding_count,
            leaf_nodes_to_search_percent=leaf_nodes_to_search_percent,
            distance_measure_type=distance_measure_type,
        )
    except Exception as e:
        print("Index creation failed. Exception repr:", repr(e))
        print("Exception type:", type(e))
        # Some API exceptions carry a structured `errors` attribute with extra detail.
        errors = getattr(e, 'errors', None)
        if errors is not None:
            print("e.errors:", errors)
        print("Full traceback:")
        print(traceback.format_exc())
        raise

    print("Creating Endpoint")
    try:
        my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
            display_name=f"{INDEX_DISPLAY_NAME}_endpoint",
            public_endpoint_enabled=True
        )
    except Exception as e:
        print("Endpoint creation failed:", repr(e))
        print(traceback.format_exc())
        raise

    print("Deploying Index to Endpoint")
    try:
        my_index_endpoint.deploy_index(
            index=my_index,
            deployed_index_id=deployed_index_id,
        )
    except Exception as e:
        print("Deploy failed:", repr(e))
        print(traceback.format_exc())
        raise

    print("deployment Completed")
    return my_index_endpoint


In [None]:
def search_vertex_ai(query,index_endpoint, deployed_index_id, bucket_name=BUCKET_NAME, num_neighbors=5):
    """Query a deployed Vertex AI index and hydrate the hits from the GCS docstore.

    The query is embedded, the nearest neighbours are fetched from the endpoint, and
    for each neighbour the payload ``docstore/<id>.json`` is downloaded from the bucket
    and turned into a ``Document``. Neighbours whose payload blob is missing are
    reported and skipped.

    Args:
        query: Natural-language query text.
        index_endpoint: Endpoint object exposing ``find_neighbors``.
        deployed_index_id: Id of the deployed index to search.
        bucket_name: GCS bucket that holds the ``docstore/`` payloads.
        num_neighbors: Number of neighbours requested from the index.

    Returns:
        A list of ``Document`` objects, one per neighbour with an existing payload, with
        ``id``, ``score`` (the neighbour's ``distance``), ``tables`` and ``images``
        in their metadata. Empty when the index returns nothing.
    """
    query_emb = embeddings.embed_query(query)

    response = index_endpoint.find_neighbors(
        deployed_index_id=deployed_index_id,
        queries=[query_emb],
        num_neighbors=num_neighbors,
    )

    print("Response:",response)

    results = []
    if not response:
        # No result list for our single query: nothing to hydrate.
        return results

    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(bucket_name)
    print("Login successed")

    # One query was sent, so its neighbours are the first (only) entry of the response.
    for neighbor in response[0]:
        doc_id = neighbor.id

        blob = bucket.blob(f"docstore/{doc_id}.json")
        if not blob.exists():
            print(f"Skipping neighbor {doc_id}: docstore/{doc_id}.json not found in bucket {bucket_name}")
            continue

        data = json.loads(blob.download_as_text())
        results.append(
            Document(
                page_content=data.get('raw_text', ""),
                metadata={
                    "id": doc_id,
                    "score": neighbor.distance,
                    "tables": data.get('table_as_html', []),
                    "images": data.get('image_base64', []),
                }
            )
        )

    return results


In [None]:
# Source PDF handed to get_chunks, and the name of the working directory created below.
file_path = "pdfs/Documentation-Project.pdf"
output_dir = "temp_uploads"

In [None]:
from pathlib import Path

In [None]:
# Promote the directory name to a Path and create it if it is not there yet.
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)

In [None]:
# Sample question used to exercise the retrieval helpers.
query1= "Explain the workflow of the project. explain the tech stack used to build the project and tell how the query is procceed when a query is given to the final model"

In [None]:
# Stage 1: split the source file into chunks.
print("Creating Chunks")
chunks = get_chunks(file_path)
print("Chunks Created")

# Stage 2: summarize the chunks; `docs` feeds the upload helpers.
print("summarize_chunks")
docs = summarize_chunks(chunks)
print("Summarization Completed")

In [None]:
# Peek at the text of the first document returned by summarize_chunks.
docs[0].page_content

In [None]:
# Peek at the original_content payload of the second document (None if the key is absent).
docs[1].metadata.get('original_content')

In [None]:
# Ingest the summarized documents into GCS + Pinecone and keep the resulting vector store.
vector_store = upload_files(documents=docs, index_name="soumya-index", bucket_name=BUCKET_NAME)

In [None]:
# Direct handle to the Pinecone index, addressed by host URL.
# NOTE(review): the host is specific to one deployment; consider moving it to configuration.
pc = Pinecone()
index_host = "https://soumya-index-fughegr.svc.gcp-us-central1-4a9f.pinecone.io"
index = pc.Index(host=index_host)

In [None]:
def hybrid_pinecone_retriever(query,index_name,top_k=5,bucket_name=BUCKET_NAME):
    """Search Pinecone and hydrate each hit with its full payload from GCS.

    The query is embedded and sent to the Pinecone index. Each match's metadata is
    expected to carry a ``gcs_uri`` pointing at a JSON payload; that payload is
    downloaded and converted to a ``Document``. Matches without metadata, without a
    ``gcs_uri``, or whose blob does not exist are skipped. A failure on one match is
    printed and does not stop the remaining matches.

    Args:
        query: Natural-language query text.
        index_name: Name of the Pinecone index to query.
        top_k: Number of matches requested from Pinecone.
        bucket_name: Kept for interface compatibility; the bucket actually read is
            the one named inside each match's ``gcs_uri``.

    Returns:
        A list of ``Document`` objects (possibly empty) with ``id``, ``score``,
        ``tables``, ``images`` and ``gcs_uri`` in their metadata. Always a list, even
        when the response cannot be read.
    """
    pc = Pinecone()
    index = pc.Index(index_name)
    client = storage.Client(project=PROJECT_ID)

    print("Login successful to GCP & Pinecone")

    emb_query = embeddings.embed_query(query)

    res = index.query(
        vector=emb_query,
        top_k=top_k,
        include_metadata=True
    )

    docs = []
    try:
        matches = res['matches']
    except Exception as e:
        print(f"Error:{e}")
        return docs

    for result in matches:
        try:
            if "metadata" not in result:
                continue
            metadata = result.get("metadata") or {}

            gcs_uri = metadata.get('gcs_uri', "")
            if not gcs_uri:
                continue

            # "gs://bucket/path/to/blob.json" -> ("bucket", "path/to/blob.json")
            path = gcs_uri.replace("gs://", "")
            bucket_name_gcs, _, blob_name = path.partition("/")
            print(f"Fetching gs://{bucket_name_gcs}/{blob_name}")

            blob = client.bucket(bucket_name_gcs).blob(blob_name)
            if not blob.exists():
                continue

            data = json.loads(blob.download_as_text())

            docs.append(
                Document(
                    page_content=data.get("raw_text", ""),
                    metadata={
                        "id": metadata.get("id", ""),
                        "score": result["score"],
                        "tables": data.get("table_as_html", []),
                        "images": data.get("image_base64", []),
                        "gcs_uri": gcs_uri,
                    }
                )
            )
        except Exception as e:
            # Best effort per match: report and continue with the next one.
            print("Error , Unable to fetch metadata",e)

    return docs

In [None]:
# Retrieve and hydrate the top matches for the sample question.
document = hybrid_pinecone_retriever(query=query1, index_name="soumya-index")

In [None]:
# LangChain retriever over the Pinecone store, returning the 10 closest documents per query.
retriever_search_kwargs = {"k": 10}
retriever = vector_store.as_retriever(search_kwargs=retriever_search_kwargs)

In [None]:
# Stage payloads and vectors in GCS; the returned folder URI is the input for index creation.
print("upload to GCS")
gcs_uri= upload_vector_to_gcs(docs)
print("Uploaded Completed")



In [None]:
# Build the Vertex AI index from the staged vectors and deploy it to a new endpoint.
index_endpoint= create_and_deploy_index(gcs_uri)
print('Completed deploying index')

In [None]:
# Print display name and resource id of every index endpoint the SDK lists.
endpoints = aiplatform.MatchingEngineIndexEndpoint.list()
for endpoint in endpoints:
    print(f"Name: {endpoint.display_name}")
    print(f"ID: {endpoint.name}")


In [None]:
# Attach to an existing index endpoint by its full resource name, assembled from
# the PROJECT_NO and ENDPOINT_ID environment variables and the configured REGION.
endpoint_resource_name = (
    f"projects/{os.environ['PROJECT_NO']}/locations/{REGION}"
    f"/indexEndpoints/{os.environ['ENDPOINT_ID']}"
)
my_endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name=endpoint_resource_name
)


In [None]:
# Search the deployed Vertex AI index with the sample question.
# `query1` is the question defined earlier; the notebook never defines a bare `query`.
results = search_vertex_ai(
    query=query1,
    index_endpoint=my_endpoint,
    deployed_index_id="soumya_deployed_v1",
    bucket_name=BUCKET_NAME
)

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import base64


In [None]:
# Inspect the metadata attached to the first search result.
results[0].metadata

In [None]:
# Confirm the first search result exposes page_content.
hasattr(results[0],"page_content")

In [None]:
def gen_final_ans(results,query, model='gemini-2.5-pro'):
    """Answer ``query`` with a multimodal Gemini call grounded in retrieved chunks.

    Builds one text prompt containing, per chunk, its ``page_content`` and every
    entry of ``metadata["tables"]``; every entry of ``metadata["images"]`` (base64
    strings) is attached to the same message as an image part.

    Args:
        results: Iterable of retrieved chunks exposing ``page_content`` and a
            ``metadata`` mapping with optional ``tables`` and ``images`` entries.
        query: The user's question.
        model: Chat model name passed to ``ChatGoogleGenerativeAI``.

    Returns:
        The model's answer (``response.content``), or a fixed apology string if
        anything in prompt construction or the model call raises.
    """
    try:
        llm = ChatGoogleGenerativeAI(model=model, temperature=0.1)

        prompt_text = f"""
        Based on the following document context, please answer this question: {query}
        
        CONTENT_TO_ANALYZE:
        """
        all_images_base64 = []
        for i, chunk in enumerate(results):
            prompt_text += f"\n--- Document Fragment {i+1} ---\n"

            raw_text = getattr(chunk, "page_content", "")
            if raw_text:
                prompt_text += f"Text:\n{raw_text}\n\n"

            # The attribute is `metadata`; the earlier check for "metdata" never matched,
            # so tables and images were silently left out of the prompt.
            metadata = getattr(chunk, "metadata", None) or {}

            tables = metadata.get("tables") or []
            if isinstance(tables, str):
                # A lone HTML string must not be iterated character by character.
                tables = [tables]
            for j, cnt in enumerate(tables):
                prompt_text += f"Table {j+1}:\n{cnt}\n\n"

            imgs = metadata.get("images") or []
            if isinstance(imgs, str):
                imgs = [imgs]
            all_images_base64.extend(imgs)

        prompt_text += """ 
            INSTRUCTIONS:
            Provide a clear, comprehensive answer using the text, tables, and images provided above. 
            If the documents don't contain sufficient information to answer the question, state: "I don't have enough information to answer the question."
            
            ANSWER:"""
        message_content = [{'type': 'text', 'text': prompt_text}]

        # NOTE(review): every image is declared as JPEG; confirm the stored images really are.
        for img_b64 in all_images_base64:
            message_content.append({"type": "image_url","image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}})

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content

    except Exception as e:
        print(f"Answer gen failed: {e}")
        return 'Sorry, I encountered an error generating the answer.'

                


            


In [None]:
# Generate the grounded answer for the sample question.
# `query1` is the question defined earlier; the notebook never defines a bare `query`.
gen_final_ans(results, query1)

In [None]:
print('Based on the document context provided, here is an explanation of the project\'s workflow, its technology stack, and how a query is processed.\n\n### Project Workflow\n\nThe project operates as a decoupled microservice architecture with distinct frontend and backend services. The end-to-end workflow is as follows:\n\n1.  **User Input**: A user interacts with the client application (frontend) by entering a news article and submitting it for classification.\n2.  **API Request**: The frontend application sends the article text via an asynchronous HTTP request to the exposed backend API endpoint, which is built with FastAPI.\n3.  **Backend Processing**: The backend receives the text and initiates a two-part verification process:\n    *   **Model Prediction**: The text is first classified by the fine-tuned DeBERTa model, which returns an initial prediction ("Fake" or "True").\n    *   **External Validation**: Simultaneously, the application makes an external API call to a live information source (like a news archive or fact-checking service) to gather real-time context.\n4.  **Cross-Validation**: The information from the external source is used to cross-check and validate the DeBERTa model\'s initial prediction, enhancing the system\'s temporal robustness.\n5.  **Response**: The backend sends the final, validated classification result back to the frontend.\n6.  
**Result Display**: The frontend receives the binary result ("Fake" or "True") and renders it to the user in a clear format.\n\n### Technology Stack\n\nThe project utilizes a modern technology stack for its frontend, backend, machine learning model, and deployment.\n\n**Frontend:**\n*   **Build Tool**: **Vite.js** was used for fast development and optimized bundling.\n*   **Language**: **TypeScript** was implemented for strong type safety.\n*   **Styling**: **Tailwind CSS** was used for its utility-first approach to rapid and responsive design.\n\n**Backend:**\n*   **API Framework**: **FastAPI** was chosen for its high performance and asynchronous capabilities, making it ideal for serving the ML model.\n\n**Machine Learning:**\n*   **Core Model**: **microsoft/deberta-v3-base**, a Transformer-based model from Hugging Face.\n*   **ML Libraries**: **PyTorch** and **Hugging Face Transformers** were used for building, training, and managing the model.\n*   **Training Management**: The **Hugging Face Trainer API** was used to simplify the fine-tuning process.\n\n**Deployment & Infrastructure:**\n*   **Containerization**: **Docker** was used to package the frontend and backend applications into separate, reproducible containers.\n*   **Cloud Platform**: **Google Cloud Platform (GCP)** was used to host and serve both the containerized frontend and backend services, ensuring high availability and scalability.\n\n### Query Processing by the Final Model\n\nWhen a user submits a query (a news article), the backend system processes it through a hybrid approach designed to ensure accuracy and temporal relevance:\n\n1.  **Initial Classification**: The raw text from the query is first fed into the fine-tuned **DeBERTa classification model**. This model analyzes the linguistic patterns in the text based on its training and produces a primary prediction of "Fake" or "True".\n2.  
**External Cross-Validation**: To overcome the limitation of the model\'s knowledge being restricted to its training data (up to 2019), the system simultaneously makes an **external API call** to a live service like a news archive or a fact-checking explorer.\n3.  **Verdict Augmentation**: The information retrieved from this external source—such as source veracity or previously debunked claims—is used to **cross-check and validate** the DeBERTa model\'s prediction.\n4.  **Final Verdict**: This hybrid process ensures the final verdict is not solely reliant on past linguistic patterns but is also enhanced with real-time, contextual evidence, thereby improving the overall trustworthiness of the classification.')