In [1]:
%pip install --upgrade --user google-cloud-aiplatform>=1.29.0 google-cloud-storage

Note: you may need to restart the kernel to use updated packages.


ERROR: Can not perform a '--user' install. User site-packages are not visible in this virtualenv.


In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
from google.cloud import aiplatform
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import os
import json
from google.cloud import storage
import uuid
from langchain_google_vertexai import VectorSearchVectorStore
from langchain_core.documents import Document
from Multi_Modal.chunking import get_chunks
from Multi_Modal.SeperationAndSummarization import summarize_chunks
from conversion import convert_to_pdf


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
load_dotenv()

True

In [5]:
PROJECT_ID = os.environ['PROJECT_ID']
REGION = os.environ['REGION']
BUCKET_NAME=os.environ['BUCKET_NAME']
INDEX_DISPLAY_NAME=os.environ['INDEX_DISPLAY_NAME']

In [6]:
BUCKET_NAME

'vecor_embedding002'

In [7]:
REGION

'us-central1'

In [8]:
aiplatform.init(project=PROJECT_ID,location=REGION)

embeddings= GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [10]:
def upload_vector_to_gcs(documents, bucket_name=BUCKET_NAME):

    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(bucket_name)
    print("Completed Login")

    texts = []
    valid_indices = []
    
    for i, doc in enumerate(documents):
        try:
            raw_text = ""
            if hasattr(doc, "metadata") and "original_content" in doc.metadata:
                content_str = doc.metadata.get("original_content")
                if content_str:
                    data = json.loads(content_str)
                    raw_text = data.get('raw_text', "")
            
            if not raw_text:
                raw_text = doc.page_content 

            texts.append(raw_text)
            valid_indices.append(i)
            
        except Exception as e:
            print(f"Error parsing doc {i}: {e}")
            texts.append("") 

    print(f"Generating embeddings for {len(texts)} documents...")
    vectors = embeddings.embed_documents(texts)
    if vectors:
        print(f"!!! ACTUAL VECTOR DIMENSION: {len(vectors[0])} !!!") 


    vertex_content = []
    
    for i, doc_index in enumerate(valid_indices):
        doc = documents[doc_index]
        
        docs_id = str(uuid.uuid4()) 
        
        metadata_blob = bucket.blob(f"docstore/{docs_id}.json")
        
        try:
            content = json.loads(doc.metadata.get("original_content", "{}"))
        except:
            content = {}

        bucket_content = {
            'id': docs_id,
            'raw_text': content.get('raw_text', texts[i]), 
            'table_as_html': content.get('table_html', []),
            'image_base64': content.get('image_base64', [])
        }
        
        metadata_blob.upload_from_string(json.dumps(bucket_content))

        vertex_record = {
            "id": docs_id,
            "embedding": vectors[i] 
        }
        vertex_content.append(json.dumps(vertex_record))

    vector_data = "\n".join(vertex_content)
    
    unique_folder = f"init_vectors_{uuid.uuid4()}"
    
    blob_name = f"{unique_folder}/vectors.json"
    vector_blob = bucket.blob(blob_name)
    vector_blob.upload_from_string(vector_data)

    print(f"Success! Metadata in gs://{bucket_name}/docstore/")
    print(f"Vectors ready in gs://{bucket_name}/{unique_folder}/")
    
    gcs_uri = f"gs://{bucket_name}/{unique_folder}/"
    return gcs_uri


In [11]:
def create_and_deploy_index(gcs_uri):

    import traceback
    print("Creating Index (takes time)...")

    # Parameters (print them so we can debug what is sent to the API)
    dimensions = 3072
    approximate_neighbors_count = 150
    leaf_node_embedding_count = 100
    leaf_nodes_to_search_percent = 10
    distance_measure_type = "DOT_PRODUCT_DISTANCE"

    print("Index params:", {
        'dimensions': dimensions,
        'approximate_neighbors_count': approximate_neighbors_count,
        'leaf_node_embedding_count': leaf_node_embedding_count,
        'leaf_nodes_to_search_percent': leaf_nodes_to_search_percent,
        'distance_measure_type': distance_measure_type,
        'contents_delta_uri': gcs_uri,
    })

    try:
        my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
            display_name=INDEX_DISPLAY_NAME,
            contents_delta_uri=gcs_uri,
            dimensions=dimensions,
            approximate_neighbors_count=approximate_neighbors_count,
            # Required algorithm config params for Tree AH
            leaf_node_embedding_count=leaf_node_embedding_count,
            leaf_nodes_to_search_percent=leaf_nodes_to_search_percent,
            distance_measure_type=distance_measure_type,
        )
    except Exception as e:
        # Print rich diagnostics to help debug FAILED_PRECONDITION
        print("Index creation failed. Exception repr:", repr(e))
        try:
            print("Exception type:", type(e))
            # some google exceptions carry extra attributes
            if hasattr(e, 'errors'):
                print("e.errors:", e.errors)
            if hasattr(e, 'message'):
                print("e.message:", e.message)
        except Exception as diag_exc:
            print("Error printing exception attributes:", diag_exc)
        print("Full traceback:")
        print(traceback.format_exc())
        raise

    print("Creating Endpoint")
    try:
        my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
            display_name=f"{INDEX_DISPLAY_NAME}_endpoint",
            public_endpoint_enabled=True
        )
    except Exception as e:
        print("Endpoint creation failed:", repr(e))
        print(traceback.format_exc())
        raise

    print("Deploying Index to Endpoint")
    try:
        my_index_endpoint.deploy_index(
            index=my_index,
            deployed_index_id="soumya_deployed_v1",
        )
    except Exception as e:
        print("Deploy failed:", repr(e))
        print(traceback.format_exc())
        raise

    print("deployment Completed")
    return my_index_endpoint


In [32]:
def search_vertex_ai(query,index_endpoint, deployed_index_id, bucket_name=BUCKET_NAME):

    query_emb = embeddings.embed_query(query)

    response = index_endpoint.find_neighbors(
        deployed_index_id=deployed_index_id,
        queries = [query_emb],
        num_neighbors=5
    )

    print("Response:",response)


    results=[]
    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(bucket_name)
    print("Login successed")

    for neighbor in response[0]:
        doc_id = neighbor.id
        score = neighbor.distance

        blob = bucket.blob(f"docstore/{doc_id}.json")
        if blob.exists():
            data = json.loads(blob.download_as_text())

            # In search_vertex_ai...
            doc = Document(
                page_content=data.get('raw_text', ""), 
                metadata={
                    "tables": data.get('table_as_html', []),
                    "images": data.get('image_base64', []), 
                    "score": score
                }
            )


        results.append((doc,score))
        
    return results


In [13]:
%pwd

'd:\\Cite-What-You-Type'

In [14]:
file_path = "pdfs/Documentation-Project.pdf"
output_dir = "temp_uploads"

In [15]:
from pathlib import Path

In [16]:
output_dir= Path(output_dir)
output_dir.mkdir(exist_ok=True)

In [17]:
query= "Explain the workflow of the project. explain the tech stack used to build the project and tell how the query is procceed when a query is given to the final model"

In [None]:
print("Creating Chunks")
chunks = get_chunks(file_path)
print("Chunks Created")

print("summarize_chunks")
docs = summarize_chunks(chunks)
print("Summarization Completed")



In [19]:
docs[1].metadata.get('original_content')

'{"raw_text": "Advertisement and Publicity Team\\n\\n1.Shradhanjali Das (124AI0024) 2.Gangala Tanishka (124AI0005) 3.Suraj Kumar Sahu (124AI0013)\\n\\nPROBLEM STATEMENT :\\n\\nTitle: Automated Fake News Detection System Development\\n\\nIn the modern digital age, the rapid dissemination of unverified and misleading information\\u2014commonly referred to as \\"fake news\\" or \\"information disorder\\"\\u2014poses a critical threat to democratic processes, public health, and social stability. The sheer volume and velocity of content generated daily across platforms make manual verification impossible, necessitating advanced automated solutions. The ability of malicious actors to use sophisticated techniques to spread hoaxes and disinformation undermines trust in legitimate news sources and contributes to polarization.\\n\\nThis project focuses on the development of a robust and scalable Fake News Detection (FND) system designed to identify and flag deceptive articles. The system leverag

In [20]:
print("upload to GCS")
gcs_uri= upload_vector_to_gcs(docs)
print("Uploaded Completed")



upload to GCS
Completed Login
Generating embeddings for 14 documents...
Completed Login
Generating embeddings for 14 documents...
!!! ACTUAL VECTOR DIMENSION: 3072 !!!
!!! ACTUAL VECTOR DIMENSION: 3072 !!!
Success! Metadata in gs://vecor_embedding002/docstore/
Vectors ready in gs://vecor_embedding002/init_vectors_8be84b8c-5da2-4b61-b02e-885a53cbb735/
Uploaded Completed
Success! Metadata in gs://vecor_embedding002/docstore/
Vectors ready in gs://vecor_embedding002/init_vectors_8be84b8c-5da2-4b61-b02e-885a53cbb735/
Uploaded Completed


In [21]:
gcs_uri

'gs://vecor_embedding002/init_vectors_8be84b8c-5da2-4b61-b02e-885a53cbb735/'

In [59]:
from google.cloud import storage
import json

# Paste the URI that failed here
FAILED_URI = gcs_uri

def inspect_gcs_data(uri):
    bucket_name = uri.split("/")[2]
    prefix = "/".join(uri.split("/")[3:])
    
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=prefix))
    
    print(f"Found {len(blobs)} files in {uri}")
    
    for blob in blobs:
        print(f"\nChecking file: {blob.name} ({blob.size} bytes)")
        content = blob.download_as_text()
        lines = content.strip().split('\n')
        print(f"Total lines: {len(lines)}")
        
        if len(lines) > 0:
            first_line = json.loads(lines[0])
            print("First record keys:", first_line.keys())
            if 'embedding' in first_line:
                print("Vector dim:", len(first_line['embedding']))
            else:
                print("CRITICAL ERROR: 'embedding' key missing!")

inspect_gcs_data(FAILED_URI)


Found 1 files in gs://embeddings_store001/init_vectors_e5614764-d6d7-47e5-988f-bdda705e9a4a/

Checking file: init_vectors_e5614764-d6d7-47e5-988f-bdda705e9a4a/vectors.json (594878 bytes)
Total lines: 14
First record keys: dict_keys(['id', 'embedding'])
Vector dim: 3072


In [23]:
index_endpoint= create_and_deploy_index(gcs_uri)
print('Completed deploying index')



Creating Index (takes time)...
Index params: {'dimensions': 3072, 'approximate_neighbors_count': 150, 'leaf_node_embedding_count': 100, 'leaf_nodes_to_search_percent': 10, 'distance_measure_type': 'DOT_PRODUCT_DISTANCE', 'contents_delta_uri': 'gs://vecor_embedding002/init_vectors_8be84b8c-5da2-4b61-b02e-885a53cbb735/'}
Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/908027936872/locations/us-central1/indexes/7164041733611716608/operations/8405778952831893504
Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/908027936872/locations/us-central1/indexes/7164041733611716608/operations/8405778952831893504
MatchingEngineIndex created. Resource name: projects/908027936872/locations/us-central1/indexes/7164041733611716608
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/908027936872/locations/us-central1/indexes/7164041733611716608')
MatchingEngineIndex created. Resource name: projects/

In [24]:
index_endpoint

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x0000017710098830> 
resource name: projects/908027936872/locations/us-central1/indexEndpoints/6066896254858690560

In [33]:
results = search_vertex_ai(
    query="machine learning", 
    index_endpoint=index_endpoint,      
    deployed_index_id="soumya_deployed_v1", 
    bucket_name=BUCKET_NAME
)

Response: [[MatchNeighbor(id='195b5f53-90c8-4d3d-aab8-b9945cae468b', distance=0.6637460589408875, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='5d0458ad-c1f5-4bce-98cf-c4e964764b98', distance=0.6550101041793823, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='d78cd1a2-5669-4d2d-9705-d6d24f867768', distance=0.6543611288070679, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='ca3d2ff9-8201-4631-8e9c-3dbf5d0fc592', distance=0.6533534526824951, sparse_distance=None, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[]), MatchNeighbor(id='3b

In [34]:
results

[(Document(metadata={'tables': [], 'images': [], 'score': 0.6637460589408875}, page_content='Advertisement and Publicity Team\n\n1.Shradhanjali Das (124AI0024) 2.Gangala Tanishka (124AI0005) 3.Suraj Kumar Sahu (124AI0013)\n\nPROBLEM STATEMENT :\n\nTitle: Automated Fake News Detection System Development\n\nIn the modern digital age, the rapid dissemination of unverified and misleading information—commonly referred to as "fake news" or "information disorder"—poses a critical threat to democratic processes, public health, and social stability. The sheer volume and velocity of content generated daily across platforms make manual verification impossible, necessitating advanced automated solutions. The ability of malicious actors to use sophisticated techniques to spread hoaxes and disinformation undermines trust in legitimate news sources and contributes to polarization.\n\nThis project focuses on the development of a robust and scalable Fake News Detection (FND) system designed to identify

In [55]:
chunk,score = results[0]

In [56]:
score

0.6637460589408875

In [52]:
chunk.page_content

'Advertisement and Publicity Team\n\n1.Shradhanjali Das (124AI0024) 2.Gangala Tanishka (124AI0005) 3.Suraj Kumar Sahu (124AI0013)\n\nPROBLEM STATEMENT :\n\nTitle: Automated Fake News Detection System Development\n\nIn the modern digital age, the rapid dissemination of unverified and misleading information—commonly referred to as "fake news" or "information disorder"—poses a critical threat to democratic processes, public health, and social stability. The sheer volume and velocity of content generated daily across platforms make manual verification impossible, necessitating advanced automated solutions. The ability of malicious actors to use sophisticated techniques to spread hoaxes and disinformation undermines trust in legitimate news sources and contributes to polarization.\n\nThis project focuses on the development of a robust and scalable Fake News Detection (FND) system designed to identify and flag deceptive articles. The system leverages a curated, static training corpus, such a

In [35]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [57]:
def gen_final_ans(chunks, query):
    try:
        llm = ChatGoogleGenerativeAI(model='gemini-2.5-pro', temperature=0.1)

        prompt_text = f"""
        Based on the following document context, please answer this question: {query}
        
        CONTENT_TO_ANALYZE:
        """
        all_images_base64 = []
        for content in chunks:
            chunk,score=content
            
            prompt_text += f"\n--- Document Fragment {i+1} ---\n"
            
            if hasattr(chunk, "page_content"):
                try:
                    # original_data = json.loads(chunk.metadata['original_content'])
                    
                    raw_text = chunk.page_content
                    if raw_text:
                        prompt_text += f"Text:\n{raw_text}\n\n"

                    tables_html = chunk.metadata.get("tables", [])
                    if tables_html:
                        prompt_text += 'TABLES:\n'
                        for j, table in enumerate(tables_html):
                            prompt_text += f"Table {j+1}:\n{table}\n\n"
                    
                    
                    imgs = chunk.metadata.get("images", [])
                    if imgs:
                        all_images_base64.extend(imgs)

                except json.JSONDecodeError:
                    prompt_text += f"Text:\n{chunk.page_content}\n\n"
            else:
                prompt_text += f"Text:\n{chunk.page_content}\n\n"

        prompt_text += """ 
        INSTRUCTIONS:
        Provide a clear, comprehensive answer using the text, tables, and images provided above. 
        If the documents don't contain sufficient information to answer the question, state: "I don't have enough information to answer the question."
        
        ANSWER:"""
    
        message_content = [{'type': 'text', 'text': prompt_text}]

        for img_b64 in all_images_base64:
            message_content.append({"type": "image_url","image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}})

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content
        
    except Exception as e:
        print(f"Answer gen failed: {e}")
        return 'Sorry, I encountered an error generating the answer.'

In [58]:
gen_final_ans(query,results)

Answer gen failed: not enough values to unpack (expected 2, got 1)


'Sorry, I encountered an error generating the answer.'