In [None]:
import json
import os
import uuid
from pathlib import Path
from urllib.parse import quote_plus

from dotenv import load_dotenv
from google.cloud import storage
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from Multi_Modal.chunking import get_chunks
from Multi_Modal.SeperationAndSummarization import summarize_chunks

In [None]:
# Load variables from a local .env file (if present) so the os.environ lookups
# in the following cells can find the bucket, project and MongoDB settings.
load_dotenv()

In [None]:
# Show the kernel's current working directory (inspection only).
%pwd

In [None]:
# Move the kernel to the parent of the starting directory so that relative
# paths used later (e.g. "pdfs/...") resolve from there.
# The target is computed only once per kernel session: a plain
# os.chdir("../") would climb one more level every time this cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = Path.cwd().parent
os.chdir(PROJECT_ROOT)

In [None]:
# Required GCP settings; a missing variable raises KeyError right here.
# PDF_BUCKET_NAME: bucket that upload_file_to_gcp writes to.
PDF_BUCKET_NAME = os.environ['PDF_BUCKET_NAME']
# project_id: project passed to storage.Client in upload_file_to_gcp.
project_id=os.environ['PROJECT_ID']

In [None]:
from pathlib import Path
from google.cloud import storage
import uuid

def upload_file_to_gcp(local_file_path):
    """Upload a local file to the PDF bucket under a unique object name.

    The object is stored as ``uploads/<uuid4>_<file name>`` in the bucket
    named by ``PDF_BUCKET_NAME``, using a storage client for ``project_id``.

    Args:
        local_file_path: Location of the file to upload (str or path-like).

    Returns:
        The ``gs://`` URI of the uploaded object, or ``None`` when any step
        fails. Failures are printed, not raised.
    """
    try:
        file_name = Path(local_file_path).name
        # Random prefix so repeated uploads of the same file get distinct names.
        unique_value = str(uuid.uuid4())
        dest_name = f"uploads/{unique_value}_{file_name}"

        bucket = storage.Client(project=project_id).bucket(PDF_BUCKET_NAME)

        print("Successfully logged in gcp")
        print(dest_name)

        bucket.blob(dest_name).upload_from_filename(str(local_file_path))

        print(f"File {local_file_path} uploaded to gs://{PDF_BUCKET_NAME}/{dest_name}.")
    except Exception as upload_error:
        print("Could not upload file to GCP:", upload_error)
        return None
    else:
        return f"gs://{PDF_BUCKET_NAME}/{dest_name}"


In [None]:
# PDF to ingest, given relative to the current working directory.
path="pdfs/Documentation-Project.pdf"

In [None]:
# Upload the PDF. The result is the gs:// URI, or None if the upload failed
# (upload_file_to_gcp prints the error instead of raising).
uploded_path =upload_file_to_gcp(path)

In [None]:
# Split the PDF into chunks with the project's Multi_Modal.chunking helper
# (the structure of the returned chunks is not visible from this notebook).
documents=get_chunks(path)

In [None]:
# Summarize the chunks. Later cells read .page_content and
# .metadata["original_content"] on each returned item.
docs=summarize_chunks(documents)

In [None]:
# MongoDB connection settings from the environment; KeyError if any is missing.
user_name =os.environ['MONGO_DB_USER']
password = os.environ['MONGO_DB_PASS']
cluster=os.environ['MONGO_DB_CLUSTER']


In [None]:
# Build the Atlas connection string from the credentials read from the
# environment. Both parts are percent-encoded: characters such as "@", ":" or
# "/" in a raw user name or password would otherwise corrupt the URI.
# NOTE(review): the host is still hard-coded; `cluster` (MONGO_DB_CLUSTER) is
# presumably meant to replace it — confirm its format before wiring it in.
uri=f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}@cluster0.dc1mcv.mongodb.net/?appName=Cluster0"

In [None]:
# Create the MongoDB client, requesting server API version '1'.
client = MongoClient(uri,server_api=ServerApi('1'))

# Ping the deployment to confirm the connection works.
# A failure is only printed here, not re-raised.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
# Inspect the first summary's page_content (the field that gets embedded later).
docs[0].page_content

In [None]:
# Inspect the JSON payload stored under the first summary's
# "original_content" metadata entry.
json.loads(docs[0].metadata["original_content"])

In [None]:
# Shared embedding client (model "gemini-embedding-001") used by
# get_embeddings and get_query_result.
embeddings=GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [None]:
def inject_data(doc):
    """Build a storable record (raw content plus one embedding) for a document.

    Args:
        doc: Object whose ``metadata["original_content"]`` is a JSON string
            with the keys ``raw_text``, ``table_html`` and ``image_base64``.
            When ``doc`` has ``page_content`` that text is embedded,
            otherwise ``raw_text`` is.

    Returns:
        dict with the keys ``text``, ``table_html``, ``images`` and
        ``embeddings`` (a single vector), or an empty dict when ``doc`` has no
        ``"original_content"`` metadata.

    Raises:
        Exception: any parsing or embedding error is printed and re-raised.
    """
    content={}

    try:
        if hasattr(doc,"metadata") and "original_content" in doc.metadata:
            json_data = json.loads(doc.metadata['original_content'])
            content['text']=json_data['raw_text']
            content['table_html']=json_data['table_html']
            content['images']=json_data['image_base64']

            if hasattr(doc,"page_content"):
                text_to_embed = doc.page_content
            else:
                text_to_embed = json_data['raw_text']

            # embed_documents takes a list of texts and returns one vector per
            # item. A bare string would be iterated character by character,
            # so index 0 would not be the embedding of the whole text.
            # The shared module-level `embeddings` client is reused instead of
            # constructing a new client for every document.
            content['embeddings']=embeddings.embed_documents([text_to_embed])[0]

        return content
    except Exception as e:
        print("Error:",e)
        raise

In [None]:
def get_embeddings(text):
    """Pair a text with its embedding vector, ready to insert into MongoDB.

    Args:
        text: The string to embed.

    Returns:
        dict with ``"text"`` (the input) and ``"embeddings"`` (one vector for
        the whole text).
    """
    # embed_documents takes a list of texts and returns one vector per item.
    # A bare string would be iterated character by character, so index 0
    # would not be the embedding of the whole text.
    return {
        "text":text,
        "embeddings":embeddings.embed_documents([text])[0]
    }

In [None]:
# Embed every summary's page_content and collect the resulting records,
# in the same order as `docs`.
docs_to_inject = list(
    map(get_embeddings, (summary.page_content for summary in docs))
)

In [None]:
# Database and collection that hold the embedded records.
db=client["vector_db"]
collections=db['mycollections']

In [None]:
# Store every embedded record in the collection.
# NOTE(review): running this cell more than once is presumably not safe
# (repeated inserts of the same records) — confirm before re-running.
collections.insert_many(docs_to_inject)

In [None]:
from pymongo.operations import SearchIndexModel
import time

In [None]:
index_name="vector_index"

# Vector index over the "embeddings" field, compared by cosine similarity.
# NOTE(review): numDimensions=3072 is assumed to match the length of the
# vectors produced by the embedding model — confirm if the model changes.
search_index_model=SearchIndexModel(
    definition={
        "fields":[
            {
                "type":"vector",
                "numDimensions":3072,
                "path":"embeddings",
                "similarity":"cosine"
            }
        ]
    },
    name=index_name,
    type="vectorSearch"
)

# Create the index only when it does not exist yet, so the cell can be re-run.
if not list(collections.list_search_indexes(index_name)):
    collections.create_search_index(model=search_index_model)

# Index builds are asynchronous. Wait until the index is reported as queryable
# before moving on; otherwise the search cells can run against an index that
# is not ready and return nothing.
INDEX_READY_TIMEOUT_S = 300
INDEX_POLL_INTERVAL_S = 5
_index_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while True:
    _index_info = list(collections.list_search_indexes(index_name))
    if _index_info and _index_info[0].get("queryable") is True:
        print(f"Search index '{index_name}' is ready for querying.")
        break
    if time.monotonic() >= _index_deadline:
        raise TimeoutError(
            f"Search index '{index_name}' was not queryable after "
            f"{INDEX_READY_TIMEOUT_S} seconds."
        )
    time.sleep(INDEX_POLL_INTERVAL_S)

In [None]:
# Short sample question and its query embedding from the shared client.
query="tell me about the project"
query_emb =embeddings.embed_query(query)

In [None]:
# Longer, multi-part sample question.
query1= "Explain the workflow of the project. explain the tech stack used to build the project and tell how the query is procceed when a query is given to the final model"

In [None]:
def get_query_result(query, limit=3, num_candidates=3072):
    """Run a vector search for ``query`` and return the best-matching records.

    Args:
        query: Question text; embedded with the shared ``embeddings`` client.
        limit: Maximum number of records to return. Defaults to 3.
        num_candidates: Number of nearest-neighbour candidates considered
            before the top ``limit`` are kept. Defaults to 3072. Raised to
            ``limit`` when a smaller value is passed.

    Returns:
        list of dicts from the aggregation, each projected to ``text`` and
        ``score`` (the vector-search score), plus MongoDB's ``_id``.
    """
    query_emb = embeddings.embed_query(query)

    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embeddings",
                "queryVector": query_emb,
                # numCandidates must not be smaller than limit.
                "numCandidates": max(num_candidates, limit),
                "limit": limit,
            }
        },
        {
            "$project": {
                "text": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    # Materialize the cursor so callers get a plain list.
    return list(collections.aggregate(pipeline))


In [None]:
# Run the vector search for the short sample question and display the matches.
get_query_result(query)