In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone,ServerlessSpec
import google.generativeai as genai
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.schema.document import Document

  from tqdm.autonotebook import tqdm


In [2]:
pc=Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="mainragdemo",dimension=768,metric="cosine",spec=ServerlessSpec(cloud="aws",region="us-east-1")
)

In [3]:
# loader = PyMuPDFLoader("temp/document-1724842136077.pdf")
# data = loader.load()
# data[0]
def load_documents():
    loader=PyPDFDirectoryLoader("temp/")
    return loader.load()


In [4]:
def split_documents(documents:list[Document]):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False  
    )
    return text_splitter.split_documents(documents)

In [5]:
documents=load_documents()
# print(documents[0])
chunks=split_documents(documents)
print(chunks[0])

page_content='Tenzin Delek  
+91-8091086833 | tenzin.delek2021@vitstudent.ac.in  | GitHub  | LinkedIn  | Portfolio  | Leetcode  
 
SUMMARY:  
Aspiring Full stack Software Developer with a focus o n Frontend Development,  passionate about problem solving and 
using  technology to address real world challenges . 
TECHNICAL SKILLS:  
• Frontend – NextJS , JavaScript , React, TypeScript , Tailwind CSS, HTML , React -Three -Fiber  
• Backend  | Programming Language  – Node.js, PHP , Express.js, Perl , Python , Shell Scripting  
• Database  – MySQL, MongoDB , PostgreSQL , Supabase, Pinecone DB  
• Others  – Power BI, Excel , Unit Testing, Jest , Langchain , Prisma  
EDUCATION:  
Vellore Institute of Technology, Vellore                 CGPA: 9. 44 / 10' metadata={'source': 'temp\\document-1724943222137.pdf', 'page': 0}


In [6]:
# Initialize the embeddings model
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
processdata = []
for i, chunk in enumerate(chunks):
    # Get the embedding
    embedding = embeddings_model.embed_query(chunk.page_content)
    
    # Create a unique ID for each chunk (e.g., by using the index in the loop)
    vector_id = f"chunk-{i}"
    
    # Append to processdata
    processdata.append({
        "id": vector_id,           # Required unique ID
        "values": embedding,       # The embedding vector
        "metadata": {
            "text":chunks[i].page_content,
        }    })




In [9]:
processdata[0]

{'id': 'chunk-0',
 'values': [0.029989222064614296,
  0.010975473560392857,
  -0.00694636395201087,
  -0.00964477937668562,
  0.06921552866697311,
  0.08006737381219864,
  0.04596365615725517,
  -0.0055670589208602905,
  0.007489281240850687,
  0.06289681047201157,
  -0.03916227072477341,
  0.013143500313162804,
  -0.06436993926763535,
  -0.042205654084682465,
  0.021371042355895042,
  -0.01002153567969799,
  0.025521283969283104,
  0.004388756118714809,
  -0.0014248768566176295,
  0.0028708435129374266,
  -0.0033184061758220196,
  -0.022820360958576202,
  0.006109247915446758,
  -0.016810342669487,
  0.0178048275411129,
  -0.008391555398702621,
  0.007217407692223787,
  -0.03699582442641258,
  -0.04943571239709854,
  0.013452564366161823,
  -0.09266562014818192,
  0.0043860930018126965,
  -0.07835543900728226,
  0.025660445913672447,
  0.00524168461561203,
  -0.027001770213246346,
  -0.006406704429537058,
  -0.007808932568877935,
  0.03342839330434799,
  0.014346800744533539,
  0.0224

In [10]:
#inserting in the pinecone vector
index=pc.Index("mainragdemo")
index.upsert(
    vectors=processdata,
    namespace="ns1"
)

{'upserted_count': 5}

In [11]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 5}},
 'total_vector_count': 5}