In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone,ServerlessSpec
import google.generativeai as genai
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.schema.document import Document

  from tqdm.autonotebook import tqdm


In [3]:
pc=Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="mainrag1",dimension=768,metric="cosine",spec=ServerlessSpec(cloud="aws",region="us-east-1")
)

In [4]:
# loader = PyMuPDFLoader("temp/document-1724842136077.pdf")
# data = loader.load()
# data[0]
def load_documents():
    loader=PyPDFDirectoryLoader("temp/")
    return loader.load()


In [6]:
def split_documents(documents:list[Document]):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False  
    )
    return text_splitter.split_documents(documents)

In [7]:
documents=load_documents()
# print(documents[0])
chunks=split_documents(documents)
print(chunks[0])

page_content='Tenzin Delek  
+91-8091086833 | tibetdelek@gmail.com  | GitHub  | LinkedIn  | Portfolio  | Leetcode  
 
SUMMARY:  
Aspiring Full stack Software Developer with a focus o n Frontend Development,  passionate about problem solving and 
using  technology to address real world challenges . 
TECHNICAL SKILLS:  
• Frontend – NextJS , JavaScript , React, TypeScript , Tailwind CSS, HTML , React -Three -Fiber  
• Backend  | Programming Language  – Node.js, PHP , Express.js, Perl , Python , Shell Scripting  
• Database  – MySQL, MongoDB , Firebase  
• Others  – Power BI, Excel , Unit Te sting, Jest 
EDUCATION:  
Vellore Institute of Technology, Vellore                 CGPA: 9. 44 / 10 
Bachelor ’s Degree in Computer Applications  | September 2021 to May 2024  
 
EXPE RIENCE:' metadata={'source': 'temp\\document-1724842136077.pdf', 'page': 0}


In [11]:
# Initialize the embeddings model
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
processdata = []
for i, chunk in enumerate(chunks):
    # Get the embedding
    embedding = embeddings_model.embed_query(chunk.page_content)
    
    # Create a unique ID for each chunk (e.g., by using the index in the loop)
    vector_id = f"chunk-{i}"
    
    # Append to processdata
    processdata.append({
        "id": vector_id,           # Required unique ID
        "values": embedding,       # The embedding vector
        "metadata": {
            "text":chunks[i].page_content,
        }    })




In [12]:
processdata[0]

{'id': 'chunk-0',
 'values': [0.03829656541347504,
  0.018318505957722664,
  -0.005607785657048225,
  -0.007656094618141651,
  0.06718585640192032,
  0.08031639456748962,
  0.0525469109416008,
  -0.015531315468251705,
  0.013821178115904331,
  0.06040608137845993,
  -0.03579435870051384,
  0.010880823247134686,
  -0.06421057134866714,
  -0.05621317774057388,
  0.02154957689344883,
  -0.016825992614030838,
  0.011369085870683193,
  0.01244595367461443,
  0.004270429722964764,
  0.015163511037826538,
  -0.005476294085383415,
  -0.022898191586136818,
  0.0045959847047924995,
  -0.02638186514377594,
  0.02346722036600113,
  -0.008325977250933647,
  0.011993291787803173,
  -0.04147801920771599,
  -0.03971051424741745,
  0.03229585289955139,
  -0.07575350999832153,
  0.011758028529584408,
  -0.07296915352344513,
  0.026547525078058243,
  0.0055393981747329235,
  -0.02493240498006344,
  -0.024604814127087593,
  -0.01753605529665947,
  0.02593342587351799,
  0.007438252680003643,
  0.012245680

In [13]:
index=pc.Index("mainrag1")
index.upsert(
    vectors=processdata,
    namespace="ns1"
)

{'upserted_count': 5}

In [14]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}