In [1]:
from langchain_together import TogetherEmbeddings
from langchain_core.documents import Document
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os

In [2]:
# Set API keys
os.environ["TOGETHER_API_KEY"] = "tgp_v1_Gdl66OKThh1KsJjEym9JEgDMqFWqd6bXtlZhviYqf34"
os.environ["PINECONE_API_KEY"] = "pcsk_3875g1_PSfiVC6hgEBa7mPwUMFf6dbhmZa68JiueGaf5eSYDwKoyt8JABHRYsirkcLfRnm"
os.environ["PINECONE_ENVIRONMENT"] = "gcp-starter"  # Replace with your Pinecone environment

In [3]:
# Initialize Pinecone

pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

print(pc.list_indexes())

{'indexes': [{'dimension': 768,
              'host': 'langchain-embeddings-h7crht6.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'langchain-embeddings',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'movie-index-h7crht6.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'movie-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [4]:
index_name = "langchain-embeddings"
if index_name not in pc.list_indexes().names():
    pc.create_index(
            name=index_name,
            dimension=768,  # Adjust dimension as per embedding model
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )  # Adjust dimension as per embedding model
    print("Index created:", index_name)
index = pc.Index(index_name)

In [5]:
filepath = r"D:\Python_01\Python_01\Langchain01\Introduction_to_Data_and_Data_Science.pdf"

In [6]:
reader = PdfReader(filepath)
text = ""
for page in reader.pages:
    text += page.extract_text()

In [7]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_text(text)
print(chunks[0])

Analysis vs Analytics 
Alright! So… 
Let’s discuss the not-so-obvious differences 
between the terms analysis and analytics. 
Due to the similarity of the words, some people 
believe they share the same meaning, and thus 
use them interchangeably. Technically, this 
isn’t correct. There is, in fact, a distinct 
difference between the two. And the reason 
for one often being used instead of the other 
is the lack of a transparent understanding 
of both. 
So, let’s clear this up, shall we? 
First, we will start with analysis. 
Consider the following… 
You have a huge dataset containing data of 
various types. Instead of tackling the entire 
dataset and running the risk of becoming overwhelmed, 
you separate it into easier to digest chunks 
and study them individually and examine how 
they relate to other parts. And that’s analysis 
in a nutshell. 
One important thing to remember, however, 
is that you perform analyses on things that 
have already happened in the past. Such as


In [8]:
embedding = TogetherEmbeddings(
    model="BAAI/bge-base-en-v1.5",
    api_key=os.getenv("TOGETHER_API_KEY")
)

In [9]:
# Insert documents (Create) using Pinecone client directly

ids = []
for i, chunk in enumerate(chunks):
    # Get embedding vector for the chunk
    embedding_vector = embedding.embed_documents([chunk])[0]
    # Create a unique ID for each chunk
    doc_id = f"20250815-{i}"
    # Prepare metadata
    metadata = {"text":chunk,"source": filepath, "chunk_index": i}
    # Upsert into Pinecone
    index.upsert([
        {
            "id": doc_id,
            "values": embedding_vector,
            "metadata": metadata
        }
    ])
    ids.append(doc_id)
    print(f"Stored chunk {i+1}/{len(chunks)}: {doc_id}")

Stored chunk 1/10: 20250815-0
Stored chunk 2/10: 20250815-1
Stored chunk 3/10: 20250815-2
Stored chunk 4/10: 20250815-3
Stored chunk 5/10: 20250815-4
Stored chunk 6/10: 20250815-5
Stored chunk 7/10: 20250815-6
Stored chunk 8/10: 20250815-7
Stored chunk 9/10: 20250815-8
Stored chunk 10/10: 20250815-9


In [10]:
# Read (Retrieve) all documents using Pinecone client directly

# Get all vector IDs (you should have stored them in 'ids' during upsert)
# If you don't have the list, you can use index.describe_index_stats() to get all IDs
stats = index.describe_index_stats()
all_ids = []
for ns in stats['namespaces'].values():
    all_ids.extend(ns['vector_count'] * [None])  # Pinecone does not return IDs here, so you should track them during upsert

# If you have the IDs list from upsert:
if ids:
    response = index.fetch(ids)
    vectors = response['vectors']
    if vectors:
        first_id = list(vectors.keys())[0]
        first_vector = vectors[first_id]
        print("First document ID:", first_id)
        print("Metadata:", first_vector.get("metadata"))
        print("Embedding:", first_vector.get("values"))
    else:
        print("No documents found.")
else:
    print("No documents found.")

First document ID: 20250815-7
Metadata: {'chunk_index': 7.0, 'source': 'D:\\Python_01\\Python_01\\Langchain01\\Introduction_to_Data_and_Data_Science.pdf', 'text': 'useful, in fact, just the opposite—they \nare a lot easier to learn and be adopted by \nothers. You have already heard of several \nof those. \nBecause of its ability to do relatively complex \ncomputations and good visualizations quickly, \nExcel is a tool applicable to more than one category—traditional data, BI, and Data \nScience. Similarly, SPSS is a very famous \ntool for working with traditional data and \napplying statistical analysis. \nAmong the many applications we have plotted, \nwe can say there is an increasing amount of \nsoftware designed for working with big data \nsuch as Apache Hadoop, Apache Hbase, and Mongo \nDB. \nIn terms of big data, Hadoop is the name that \nmust stick with you. Hadoop is listed as a \nsoftware in the sense that it is a collection \nof programs, but don’t imagine it as a nice-looking

In [25]:
# Update an existing record in Pinecone DB using the Pinecone client

# Suppose you want to update the record with ID 'doc-0'
update_id = "doc-0"
updated_content = "Updated content for Pinecone."
updated_metadata = {"source": filepath, "chunk_index": 0, "updated": True}

# Get new embedding for the updated content
updated_embedding = embedding.embed_documents([updated_content])[0]

# Upsert the updated vector (same ID, new values/metadata)
index.upsert([
    {
        "id": update_id,
        "values": updated_embedding,
        "metadata": updated_metadata
    }
])
print(f"Document {update_id} updated in Pinecone")

Document doc-0 updated in Pinecone


In [11]:
ids = ["doc-0", "doc-1"]  # Example IDs to fetch
response = index.fetch(ids)
print(response)

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'doc-0': {'id': 'doc-0',
                       'metadata': {'chunk_index': 0.0,
                                    'source': 'D:\\Python_01\\Python_01\\Langchain01\\Introduction_to_Data_and_Data_Science.pdf',
                                    'text': 'Analysis vs Analytics \n'
                                            'Alright! So… \n'
                                            'Let’s discuss the not-so-obvious '
                                            'differences \n'
                                            'between the terms analysis and '
                                            'analytics. \n'
                                            'Due to the similarity of the '
                                            'words, some people \n'
                                            'believe they share the same '
                                            'meaning, and thus \n'
                               

In [12]:
# Similarity search in Pinecone DB

query_text = "programming languages used in data science"  # Your search query
query_embedding = embedding.embed_documents([query_text])[0]

# Query Pinecone for top-k similar vectors
top_k = 3
search_response = index.query(
    vector=query_embedding,
    top_k=top_k,
    include_metadata=True
)

# Print results
for match in search_response['matches']:
    print(f"ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Metadata: {match.get('metadata')}")
    print("---")

ID: chunk-4
Score: 0.783504307
Metadata: {'text': 'can have their unique meanings too. \nMore of this will be explained in the next \nvideo which aims to simplify these, as well as many more with a fantastic diagram. So, \nlet’s move on! \nProgramming Languages & Software Employed in Data Science - All the Tools You \nNeed \nAlright! So… \nHow are the techniques used in data, business \nintelligence, or predictive analytics applied \nin real life? \nCertainly, with the help of computers. \nYou can basically split the relevant tools \ninto two categories—programming languages \nand software. \nKnowing a programming language enables you \nto devise programs that can execute specific \noperations. Moreover, you can reuse these \nprograms whenever you need to execute the \nsame action. \nAs you can see from the infographic, R, and \nPython are the two most popular tools across \nall columns. Their biggest advantage is that \nthey can manipulate data and are integrated \nwithin multiple dat