# Pinecone

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [1]:
pip install -q pinecone

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade -q pinecone

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip show pinecone

Name: pinecone
Version: 6.0.1
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /Users/pandagan/workspace/projects/ztm_llm_langchain/corso_langchain/lib/python3.13/site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
from pinecone import Pinecone
# Client is created without explicit arguments; presumably the API key is read
# from the environment populated by load_dotenv above — TODO confirm.
pc = Pinecone()

# List the indexes that currently exist in the project.
pc.list_indexes()

[]

## Working With Pinecone Indexes

## Creare un indice

In [7]:
from pinecone import ServerlessSpec

# Create the 'langchain' serverless index only if it does not exist yet,
# so the cell can be re-run safely.
index_name = 'langchain'
if index_name not in pc.list_indexes().names():
    print(f'Creating index: {index_name}')
    pc.create_index(
        name=index_name,
        dimension=1536,  # must match the dimension of the vectors stored in the index
        metric="cosine",  # similarity metric used by queries
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print('Index Created! 🤓')
else:
    print(f'Index {index_name} already exists!')

Craeating index: langchain
Index Created! 🤓


## Cancellare un indice

In [33]:
# Delete the 'langchain' index if it exists.
# NOTE(review): the following sections operate on this index, so running this cell
# during a top-to-bottom run removes the index they need; run it only for cleanup.
index_name = 'langchain'
if index_name in pc.list_indexes().names():
    print(f'Deleting index {index_name} ...')
    pc.delete_index(index_name)
    print('Done')
else:
    print(f'Index {index_name} does not exist!')

Deleting index langchain ...
Done


## Operazioni con un indice
Prima di fare operazioni con un indice dobbiamo selezionarlo

In [8]:
# Get a handle to the index named by index_name, then show its statistics
# (dimension, metric, per-namespace and total vector counts).
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

## Working with Vectors
### Inserting Vectors (random vectors)

In [9]:
import random

# Build 5 random vectors, each with 1536 components drawn from [0, 1).
vectors = []
for _ in range(5):
    vectors.append([random.random() for _ in range(1536)])
print(vectors)


[[0.6593800154305917, 0.9647029359135605, 0.025482687628644585, 0.3090109748766994, 0.032615225566080985, 0.9847807557984948, 0.23785587692867494, 0.48271844290103194, 0.8052471437166984, 0.788536844533886, 0.8121580670524675, 0.10986607370858814, 0.263335854822218, 0.9569382479759477, 0.5547805434120243, 0.26102985121286637, 0.05040559808825407, 0.80629596150934, 0.5769060419853533, 0.8952749510387098, 0.5012584533106688, 0.7454171297094879, 0.03898776864904485, 0.4058256151568438, 0.3917992605033904, 0.9420280650186718, 0.4195057579644811, 0.23277331347250108, 0.14904215819146727, 0.31332937040829467, 0.9059535958099154, 0.5130840766758481, 0.6825462218225432, 0.41879284223268654, 0.03508721408832027, 0.18100062285335872, 0.9812917325451161, 0.678892485759447, 0.9426833378666132, 0.03855581291683241, 0.6728894474195843, 0.8668783000142922, 0.9970860093792369, 0.6628393736157154, 0.2485482949946658, 0.656949121035137, 0.6532499541920934, 0.6902234082515794, 0.3170734317055254, 0.27643

In [10]:
# Each vector needs an id to be stored in the index; for now the ids are written by hand.
ids = ['a', 'b', 'c', 'd', 'e']

In [11]:
# Insert the vectors into the index.
# Select the index first.
index_name = 'langchain'
index = pc.Index(index_name)

# `upsert` inserts new records or updates existing ones.
# zip pairs each id with its vector, producing (id, values) tuples.
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

### Updating Vectors

Updating the vector with id `c`

In [13]:
# Upserting with the existing id 'c' updates that record: all 1536 components become 0.5.
index.upsert(vectors=[('c',[0.5]* 1536)])

{'upserted_count': 1}

### Fetching vectors by id

In [14]:

# Fetch the stored records for ids 'c' and 'd', using the `index` handle selected earlier.
index.fetch(ids=['c','d'])

FetchResponse(namespace='', vectors={'c': Vector(id='c', values=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 

### Deleting vectors by id

In [20]:
# Delete the records with ids 'b' and 'c'; no namespace is given, so the default one is used.
index.delete(ids=['b','c'])

{}

In [15]:
# Show the index statistics, including per-namespace and total vector counts.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5,
 'vector_type': 'dense'}

In [16]:
# Fetch an id from the default namespace; when the id is not stored there,
# the response carries an empty `vectors` mapping.
index.fetch(ids=['x'])

FetchResponse(namespace='', vectors={}, usage={'read_units': 1})

## Create a Manual Random Query

In [17]:
# Random 1536-component vector used as a stand-in for a real query embedding.
query_vector = []
for _ in range(1536):
    query_vector.append(random.random())

### Recupera i 3 vettori con il punteggio di similarità più alto

In [18]:
# Ask for the 3 best matches to query_vector; only ids and scores are returned
# because include_values is False.
search_params = {
    'vector': query_vector,
    'top_k': 3,
    'include_values': False,
}
index.query(**search_params)

{'matches': [{'id': 'c', 'score': 0.873009086, 'values': []},
             {'id': 'a', 'score': 0.763750136, 'values': []},
             {'id': 'b', 'score': 0.760002792, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

## Namespace
Pinecone ti consente di partizionare i vettori in un indice in **Namespaces**.

Le **query** e le altre operazioni sono **limitate a un namespace specifico**, consentendo a richieste diverse di cercare in sottoinsiemi diversi del tuo indice.

Informazioni chiave su Namespace:

- Ogni indice è costituito da uno o più Namespace.
- Ogni vettore esiste esattamente in un Namespace.
- I Namespace sono identificati in modo univoco da un nome di Namespace.
- Il Namespace predefinito è rappresentato dalla stringa vuota e viene utilizzato se non viene specificato alcun Namespace specifico.

In [19]:
# Statistics before adding namespaces: the '' key is the default namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5,
 'vector_type': 'dense'}

### In questo caso i vettori vengono inseriti nel namespace di default

In [36]:
import random

# Re-select the 'langchain' index and upsert 5 random vectors (ids a-e).
# No namespace is passed, so the records go to the default namespace.
index = pc.Index('langchain')
ids = list('abcde')
vectors = [[random.random() for _ in range(1536)] for _ in range(5)]
id_vector_pairs = zip(ids, vectors)
index.upsert(vectors=id_vector_pairs)



{'upserted_count': 5}

### Aggiungo 3 nuovi vettori a un namespace specifico

In [23]:
# Three random vectors with ids x, y, z, written to the 'first-namespace' namespace.
ids = list('xyz')
vectors = []
for _ in range(3):
    vectors.append([random.random() for _ in range(1536)])
index.upsert(vectors=zip(ids, vectors), namespace='first-namespace')


{'upserted_count': 3}

In [22]:
# Two random vectors with ids q and p, written to the 'second-namespace' namespace.
ids = list('qp')
vectors = []
for _ in range(2):
    vectors.append([random.random() for _ in range(1536)])
index.upsert(vectors=zip(ids, vectors), namespace='second-namespace')

{'upserted_count': 2}

Tutte le operazioni, esclusa `describe_index_stats()`, si applicano a un namespace specifico.

In [24]:
# Statistics cover the whole index: one entry per namespace plus the total vector count.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10,
 'vector_type': 'dense'}

In [25]:
# Fetch id 'x' from 'first-namespace'; the namespace must be given explicitly.
index.fetch(ids=['x'], namespace='first-namespace')

FetchResponse(namespace='first-namespace', vectors={'x': Vector(id='x', values=[0.398842156, 0.486623377, 0.802444816, 0.3589935, 0.625276804, 0.3584086, 0.170400411, 0.859950304, 0.390006065, 0.962989867, 0.773729682, 0.156302661, 0.819633901, 0.805999517, 0.532503, 0.404980123, 0.997192085, 0.962293565, 0.944066107, 0.425263137, 0.205578342, 0.617420852, 0.126189455, 0.174970105, 0.92506808, 0.945442438, 0.640235722, 0.515816689, 0.70458, 0.488399714, 0.404396653, 0.252798378, 0.958925903, 0.74387145, 0.213324189, 0.883281529, 0.928839505, 0.346526712, 0.891748, 0.702137887, 0.457667321, 0.535245, 0.628094792, 0.261259079, 0.156954601, 0.100308679, 0.302984595, 0.834389508, 0.0619328059, 0.771077633, 0.224054575, 0.686995, 0.590371072, 0.413287699, 0.857015491, 0.146177545, 0.393149346, 0.254387915, 0.844179094, 0.248300567, 0.152530611, 0.70205, 0.414478689, 0.0620189048, 0.181912035, 0.770198524, 0.377956092, 0.540219724, 0.147150457, 0.248915657, 0.902676702, 0.343211234, 0.051596

In [26]:
# Delete the record with id 'x' from 'first-namespace' only.
index.delete(ids=['x'], namespace='first-namespace')

{}

In [27]:
# Fetch 'x' again to verify the deletion: the `vectors` mapping is now empty.
index.fetch(ids=['x'], namespace='first-namespace')

FetchResponse(namespace='first-namespace', vectors={}, usage={'read_units': 1})

### Cancellare tutti i record di un namespace: 🚨 questo cancella anche il namespace

In [28]:
# Delete every record in 'first-namespace'; the namespace itself no longer
# appears in the index statistics afterwards.
index.delete(delete_all=True, namespace='first-namespace')

{}

In [29]:
# Confirm which namespaces remain and how many vectors each holds.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 7,
 'vector_type': 'dense'}

# Splitting and Embedding Text Using LangChain

In [31]:
import os
from dotenv import load_dotenv, find_dotenv
# Same environment bootstrap as the first cell of the notebook, repeated so this
# part can be run on its own; override=True replaces already-set variables.
load_dotenv(find_dotenv(), override=True)

True

Caricheremo il testo da una repo, ma potremmo caricarlo da qualsiasi tipo di file o sorgente usando i **Transform Loaders**: esistono loader sia per servizi pubblici che privati.

Uno dei **TextSplitter** più usati è `RecursiveCharacterTextSplitter`, di default i caratteri su cui prova a fare lo split sono:
* Double `\n`
* `\n`
* `white space`

In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default encoding.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,  # a larger value is the usual choice; kept small here to get small chunks
    chunk_overlap=20,  # max overlap for continuity: up to the last 20 characters of a chunk open the next one
    length_function=len  # chunk length is measured in characters
)

In [43]:
# Split the speech into chunks; each chunk exposes its text as `page_content`.
chunks = text_splitter.create_documents([churchill_speech])

In [45]:
# Printing a chunk object shows its fields, e.g. page_content='...'.
print(chunks[10])

page_content='penetration were realized and when a new French Generalissimo, General Weygand, assumed'


In [47]:
# Print only the text of a chunk.
print(chunks[3].page_content)

second week of May, only a rapid retreat to Amiens and the south could have saved the British and


In [49]:
# Report how many chunks the splitter produced.
print(f'Now i have {len(chunks)}')

Now i have 300


#### Embedding Cost
Useremo il modello **OpenAI `text-embedding-ada-002`**, che ha un costo: conviene quindi stimare prima il costo, e per farlo useremo **tiktoken**.

In [51]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for a set of documents.

    Args:
        texts: iterable of objects exposing their text as ``page_content``.
        model: model name used to pick the tiktoken encoding.
        price_per_1k_tokens: price in USD per 1000 tokens. The default keeps the
            value originally hard-coded here; verify it against current pricing.

    Returns:
        None. The two summary lines are printed to stdout.
    """
    # Imported locally so the dependency is only needed when the estimate is requested.
    import tiktoken
    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list of per-document counts is built.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Estimate the cost for all chunks before calling the paid embedding API.
print_embedding_cost(chunks)

Total Tokens: 4820
Embdedding Cost in USD: 0.001928


In [73]:
from langchain_openai import OpenAIEmbeddings

# Embedding client with default settings; presumably the OpenAI API key is read
# from the environment loaded via dotenv — TODO confirm.
embeddings = OpenAIEmbeddings()

Esempio: facciamo l'embedding della stringa 'abc'.

In [74]:
# Embed a short test string. The client object in this notebook is named
# `embeddings` (plural); `embedding` is never defined and raises NameError on a fresh kernel.
vector_test = embeddings.embed_query('abc')
vector_test

[0.002558193900240056,
 -0.01136503289795203,
 -0.009624187024430351,
 -0.03909118150226145,
 -0.03444892956482732,
 0.012016080471193284,
 -0.021371355549397755,
 -0.02285744464868338,
 0.01866809115969756,
 -0.0003956266204819845,
 0.00334015888642744,
 0.01920591505214932,
 -0.002738647197400543,
 -0.004521952533294896,
 -0.01869639777838687,
 0.0029969434631259926,
 0.027032642034356784,
 0.010600758849953526,
 0.010820133747811817,
 0.007487051283226059,
 -0.014521199461390879,
 0.017833052427620944,
 -0.0070058420251368,
 -0.015695916686416653,
 -0.020239098252405947,
 -0.003619685116169874,
 0.009978016964078998,
 -0.020026801406203863,
 0.026268368917680866,
 -0.007345519121102083,
 0.007812575535507979,
 0.013799385341426343,
 -0.007494127937898387,
 -0.00994263415637865,
 -0.01043799718947386,
 -0.014280595065176895,
 -0.013700313107336336,
 -0.015964826769997362,
 0.01039553726143989,
 -0.00026537280398245455,
 0.0243576842633459,
 0.004546720591817398,
 0.014061220167318604

In [67]:
# Embed the text of the first chunk. The client object in this notebook is named
# `embeddings` (plural); `embedding` is never defined and raises NameError on a fresh kernel.
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[-0.04456397749355405, -0.0378589884465157, -0.0029398076103669276, -0.00800519996152611, 0.015831939911896683, 0.022600668668586867, -0.028528085411930963, -0.00966232717303508, 0.0010850997553752614, 0.0073168548694422285, 0.007782124947581689, 0.032811120091729054, 0.007374216882839013, -0.011714615438678825, 0.0063257652486226985, -0.005446212669114145, 0.013167788617816457, -0.002444262799957162, 0.013550202661343333, -0.010956161322590259, -0.00813267099892758, -0.026845463247883727, 0.029649833521295788, -0.003843260718349985, -0.014455249727728919, -0.01849609027465444, 0.010917920290766558, -0.01858532028023221, 0.003013103619854204, -0.014289536727181282, 0.0070682853686207725, -0.008546953034635438, -0.016533032014588465, 0.00516258912195158, -0.018330378205429272, -0.023837138942100326, -0.02233297865185356, -0.008789149030152944, 0.022677150732234266, -0.012670650547496012, 0.0136139386457053, 0.00460490205275176, 0.00878277505918776, 0.0029844226131558117, -0.027916223314

## Inserting the Embeddings into a Pinecone Index

In [76]:
pip install langchain-pinecone


Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.4-py3-none-any.whl.metadata (1.3 kB)
Collecting aiohttp<3.11,>=3.10 (from langchain-pinecone)
  Downloading aiohttp-3.10.11-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting langchain-tests<1.0.0,>=0.3.7 (from langchain-pinecone)
  Downloading langchain_tests-0.3.17-py3-none-any.whl.metadata (3.1 kB)
Collecting pytest<9,>=7 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest-8.3.5-py3-none-any.whl.metadata (7.6 kB)
Collecting pytest-asyncio<1,>=0.20 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest_asyncio-0.26.0-py3-none-any.whl.metadata (4.0 kB)
Collecting syrupy<5,>=4 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading syrupy-4.9.1-py3-none-any.whl.metadata (38 kB)
Collecting pytest-socket<1,>=0.6.0 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest_socket-0.7.0-py3-none-any.whl.metadata (6.7 kB)
Collectin

In [77]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
import pinecone


# Re-create the Pinecone client for this part of the notebook; no explicit
# arguments are passed, so credentials presumably come from the environment — TODO confirm.
pc = pinecone.Pinecone()



In [60]:
# On the free plan the existing indexes have to be deleted before creating a new one.
existing_names = pc.list_indexes().names()
for existing_name in existing_names:
    print('Deleting all indexes...', end="")
    pc.delete_index(existing_name)
    print('Done')

Deleting all indexes...Done


In [84]:
import time

index_name = 'churchill-speech'

# Get the list of existing indexes.
existing_indexes = pc.list_indexes()
print("Indici esistenti:", existing_indexes)

# Create the index only if its name is not already in the list.
if index_name not in existing_indexes.names():
    print(f'Creating Index {index_name}...')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    # Wait until the index reports ready instead of sleeping a fixed 60 seconds:
    # the loop exits as soon as the index is usable (immediately if it already is).
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
    print('Done')
else:
    print(f'Index {index_name} already exists, skipping creation.')

Indici esistenti: [{
    "name": "churchill-speech",
    "metric": "cosine",
    "host": "churchill-speech-i8se1nd.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}]
Index churchill-speech already exists, skipping creation.


In [85]:
# Get a handle to the existing index.
index = pc.Index(index_name)

In [86]:
# Printing the handle only shows the object's repr, not the index contents.
print(index)

<pinecone.data.index.Index object at 0x11b7eb680>


In [89]:
# Embed every chunk with `embeddings` and store the results in the Pinecone index
# identified by name.
vector_store = PineconeVectorStore.from_documents(
    documents=chunks, 
    embedding=embeddings, 
    index_name=index_name  # pass the index name here, not the Index object
)

In [71]:
# Record the installed Pinecone SDK version for reproducibility.
print(pinecone.__version__)

6.0.1


In [90]:
# Get the Index object first.
index = pc.Index("churchill-speech")

# Then build the vector store on top of the already-populated index
# (nothing is embedded or uploaded here).
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="text"
)

## Asking Questions (Similarity Search)

In [94]:
# Retrieve the stored chunks most similar to the question.
query = 'Where should we fight?'
matching_docs = vector_store.similarity_search(query)

# Raw result: a list of Document objects (id, metadata, page_content).
print(matching_docs)

[Document(id='0fe5fc01-9efe-46d9-8440-abc6707827be', metadata={}, page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(id='2eba45a8-cb6c-4f2d-ac07-6c60f5f5d427', metadata={}, page_content='front, now on that, fighting'), Document(id='2468745f-7611-4e01-886d-268b3dbdf93d', metadata={}, page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(id='ff0ed3c1-d511-4d54-92ff-a588ac5ec193', metadata={}, page_content='When we consider how much greater would be our advantage in defending the air above this Island')]


In [96]:
# Print the text of every match, each followed by a 50-dash separator line.
separator = '-' * 50
for doc in matching_docs:
    print(doc.page_content)
    print(separator)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------


###  Integro con un Retriever di LangChain


In [99]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer from the retrieved chunks.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retriever over the vector store: similarity search returning the 3 closest chunks.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

# Question-answering chain combining the retriever and the chat model.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [101]:
# Run the chain on the question; the result is a dict with 'query' and 'result' keys.
query = 'Where should we fight?'
answer = chain.invoke(query)
print(answer)

{'query': 'Where should we fight?', 'result': 'Based on the provided context, it seems that the question is related to a speech by Winston Churchill during World War II. In the speech, he mentions fighting on beaches, landing grounds, fields, fronts, in France, on seas, and oceans. So, the answer would be that we should fight in various locations both on land and at sea.'}


Se poniamo la stessa domanda due volte, la risposta potrà essere diversa, perché il modello è configurato con `temperature=1`.