In the last notebook we created the embeddings and saved them in `squad_embedded_data.pkl`.

In this notebook we will upsert that data into Pinecone and query it from there.

In [2]:
import pickle

# Load the records (id, context, metadata, vector) saved by the previous notebook.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so only
# ever point this at a pickle file you produced yourself.
with open('./squad_embedded_data.pkl', 'rb') as fp:
    data = pickle.load(fp)
 

In [3]:
# Inspect which fields the first loaded record carries.
data[0].keys()

dict_keys(['id', 'context', 'metadata', 'vector'])

In [14]:
import os

from pinecone import Pinecone, ServerlessSpec

# The API key comes from the environment so it never appears in the notebook.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [15]:

# Show the indexes this client can currently see.
pc.list_indexes()


[]

In [16]:
index_name = "squad-data"

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
    # Serverless deployment target for the new index.
    serverless_spec = ServerlessSpec(cloud='aws', region="us-east-1")
    # dimension=768 matches the query embedding size (see xq.shape further down).
    # With the euclidean metric the returned scores are distances, so the
    # query results below are listed from smallest to largest score.
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="euclidean",
        spec=serverless_spec,
    )


In [18]:
# Handle to the index used for upserts and queries. Reuse index_name rather
# than repeating the literal so the index name is defined in exactly one place.
index = pc.Index(index_name)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Reminder of the record fields before building the upsert payload.
data[0].keys()

dict_keys(['id', 'context', 'metadata', 'vector'])

In [27]:
from tqdm.auto import tqdm  # progress bar

# Upsert the records in batches of 100.
batch_size = 100

for start in tqdm(range(0, len(data), batch_size)):
    # Slicing clamps at the end of the sequence, so the last batch may be shorter.
    chunk = data[start:start + batch_size]

    # Tuple form passed to upsert: (id, vector, metadata); ids are sent as strings.
    records = [(str(rec['id']), rec['vector'], rec['metadata']) for rec in chunk]

    index.upsert(vectors=records)

  0%|          | 0/182 [00:00<?, ?it/s]

100%|██████████| 182/182 [03:22<00:00,  1.11s/it]


In [42]:
from sentence_transformers import SentenceTransformer

# NOTE(review): presumably the same model that produced the stored vectors in
# the previous notebook — confirm, otherwise query and index embeddings are
# not comparable.
embedder = SentenceTransformer('stsb-xlm-r-multilingual')

# Embed the free-text query so it can be compared against the stored vectors.
query_text = "Early engineering courses provided by American Universities"
xq = embedder.encode(query_text)
xq

array([-1.89433441e-01,  1.88777819e-01,  2.68159598e-01, -3.77027661e-01,
        4.22352582e-01, -5.42530119e-01,  1.63300648e-01,  5.18856645e-02,
        1.77632079e-01, -5.54475605e-01,  1.95561886e-01,  7.67359197e-01,
        2.71132231e-01,  7.38279521e-01, -5.12811601e-01, -9.92903888e-01,
       -6.98723495e-01, -3.61611694e-02, -3.00773978e-01,  7.98025727e-02,
        2.40256503e-01, -3.44386935e-01,  7.12338626e-01, -2.74166048e-01,
        4.31985594e-02, -5.58937490e-01,  1.43299356e-01, -1.85564712e-01,
       -9.95036364e-01,  1.82158291e-01, -1.09506495e-01, -7.54052550e-02,
       -6.79556653e-03, -5.70445895e-01, -8.26943159e-01,  5.03625631e-01,
       -1.72525287e-01,  3.22688729e-01,  1.52988121e-01,  1.70277536e-01,
       -1.32232869e+00, -7.42632806e-01, -9.66380257e-03,  3.91815305e-01,
       -1.46606505e+00, -7.36996830e-01, -5.23200393e-01,  3.12918156e-01,
       -1.72679856e-01, -1.03489034e-01, -3.49461555e-01,  4.68048781e-01,
        4.86889929e-02, -

In [44]:
# Check the dimensionality of the query embedding.
xq.shape

(768,)

In [47]:
# Unfiltered search: ask for the 5 best-scoring stored vectors for the query.
results = index.query(
    vector=xq.tolist(),  # send a plain Python list rather than the NumPy array
    top_k=5,
)

In [49]:
# Display the raw response as the cell's value, matching the later query cells
# that end with a bare `results` instead of wrapping it in print().
results

QueryResponse(matches=[{'id': '572745c6708984140094db99', 'score': 177.582336, 'values': []}, {'id': '572745c6708984140094db9d', 'score': 177.582336, 'values': []}, {'id': '572745c6708984140094db9b', 'score': 177.582336, 'values': []}, {'id': '57286ec63acd2414000df9d2', 'score': 196.281036, 'values': []}, {'id': '57286ec63acd2414000df9d1', 'score': 196.281036, 'values': []}], namespace='', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 18 Jan 2026 09:23:11 GMT', 'content-type': 'application/json', 'content-length': '390', 'connection': 'keep-alive', 'x-pinecone-max-indexed-lsn': '182', 'x-pinecone-request-latency-ms': '313', 'x-pinecone-request-id': '6319553510469251287', 'x-envoy-upstream-service-time': '97', 'x-pinecone-response-duration-ms': '315', 'grpc-status': '0', 'server': 'envoy'}})


In [55]:
# IDs of the returned matches, in the order the query returned them.
ids = [hit['id'] for hit in results['matches']]
ids


['572745c6708984140094db99',
 '572745c6708984140094db9d',
 '572745c6708984140094db9b',
 '57286ec63acd2414000df9d2',
 '57286ec63acd2414000df9d1']

In [56]:
# Local lookup table from record id to its text and metadata; the query
# responses above carry only id and score, so matches are resolved here.
# Keys go through str() to mirror the upsert step, which sent str(item['id'])
# to the index; without it a non-string id would never match a returned id.
get_sample = {
    str(x['id']): {
        'context': x['context'],
        'metadata': x['metadata'],
    }
    for x in data
}

In [57]:
# Print the stored record behind every matched id.
# The loop variable is deliberately not called `id`, which would shadow the
# built-in id() for the rest of the notebook session.
for match_id in ids:
    print(get_sample[match_id])

{'context': "Nel moderno mondo industrializzato, la costruzione implica solitamente la traduzione dei progetti in realtà. Un team formale di progettazione può essere costituito per pianificare il procedimento fisico e integrarlo con le altre parti. La progettazione è generalmente costituita da disegni e specifiche, solitamente preparati da un team di progettazione che comprende architetti, ingegneri civili, ingegneri meccanici, ingegneri meccanici, ingegneri elettrici, ingegneri strutturali, ingegneri di protezione antincendio, consulenti di progettazione, consulenti architettonici e consulenti archeologici. Il team di progettazione è impiegato più comunemente da (i. e. in contratto con) il proprietario. Nell' ambito di questo sistema, una volta che il progetto è stato ultimato dal team di progettazione, alcune imprese di costruzione o società di gestione delle costruzioni possono essere invitate a presentare un' offerta per il lavoro, sia direttamente sulla base del progetto, sia sull

In [63]:
# Same search, restricted to records whose 'lang' metadata equals 'en'.
results = index.query(
    vector=xq.tolist(),
    top_k=3,
    filter={'lang': {'$eq': 'en'}},
)
results

QueryResponse(matches=[{'id': '572745c6708984140094db9c', 'score': 206.579712, 'values': []}, {'id': '572745c6708984140094db9a', 'score': 206.579712, 'values': []}, {'id': '57286ec63acd2414000df9d3', 'score': 208.837463, 'values': []}], namespace='', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 18 Jan 2026 09:33:37 GMT', 'content-type': 'application/json', 'content-length': '260', 'connection': 'keep-alive', 'x-pinecone-max-indexed-lsn': '182', 'x-pinecone-request-latency-ms': '8', 'x-pinecone-request-id': '1595824334201790145', 'x-envoy-upstream-service-time': '7', 'x-pinecone-response-duration-ms': '9', 'grpc-status': '0', 'server': 'envoy'}})

In [64]:
# Print the stored record for each match of the language-filtered query.
# The id is read inline rather than assigned to a variable named `id`, which
# would shadow the built-in id().
for match in results['matches']:
    print(get_sample[match['id']])

{'context': 'In the modern industrialized world, construction usually involves the translation of designs into reality. A formal design team may be assembled to plan the physical proceedings, and to integrate those proceedings with the other parts. The design usually consists of drawings and specifications, usually prepared by a design team including Architect, civil engineers, mechanical engineers, electrical engineers, structural engineers, fire protection engineers, planning consultants, architectural consultants, and archaeological consultants. The design team is most commonly employed by (i.e. in contract with) the property owner. Under this system, once the design is completed by the design team, a number of construction companies or construction management companies may then be asked to make a bid for the work, either based directly on the design, or on the basis of drawings and a bill of quantities provided by a quantity surveyor. Following evaluation of bids, the owner typical

In [65]:
# Two metadata conditions in one filter: 'lang' must equal 'en' and 'title'
# must not be one of the excluded article titles (Pinecone combines the
# top-level keys of a filter with AND).
excluded_titles = ['University_of_Chicago', 'University_of_Notre_Dame']
conditions = {
    'lang': {'$eq': 'en'},
    'title': {'$nin': excluded_titles},
}

results = index.query(vector=xq.tolist(), top_k=5, filter=conditions)
results

QueryResponse(matches=[{'id': '572745c6708984140094db9c', 'score': 206.579712, 'values': []}, {'id': '572745c6708984140094db9a', 'score': 206.579712, 'values': []}, {'id': '572926d23f37b31900478084', 'score': 218.823425, 'values': []}, {'id': '572926d23f37b31900478085', 'score': 218.823425, 'values': []}, {'id': '572926d23f37b31900478086', 'score': 218.823425, 'values': []}], namespace='', usage={'read_units': 1}, _response_info={'raw_headers': {'date': 'Sun, 18 Jan 2026 09:35:14 GMT', 'content-type': 'application/json', 'content-length': '390', 'connection': 'keep-alive', 'x-pinecone-max-indexed-lsn': '182', 'x-pinecone-request-latency-ms': '248', 'x-pinecone-request-id': '125426394070456890', 'x-envoy-upstream-service-time': '38', 'x-pinecone-response-duration-ms': '250', 'grpc-status': '0', 'server': 'envoy'}})

In [66]:
# Print the stored record for each match of the combined-filter query.
# The id is read inline rather than assigned to a variable named `id`, which
# would shadow the built-in id().
for match in results['matches']:
    print(get_sample[match['id']])

{'context': 'In the modern industrialized world, construction usually involves the translation of designs into reality. A formal design team may be assembled to plan the physical proceedings, and to integrate those proceedings with the other parts. The design usually consists of drawings and specifications, usually prepared by a design team including Architect, civil engineers, mechanical engineers, electrical engineers, structural engineers, fire protection engineers, planning consultants, architectural consultants, and archaeological consultants. The design team is most commonly employed by (i.e. in contract with) the property owner. Under this system, once the design is completed by the design team, a number of construction companies or construction management companies may then be asked to make a bid for the work, either based directly on the design, or on the basis of drawings and a bill of quantities provided by a quantity surveyor. Following evaluation of bids, the owner typical