In [None]:
import os

import pandas as pd
import pinecone
from dotenv import load_dotenv
from pinecone import Pinecone
from pinecone import ServerlessSpec

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can see them. (The helper is `load_dotenv`; the
# previous spelling `load_toenv` does not exist in python-dotenv and made
# this cell fail on import.)
load_dotenv()

# Credentials come from the environment, never from the notebook itself.
api_key = os.getenv('PINECONE_API_KEY')

# configure client
pc = Pinecone(api_key=api_key)

# Deployment target for a serverless index; os.getenv returns None for any
# variable that is not set.
cloud = os.getenv('PINECONE_CLOUD')
region = os.getenv('PINECONE_REGION')

spec = ServerlessSpec(cloud=cloud, region=region)

# Cleaned data from the data-preparation step.
# NOTE(review): `df` is not referenced by any later cell visible here - confirm
# whether it is meant to be the source of the upserted vectors.
df = pd.read_csv("../Artifacts/data_preparation/_cleaned_data.csv")



In [None]:
import time

index_name = 'semantic-search-product'

# Names of the indexes that already exist for this client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is absent (expected on a first run).
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=384,  # dimensionality of minilm
        metric='dotproduct',
        spec=spec,
    )
    # Poll once per second until the index reports itself as ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause for a second, then show its stats as the
# cell output.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

In [None]:
from tqdm.auto import tqdm

# NOTE(review): `dataset` is not defined in any cell visible in this notebook;
# on a fresh kernel this cell would raise NameError unless it is created
# elsewhere - confirm where it comes from.
# NOTE(review): total=160 only sizes the progress bar; presumably the expected
# number of 500-document batches - verify against the actual data size.
document_batches = dataset.iter_documents(batch_size=500)

# Upsert the documents into the index one batch at a time.
for batch in tqdm(document_batches, total=160):
    index.upsert(batch)

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the MiniLM sentence-embedding model onto the selected device.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)
# Display the loaded model as the cell output.
model

In [None]:
query = "which city has the highest population in the world?"

# Embed the query text, then convert the embedding to a plain Python list
# for the index query.
query_embedding = model.encode(query)
xq = query_embedding.tolist()

# Ask the index for the 5 closest matches, including their stored metadata,
# and show the raw response as the cell output.
xc = index.query(
    vector=xq,
    top_k=5,
    include_metadata=True,
)
xc

In [None]:
# Print one "score: text" line per match, with the score rounded to 2 decimals.
# NOTE(review): assumes every match carries a 'text' entry in its metadata -
# confirm against what was actually upserted.
for match in xc['matches']:
    score = round(match['score'], 2)
    text = match['metadata']['text']
    print(f"{score}: {text}")