In [4]:
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone, ServerlessSpec
import json
import requests

In [5]:
# NOTE: load_dotenv() reads `.env`, not the `.env.local` file used by the JS app,
# so PINECONE_API_KEY has to be defined in `.env`.

# Initialize the Pinecone client
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only when it does not exist yet. The guard lets this cell be
# re-run (e.g. Restart Kernel -> Run All) without failing because the index is
# already there, and without commenting the call in and out by hand.
INDEX_NAME = "rag"  # must match the name passed to pc.Index(...) when upserting
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # must equal the length of the embedding vectors upserted
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [6]:
# Load the reviews. The context manager closes the file handle, and the explicit
# encoding keeps non-ASCII review text from depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)
processed_data = []

# Embedding configuration, following
# https://huggingface.co/blog/getting-started-with-embeddings
# Note: query() applies no retry logic; it sends "wait_for_model" so the API
# waits for the model to load instead of answering with an error.
model_id = "sentence-transformers/all-mpnet-base-v2"
hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}
def query(texts):
    """Embed every review in ``texts`` and collect Pinecone-ready records.

    Each review's text is posted to the Hugging Face feature-extraction
    endpoint (module-level ``api_url`` / ``headers``), and one record per
    review is appended to the module-level ``processed_data`` list.

    Args:
        texts: Mapping with a "reviews" key holding an iterable of dicts, each
            providing "professor", "review", "subject" and "stars".

    Returns:
        The module-level ``processed_data`` list (the same object, extended in
        place), so existing code that reads ``processed_data`` keeps working.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
        ValueError: If the response body is not a list (e.g. an error payload).
    """
    for review in texts["reviews"]:
        response = requests.post(
            api_url,
            headers=headers,
            json={"inputs": review["review"], "options": {"wait_for_model": True}},
            # Generous limit because "wait_for_model" may block while the model
            # loads; without any timeout a stalled connection hangs the cell.
            timeout=120,
        )
        # Fail here, with the HTTP status, rather than storing an error payload
        # as "values" and failing later inside the Pinecone upsert.
        response.raise_for_status()

        embedding = response.json()
        if not isinstance(embedding, list):
            raise ValueError(
                f"Unexpected embedding response for {review['professor']!r}: {embedding!r}"
            )

        processed_data.append(
            {
                "values": embedding,
                # NOTE(review): the professor name doubles as the vector id, so
                # two reviews for the same professor would presumably overwrite
                # each other on upsert -- confirm names are unique in the data.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )
    return processed_data

# Embed all reviews; query() hands back the accumulated list of records
output = query(data)

# Write those records to namespace "ns1" of the "rag" index
index = pc.Index("rag")
upsert_response = index.upsert(vectors=output, namespace="ns1")
print(f"Upserted count: {upsert_response['upserted_count']}")

# Show the index statistics after the upsert
print(index.describe_index_stats())

[{'values': [-0.005506929010152817, -0.004807983059436083, -0.02603781782090664, 0.012516419403254986, 0.02730002999305725, -0.00029642379377037287, 0.014805442653596401, -0.02301289141178131, -0.04931323230266571, 0.012910934165120125, 0.027539579197764397, 0.011228007264435291, -0.030681122094392776, 3.715023922268301e-05, 0.030315924435853958, -0.07541798800230026, 0.0009684403194114566, 0.029103226959705353, -0.05184657871723175, -0.037108831107616425, 0.031861238181591034, 0.0028811204247176647, -0.06827324628829956, -0.02042446658015251, 0.0010150085436180234, -0.01616564206779003, -0.031405240297317505, 0.04318040981888771, 0.01742667146027088, 0.015946922823786736, 0.027018671855330467, -0.007115136366337538, -0.0676179826259613, 0.06987151503562927, 1.7835059225035366e-06, -0.0251478124409914, -0.057139039039611816, -0.022497154772281647, -0.012911091558635235, 0.042085181921720505, 0.04111224785447121, 0.030888019129633904, -0.011650756001472473, -0.026809334754943848, 0.0041