In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import json
import requests

  from tqdm.autonotebook import tqdm


In [3]:
# Note: load_dotenv() was called without a path, so the keys are presumably
# read from a file named ".env"; a ".env.local" file may need to be renamed
# to ".env" for PINECONE_API_KEY to be found.
# Initialize the Pinecone client with the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# One-time setup: create the "rag" index (768 dimensions, cosine metric,
# serverless on AWS us-east-1). Kept commented out, presumably because the
# index already exists; uncomment for a first run against a fresh project.
# pc.create_index(
#     name="rag",
#     dimension=768,
#     metric="cosine",
#     spec=ServerlessSpec(cloud="aws", region="us-east-1"),
# )

In [6]:
# Load the raw reviews. The context manager closes the file handle as soon as
# the JSON has been parsed instead of leaving it open for the kernel's lifetime.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)

# Accumulator for the records that will be upserted; query() appends to it.
processed_data = []

# OpenAI-compatible client pointed at the OpenRouter endpoint.
# NOTE(review): not used by the embedding/upsert code in this cell.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

# Embedding setup, following
# https://huggingface.co/blog/getting-started-with-embeddings
# Note: a retry decorator (pip install retry) can be put on the query
# function to ride out transient API failures; none is applied here.
model_id = "sentence-transformers/all-mpnet-base-v2"
hf_token = os.getenv("HF_TOKEN")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}
# reviews_list = [review["review"] for review in data["reviews"]]
def query(texts, timeout=300):
    """Embed every review in ``texts`` and append the records to ``processed_data``.

    The text of each review is posted to the Hugging Face feature-extraction
    endpoint (``api_url``) and the decoded JSON response is stored as the
    record's ``"values"``.

    Args:
        texts: Mapping with a ``"reviews"`` list. Each item must provide the
            keys ``"review"``, ``"professor"``, ``"subject"`` and ``"stars"``.
        timeout: Seconds to wait for each HTTP request before giving up.
            Generous by default because ``wait_for_model`` asks the API to
            hold the request until the model is ready.

    Returns:
        The module-level ``processed_data`` list, extended in place with one
        record (``"values"``, ``"id"``, ``"metadata"``) per review.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
        ValueError: If the response body is not a JSON list.
    """
    for review in texts["reviews"]:
        response = requests.post(
            api_url,
            headers=headers,
            json={"inputs": review["review"], "options": {"wait_for_model": True}},
            timeout=timeout,
        )
        # Fail here on an HTTP error rather than storing the error payload as
        # a vector and only finding out when the upsert is rejected.
        response.raise_for_status()

        embedding = response.json()
        if not isinstance(embedding, list):
            raise ValueError(
                f"Unexpected embedding payload for {review['professor']!r}: {embedding!r}"
            )

        processed_data.append(
            {
                "values": embedding,
                # NOTE(review): the professor name is used as the vector id, so
                # two reviews with the same professor would share one id.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )
    return processed_data

# Embed all reviews; query() fills processed_data and returns that same list.
output = query(data)

# Push the embedded records into the "rag" index under namespace "ns1".
index = pc.Index("rag")
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upserted_count = upsert_response['upserted_count']
print(f"Upserted count: {upserted_count}")

# Report the index statistics after the upsert.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}
