In [1]:
from dotenv import load_dotenv
load_dotenv(dotenv_path='.env.local')
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
import json

  from tqdm.autonotebook import tqdm


In [6]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [None]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

In [9]:
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input=review["review"],
        model="text-embedding-3-small",
    )
    embedding=response.data[0].embedding
    processed_data.append({
        "values":embedding,
        "id": review["professor"],
        "metadata":{
            "review": review["review"],
            "subject":review["subject"],
            "stars": review["stars"]
        }
    })


In [10]:
processed_data[0]

{'values': [-0.0375732,
  0.023250125,
  -0.012495388,
  0.010611756,
  0.030859258,
  -0.00048334152,
  0.010387957,
  -0.011998059,
  0.012091309,
  0.0048738234,
  0.028347747,
  0.023635555,
  -0.02098728,
  -0.034166496,
  0.034564357,
  -0.012538905,
  -0.005644683,
  -0.044859067,
  -0.016225355,
  0.040656637,
  0.0017095681,
  -0.024095586,
  0.012893252,
  0.023088494,
  -0.023324726,
  -0.04279515,
  0.013315981,
  0.003661584,
  0.03739913,
  0.04834037,
  0.0332713,
  -0.0057627987,
  0.020726182,
  0.0015634777,
  -0.024704812,
  0.052617397,
  -0.019296361,
  0.06420516,
  0.029914333,
  0.014410105,
  -0.011643712,
  -0.0053649354,
  -0.019545026,
  -0.014534437,
  0.036678005,
  -0.017406512,
  -0.026035167,
  -0.029640803,
  0.0376478,
  0.016349688,
  -0.041054502,
  0.050056152,
  0.05589977,
  0.016847016,
  -0.007932396,
  -0.014385238,
  -0.022280335,
  0.029218072,
  -0.0063409433,
  -0.027179025,
  0.030759793,
  -0.0071242363,
  0.031853914,
  -0.028099082,
  

In [11]:
index = pc.Index("rag")
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}

In [None]:
import os
print(os.getenv("PINECONE_API_KEY"))

In [None]:
print(os.getenv("OPENAI_API_KEY"))