In [25]:
from dotenv import load_dotenv
load_dotenv()
import os 
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec



In [26]:
# Build the Pinecone client from the PINECONE_API_KEY environment variable
# (populated from .env by the load_dotenv() call in the import cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it is missing, so re-running this cell
# (e.g. Restart Kernel -> Run All) does not fail because the index already exists.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [27]:
import json

# Read the reviews inside a context manager so the file handle is closed
# as soon as parsing finishes; JSON is decoded as UTF-8 regardless of the
# platform's default encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Dr. Johnson is an excellent professor who really engages her students. Her lectures are always interesting and she is always available to help during office hours.'},
 {'professor': 'Dr. Samantha Chen',
  'subject': 'Organic Chemistry',
  'stars': 5,
  'review': "Dr. Chen is the best professor I've had. Her teaching style is clear and engaging, and she is always willing to help students who are struggling. I would highly recommend taking any of her classes."},
 {'professor': 'Professor Michael Williams',
  'subject': 'Intro to Computer Science',
  'stars': 2,
  'review': "I did not enjoy Professor Williams' class at all. The lectures were boring and he didn't seem to care much about whether the students were learning the material. I would not take another class with him."},
 {'professor': 'Dr. Olivia Davis',
  'subject': 'Developmental Psychology',
  'stars': 4,
  'review': 'Dr. Da

In [28]:
# Embed every review and build the records that will be upserted into Pinecone.
# Reviews are sent to the embeddings endpoint in batches (it accepts a list of
# inputs) instead of one network round-trip per review.
processed_data = []
client = OpenAI()

reviews = data['reviews']
BATCH_SIZE = 100  # reviews per embeddings request

for start in range(0, len(reviews), BATCH_SIZE):
    batch = reviews[start:start + BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small"
    )
    # Every returned item carries the position of its input; order by it so
    # each embedding is paired with the review it was computed from.
    ordered = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name is used as the vector id, so two
            # reviews for the same professor would share an id — confirm names
            # are unique in reviews.json.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })



In [29]:
# Spot-check the first record: it should carry the "values", "id" and "metadata" keys.
processed_data[0]

{'values': [0.0026059756,
  -0.002619093,
  0.04346791,
  0.042348564,
  0.002315937,
  -0.00760222,
  0.030921914,
  0.047548853,
  -0.00910051,
  0.022258647,
  0.024042604,
  -0.0038069398,
  -0.028823141,
  0.027190762,
  0.003343461,
  0.037568025,
  -0.021652335,
  -0.0022795,
  0.023447951,
  0.045846518,
  0.013980156,
  -0.016883457,
  0.020486351,
  -0.035469253,
  -0.02282998,
  -0.029849207,
  0.0071241665,
  0.007357363,
  0.039783396,
  -0.0063429563,
  0.08646942,
  0.0053518694,
  -0.024718875,
  -0.012021302,
  -0.02050967,
  0.01902887,
  0.02810023,
  -0.0018174786,
  0.014259992,
  0.00076954986,
  -0.012627614,
  0.0065120244,
  -0.006686922,
  0.019635182,
  0.031458266,
  0.005611301,
  0.017104995,
  -0.019262066,
  0.04414418,
  0.061424073,
  -0.02961601,
  -0.015612534,
  0.03206458,
  -0.03320724,
  -0.029289534,
  0.0011842032,
  0.014388251,
  0.021943832,
  0.020672908,
  -0.021046024,
  0.038943887,
  -0.004626044,
  -0.022969898,
  -0.01668524,
  -0.023

In [30]:
# Open a handle to the "rag" index and write every prepared record
# into the "ns1" namespace; the upsert response is shown as the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [31]:
# Fetch the index statistics to check the vector count reported for each namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}