In [12]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension = 1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [11]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Jane Smith',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Dr. Smith is an incredible professor! Her lectures are engaging and insightful. Highly recommend her classes.'},
 {'professor': 'Dr. Michael Johnson',
  'subject': 'Quantum Physics',
  'stars': 4,
  'review': 'Challenging course, but Dr. Johnson makes complex topics understandable. Exams are tough but fair.'},
 {'professor': 'Prof. Emily Davis',
  'subject': 'World History',
  'stars': 3,
  'review': 'Decent professor, but the lectures can be a bit dry. Requires a lot of reading.'},
 {'professor': 'Dr. William Brown',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': 'Not the best teaching style. Material is difficult and not well explained.'},
 {'professor': 'Prof. Linda Wilson',
  'subject': 'Sociology 101',
  'stars': 5,
  'review': 'Prof. Wilson is amazing! Her passion for sociology is contagious. Learned a lot in her class.'},
 {'professor': 'Dr. Christopher Martinez',


In [13]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model = "text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        }
    })

In [14]:
processed_data[0]

{'values': [-0.008059855,
  -0.012544108,
  -0.05017643,
  0.027566355,
  0.0138893835,
  0.014892439,
  0.008018552,
  0.004543256,
  0.0018320533,
  -0.038210556,
  0.0055404124,
  -0.0047556683,
  0.004153834,
  0.006779482,
  0.018291032,
  0.015376267,
  -0.025206221,
  -0.03122456,
  0.024804998,
  0.051356494,
  0.022185251,
  -0.01525826,
  0.005018233,
  -0.023860946,
  -0.030138899,
  -0.037124895,
  0.030964946,
  0.049137972,
  0.015541476,
  0.021052387,
  0.075524256,
  0.002535668,
  -0.011936373,
  -0.020179138,
  -0.035756014,
  0.02377834,
  -0.02768436,
  0.03068173,
  0.015234659,
  -0.008791496,
  0.03582682,
  -0.014137197,
  -0.017960614,
  -0.0046583125,
  0.021548014,
  -0.008396174,
  0.0034723459,
  -0.03703049,
  0.021253,
  0.034457944,
  -0.055887952,
  0.0042511895,
  0.033938713,
  -0.03264064,
  -0.061693877,
  0.029407257,
  -0.01580109,
  0.06825505,
  0.010567496,
  -0.026834713,
  0.02820359,
  0.015423469,
  -0.01577749,
  -0.0069387914,
  -0.01128

In [16]:
index = pc.Index('rag')
index.upsert(
    vectors = processed_data,
    namespace = "ns1",
)

{'upserted_count': 20}

In [17]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}