In [14]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI 
from pinecone import Pinecone, ServerlessSpec


In [9]:
# Initialize the Pinecone client and make sure the "rag" index exists.
# The API key is read from the PINECONE_API_KEY environment variable.
INDEX_NAME = "rag"
EMBEDDING_DIMENSION = 1536  # must match the size of the vectors upserted into this index

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists raises, which breaks re-running this
# cell (or Restart & Run All); only create the index when it is missing.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [13]:
import json

# Load the review data. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the loaded reviews as the cell output.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great lectures, but exams are tough.'},
 {'professor': 'Prof. Jane Doe',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Explains concepts very clearly. Highly recommended!'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Good teacher, but her classes are a bit disorganized.'},
 {'professor': 'Prof. Mark Lee',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Not very engaging and hard to follow.'},
 {'professor': 'Dr. Karen Brown',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Interesting lectures, but too much homework.'},
 {'professor': 'Prof. James Wilson',
  'subject': 'History',
  'stars': 5,
  'review': 'Amazing professor, makes history come alive.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'English',
  'stars': 3,
  'review': 'Decent professor, but very strict on grading.'},
 {'professor': 'Prof. Robert Garcia',
  'subject':

In [15]:
client = OpenAI()

EMBEDDING_MODEL = "text-embedding-3-small"
# Reviews are embedded in batches instead of one request per review; the batch
# size keeps each request small while cutting the number of round trips.
EMBEDDING_BATCH_SIZE = 100

reviews = data["reviews"]

# Create embeddings for each review, one request per batch of review texts.
embeddings = []
for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the index of the input it belongs to; sort on
    # it so embeddings line up with the reviews in this batch.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)

# Build one record per review: the embedding, an id, and the review's metadata.
# NOTE(review): the professor name is used as the record id, so two reviews for
# the same professor would share an id -- presumably the later one replaces the
# earlier one on upsert; confirm ids are unique in reviews.json.
processed_data = [
    {
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        },
    }
    for review, embedding in zip(reviews, embeddings)
]


In [16]:
# Inspect the first processed record (its embedding values, id, and metadata).
processed_data[0]

{'values': [-0.022409441,
  -0.006366472,
  0.009825262,
  -0.036190506,
  0.005030967,
  0.01242189,
  -0.016174829,
  0.008073891,
  0.022179533,
  0.0017006561,
  -0.0046184817,
  -0.019921007,
  -0.028021945,
  -0.032160323,
  -0.01986691,
  -0.025263028,
  -0.015052329,
  -0.028157186,
  -0.016810462,
  0.04062641,
  0.02388357,
  0.004425763,
  0.05723401,
  0.012002643,
  -0.026885921,
  -0.03886828,
  0.023274984,
  0.016634649,
  0.04636063,
  0.040085446,
  0.06816149,
  -0.020353777,
  0.016188353,
  -0.013835159,
  -0.0664845,
  0.0649157,
  0.024911402,
  0.05680124,
  0.03729948,
  -0.00020095975,
  0.01943414,
  0.022030767,
  -0.042303402,
  -0.0037833683,
  0.029725982,
  -0.0047300556,
  -0.0378134,
  -0.040301833,
  0.0181223,
  0.02915797,
  -0.049930997,
  0.061453532,
  0.08357897,
  0.020434922,
  -0.01136701,
  -0.028508814,
  -0.008655427,
  0.03067267,
  -0.0137472525,
  -0.04016659,
  0.060696185,
  -0.02487083,
  0.04863268,
  -0.04108623,
  -0.025560558,
  

In [17]:
# Open a handle to the "rag" index, then write every processed record into
# the "ns1" namespace; the upsert response is shown as the cell output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [18]:
# Display the index statistics (the recorded output lists the dimension and
# the vector count per namespace).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}