In [20]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

pc.create_index(
    name="rag",
    dimension=1536, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [18]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Explains concepts clearly but can be strict with deadlines.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Makes math fun and easy to understand. Highly recommend!'},
 {'professor': 'Dr. William Brown',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Knowledgeable but lectures are a bit dry.'},
 {'professor': 'Dr. Jessica Davis',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Engaging and enthusiastic about the subject.'},
 {'professor': 'Dr. Michael Wilson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Very helpful and offers great coding tips.'},
 {'professor': 'Dr. Sarah Moore',
  'subject': 'History',
  'stars': 2,
  'review': 'Hard to follow, lectures can be disorganized.'},
 {'professor': 'Dr. David Taylor',
  'subject': 'Philosophy',
  'stars': 4,
  'review': 'Encourages critical thinking and discussions.'},
 {'professor': 'Dr

In [21]:
processed_data = []
client = OpenAI()

for rev in data['reviews']:
    response = client.embeddings.create(
        input=rev['review'],
        model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": rev["professor"],
        "metadata": {
            "review": rev["review"],
            "subject": rev["subject"],
            "stars": rev["stars"]
        }
    })


In [22]:
processed_data[0]

{'values': [0.0054207947,
  0.03862835,
  0.03370927,
  -0.03636497,
  0.022950672,
  -0.004515443,
  -0.042491183,
  0.014681791,
  0.013338852,
  0.04653509,
  0.0034365652,
  0.020234616,
  -0.00894035,
  -0.017775076,
  0.01735258,
  -0.039081026,
  -0.03597265,
  -0.02353915,
  0.010494538,
  0.03708925,
  0.0035553926,
  0.0024482226,
  0.020792916,
  0.064340346,
  -0.026783329,
  0.013474654,
  0.020249706,
  0.0114602465,
  0.04731973,
  0.023131741,
  0.0621675,
  -0.023267545,
  0.021592643,
  -0.038175672,
  -0.001372174,
  0.07067781,
  0.04520724,
  0.037179787,
  0.023765488,
  0.023795666,
  -0.039473344,
  0.015919104,
  -0.03370927,
  -0.020460954,
  -0.004907762,
  0.010343646,
  -0.01602473,
  0.01735258,
  0.04656527,
  0.03902067,
  -0.007687947,
  0.030932859,
  0.04686705,
  0.027220914,
  -0.023795666,
  0.016749011,
  0.026104314,
  0.021049432,
  0.033256594,
  -0.004153302,
  0.054109868,
  -0.037753176,
  0.015164645,
  -0.0036100908,
  -0.03896031,
  -0.02

In [23]:
index = pc.Index('rag')

index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [24]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}