In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is rejected by the service, so only
# create "rag" when it is missing. This keeps the cell safe to re-run.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Read the review records from disk. The context manager guarantees the file
# handle is closed, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list of review records.
data['reviews']

[{'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 4,
  'reviews': 'Engaging lectures and challenging assignments. Very knowledgeable in her field.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'reviews': 'Excellent teacher! Makes complex topics easy to understand. Always available for office hours.'},
 {'professor': 'Dr. Emily Rodriguez',
  'subject': 'Psychology',
  'stars': 3,
  'reviews': 'Interesting course material, but lectures can be dry at times. Fair grader.'},
 {'professor': 'Prof. David Thompson',
  'subject': 'History',
  'stars': 4,
  'reviews': "Passionate about the subject. Assigns a lot of reading, but it's all relevant and interesting."},
 {'professor': 'Dr. Lisa Patel',
  'subject': 'Chemistry',
  'stars': 5,
  'reviews': 'Brilliant instructor. Tough exams, but prepares students well. Great lab sessions.'},
 {'professor': 'Prof. Robert Anderson',
  'subject': 'Economics',
  'stars': 2,
  'reviews': "Lectu

In [10]:
# Build one Pinecone record (id + embedding values + metadata) per review.
processed_data = []
client = OpenAI()

reviews = data['reviews']
if reviews:  # skip the API call entirely when there is nothing to embed
    # Embed every review text in a single request rather than one network
    # round-trip per review.
    response = client.embeddings.create(
        input=[review['reviews'] for review in reviews],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of the input it belongs to;
    # order by it so embeddings line up with `reviews`.
    embedded = sorted(response.data, key=lambda item: item.index)
    assert len(embedded) == len(reviews), "expected exactly one embedding per review"

    for review, item in zip(reviews, embedded):
        embedding = item.embedding
        processed_data.append({
            "values": embedding,
            # NOTE(review): the professor name doubles as the vector id, so two
            # reviews for the same professor would share an id — confirm names
            # are unique in reviews.json.
            "id": review['professor'],
            "metadata": {
                "professor": review['professor'],
                "subject": review['subject'],
                "stars": review['stars']
            }
        })

In [11]:
# Sanity check: display the first prepared record (its full embedding vector, id and metadata).
processed_data[0]

{'values': [-0.010007053,
  -0.018906767,
  0.024470797,
  0.033766966,
  -0.001903664,
  0.013718684,
  0.0065859254,
  0.024306746,
  -0.012925776,
  0.03368494,
  0.036911257,
  0.019426258,
  -0.036008984,
  -0.02157258,
  0.02355485,
  0.024935605,
  -0.010314646,
  -0.0068798484,
  0.05047273,
  0.01602222,
  0.050882854,
  -0.0073002265,
  0.041778076,
  -0.0085784495,
  -0.018332591,
  -0.078087814,
  0.005734916,
  -0.009453383,
  -0.0129462825,
  0.013376913,
  0.059768897,
  -0.017895125,
  -0.02075233,
  -0.024867252,
  -0.021845996,
  0.070213415,
  0.0197817,
  0.035216074,
  0.035407465,
  0.018715374,
  0.047820587,
  -0.019809041,
  -0.046836287,
  0.03658316,
  0.0035099871,
  -0.009535409,
  -0.032974057,
  -0.005898966,
  0.058729913,
  0.028407998,
  -0.028900148,
  0.027697114,
  0.039481375,
  -0.027724456,
  -0.06266712,
  -0.01749867,
  0.0011227174,
  0.059222065,
  -0.019385247,
  -0.01177059,
  0.02075233,
  -0.015215641,
  0.02843534,
  0.020027775,
  -0.02

In [13]:
# Get a handle to the "rag" index; later cells reuse `index`.
index = pc.Index("rag")

# Write every prepared record into the "ns1" namespace in one call.
upsert_result = index.upsert(vectors=processed_data, namespace="ns1")

# Last expression of the cell: show the response returned by the upsert.
upsert_result

{'upserted_count': 20}

In [14]:
# Display the index statistics (dimension and per-namespace vector counts) to confirm the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}