In [19]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [20]:
import time

# Name of the Pinecone index this notebook (re)builds from scratch.
INDEX_NAME = "rag"

# Client authenticated with the key loaded from the environment / .env file.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Start from a clean slate, but only delete when the index is actually there:
# an unconditional delete raises on a fresh project and breaks "Run All".
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(name=INDEX_NAME)

# dimension=1536 must match the size of the vectors upserted into this index.
pc.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Index creation is asynchronous; block (with an upper bound) until the index
# reports ready so that later cells do not talk to a half-initialized index.
_ready_deadline = time.monotonic() + 120
while not pc.describe_index(INDEX_NAME).status["ready"]:
    if time.monotonic() > _ready_deadline:
        raise TimeoutError(f"Pinecone index {INDEX_NAME!r} was not ready after 120 seconds")
    time.sleep(1)

In [21]:
import json

# Load the raw review records. The context manager guarantees the file handle
# is closed, and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key.
data["reviews"]


[{'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Johnson is an excellent lecturer who makes complex concepts easy to understand. Her enthusiasm for physics is contagious.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Prof. Chen is incredibly knowledgeable and always available for extra help. His programming assignments are challenging but rewarding.'},
 {'professor': 'Dr. Sarah Martinez',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Dr. Martinez knows her subject well, but her lectures can be dry at times. The lab work is interesting and hands-on.'},
 {'professor': 'Prof. David Thompson',
  'subject': 'History',
  'stars': 2,
  'review': "Prof. Thompson's lectures are disorganized and hard to follow. However, his reading list is excellent and provides good insights into historical events."},
 {'professor': 'Dr. Olivia Parker',
  'subject': 'Psychology',
  'stars': 5,
  'review': 'Dr. P

In [26]:
# Records ready for upsert: one dict per review with "values", "id" and "metadata".
processed_data = []

client = OpenAI(api_key=os.getenv("OPENAI_KEY"))

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches instead of paying one network round trip per review.
EMBED_BATCH_SIZE = 100
reviews = data["reviews"]

for batch_start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[batch_start:batch_start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; order by it so
    # every embedding is paired with the review it was computed from.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name is used as the vector id, so two
            # reviews for the same professor would share an id - confirm names
            # are unique in reviews.json.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        })

In [27]:
# Sanity check: show the first prepared record (a dict with the embedding under
# "values", the professor name under "id", and the review fields under "metadata").
processed_data[0]

{'values': [-0.0006420118,
  -0.008185949,
  -0.02068431,
  0.032514818,
  0.022961363,
  0.010959122,
  -0.007359085,
  0.04437077,
  -6.1319086e-05,
  -0.008249554,
  0.021752872,
  0.02305041,
  -0.039765775,
  0.0060838074,
  0.018763442,
  0.028723966,
  -0.043124113,
  -0.016626317,
  0.038697213,
  0.044497978,
  0.018496301,
  -0.025709094,
  0.04228453,
  -0.011785986,
  -0.033252634,
  -0.05332634,
  0.03475371,
  0.018890651,
  0.049255624,
  -0.0026316522,
  0.07805592,
  0.008115984,
  -0.01094004,
  -0.011238984,
  -0.020010097,
  0.044421654,
  -0.008707509,
  0.008707509,
  -0.010043211,
  -0.010234026,
  -0.02291048,
  -0.01289271,
  -0.027095681,
  0.019259559,
  0.03396501,
  0.0032820122,
  -0.013420631,
  -0.011073611,
  0.048008967,
  0.052919265,
  -0.020010097,
  -0.018254602,
  0.048212502,
  -0.0075308187,
  -0.028316895,
  0.012956316,
  0.03488092,
  0.034295756,
  0.002666635,
  -0.041826572,
  0.028927503,
  -0.0027381903,
  -0.024755022,
  0.008211391,
  

In [28]:
# Get a handle on the 'rag' index, then write every prepared record into the
# "ns1" namespace. The upsert response is the cell's displayed value.
index = pc.Index('rag')
upsert_response = index.upsert(namespace="ns1", vectors=processed_data)
upsert_response

{'upserted_count': 20}

In [29]:
import time

# Stats read immediately after an upsert can lag behind the write (a previous
# run of this cell reported total_vector_count 0 right after upserting 20
# records). Poll for a bounded time until the expected count becomes visible.
expected_count = len(processed_data)
for _ in range(30):
    stats = index.describe_index_stats()
    if stats.total_vector_count >= expected_count:
        break
    time.sleep(1)

# Display whatever was last observed, even if the wait above ran out.
stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}