In [1]:
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
import os
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [2]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(name="rag2", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud='aws', region='us-east-1'))

In [3]:
import json
data = json.load(open('review.json'))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Engaging lectures and helpful office hours. Tough but fair grading.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Excellent at explaining complex concepts. Very approachable and supportive.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Interesting material, but lectures can be disorganized at times.'},
 {'professor': 'Prof. David Brown',
  'subject': 'History',
  'stars': 4,
  'review': 'Passionate about the subject. Assignments are challenging but rewarding.'},
 {'professor': 'Dr. Lisa Martinez',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Clear explanations and well-structured labs. Always willing to help students.'},
 {'professor': 'Prof. Robert Taylor',
  'subject': 'Economics',
  'stars': 2,
  'review': 'Lectures are dry and hard to follow. Exam questions often unclear.'},
 {'professor'

In [4]:
process_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small")
    embedding = response.data[0].embedding
    process_data.append({
        "values": embedding, 
        'id': review['professor'], 
        "metadata": {
            "review": review['review'], 
            "subject": review['subject'],
            "stars": review['stars']
    }})

In [5]:
process_data[0]

{'values': [-0.026965339,
  0.027465183,
  0.029753948,
  0.024755497,
  0.03388425,
  0.0030993696,
  -0.015626742,
  -0.005287837,
  0.0069583724,
  0.05466729,
  0.042197466,
  0.00930633,
  -0.028254412,
  -0.022335192,
  0.008747293,
  0.0012890747,
  -0.0031010138,
  0.029885488,
  0.0036764937,
  0.014955897,
  0.018125968,
  -0.004998453,
  0.06876819,
  -0.00090432534,
  -0.02941195,
  -0.04338131,
  -0.0027014664,
  -0.0155741265,
  0.007195141,
  0.030201178,
  0.06161251,
  -0.021401271,
  0.016784279,
  -0.030543178,
  -0.04156608,
  0.06387497,
  0.014271898,
  0.034515634,
  0.050379142,
  0.027544105,
  -0.006103374,
  0.011930518,
  0.010240251,
  -0.03596255,
  0.013837822,
  -0.022545652,
  0.015021666,
  -0.02453188,
  0.047327455,
  0.019559735,
  -0.033147633,
  0.03127979,
  0.07766017,
  0.008609178,
  -0.043355003,
  -0.008207986,
  0.008201409,
  0.015166358,
  0.002257525,
  -0.0032522827,
  0.038645934,
  -0.025702571,
  0.015626742,
  -0.0155741265,
  -0.02

In [6]:
index = pc.Index('rag2')
index.upsert(
    vectors=process_data,
    namespace="nsl"
)

{'upserted_count': 20}

In [7]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'nsl': {'vector_count': 20}},
 'total_vector_count': 20}