In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [12]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name = "rag", dimension = 1536, metric = "cosine", spec = ServerlessSpec(cloud="aws", region="us-east-1")
)

In [13]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Johnson's lectures are engaging and she explains complex concepts clearly. Very approachable during office hours."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant professor! His passion for programming is contagious. Challenging assignments, but extremely rewarding.'},
 {'professor': 'Dr. Sarah Thompson',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Interesting course material, but lectures can be dry. Grading is fair, though feedback is sometimes delayed.'},
 {'professor': 'Prof. Robert Garcia',
  'subject': 'History',
  'stars': 4,
  'review': 'Prof. Garcia brings history to life with his storytelling. Exams are tough but fair. Highly recommended!'},
 {'professor': 'Dr. Amanda Lee',
  'subject': 'Chemistry',
  'stars': 2,
  'review': "Course content is difficult, and Dr. Lee's teaching style doesn't help. Lab sessions are dis

In [14]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model = "text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review['professor'],
        "metadata":{
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [10]:
processed_data[0]

{'values': [-0.0059525007,
  -6.742035e-06,
  0.03189642,
  0.024431018,
  0.0142305065,
  -0.0024857142,
  0.015591456,
  0.057291996,
  -0.004294257,
  0.0071482863,
  0.01923827,
  -0.0041423063,
  -0.014666539,
  0.012374067,
  -0.005192748,
  0.03057511,
  -0.034406908,
  -0.022937939,
  0.035596088,
  0.0618373,
  0.03501471,
  -0.016238898,
  0.032292813,
  -0.001374988,
  -0.028778128,
  -0.04191195,
  0.010596905,
  0.012763853,
  0.027800359,
  -0.018207649,
  0.07510325,
  -0.007247384,
  -0.034750447,
  -0.036441725,
  -0.039163623,
  0.05142538,
  0.009995709,
  0.020586006,
  0.014322999,
  0.011660559,
  0.010068381,
  0.0074257613,
  -0.022488693,
  -0.01290259,
  0.028672423,
  0.016873127,
  -0.00095960125,
  -0.015234702,
  0.046377975,
  0.053328063,
  -0.039507166,
  -0.012010707,
  0.02217158,
  -0.024113905,
  -0.050685447,
  -0.01761306,
  0.015036506,
  0.02379679,
  0.0031513239,
  -0.029544488,
  0.049998365,
  -0.016622078,
  -0.021008827,
  -0.0030092832,
 

In [15]:
index = pc.Index('rag')
index.upsert(
    vectors = processed_data,
    namespace = "ns1"
)


{'upserted_count': 20}

In [16]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}