In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", 
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [5]:
import json
data = json.load(open('reviews.json'))
data['reviews']

[{'professor': 'Dr. Emily Carter',
  'subject': 'Physics 101',
  'stars': 5,
  'review': 'Dr. Carter explains complex concepts very clearly. Her lectures are engaging and the problem sets are challenging but fair.'},
 {'professor': 'Prof. John Smith',
  'subject': 'Calculus II',
  'stars': 4,
  'review': 'Prof. Smith is very knowledgeable and patient. The exams are tough but manageable if you follow along in class.'},
 {'professor': 'Dr. Susan Lee',
  'subject': 'Introduction to Philosophy',
  'stars': 3,
  'review': "Dr. Lee's lectures can be a bit dry, but she is always open to questions and provides good feedback on essays."},
 {'professor': 'Prof. Michael Brown',
  'subject': 'Computer Science 101',
  'stars': 2,
  'review': 'Prof. Brown is difficult to understand and his assignments are poorly explained. The grading is harsh.'},
 {'professor': 'Dr. Karen Wilson',
  'subject': 'Organic Chemistry',
  'stars': 4,
  'review': 'Dr. Wilson is an excellent professor who really cares abou

In [8]:
processed_data = []
client= OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",    
    )

    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [9]:
processed_data[0]

{'values': [-0.014843395,
  -0.022382278,
  -0.019114126,
  0.038358457,
  0.04619681,
  0.024387438,
  -0.0077211694,
  0.030598227,
  -0.015364216,
  0.004264221,
  0.03963447,
  -0.0054718745,
  -0.00649073,
  -0.019270374,
  0.023567146,
  0.020038584,
  0.009394307,
  -0.0269004,
  0.015403277,
  0.019530782,
  0.015976181,
  -0.027655588,
  0.063175574,
  0.009544043,
  -0.058279857,
  -0.041795876,
  0.052212294,
  0.039478224,
  -0.006913897,
  0.012291373,
  0.087862484,
  -0.017616766,
  -0.009114365,
  -0.051977925,
  -0.022733832,
  0.08879996,
  -0.01496058,
  0.026718112,
  0.013528323,
  0.021744272,
  0.0072654514,
  0.03744702,
  -0.051561266,
  -0.022030724,
  0.03721265,
  0.0020995592,
  -0.02841078,
  0.0028156878,
  0.017838115,
  0.02768163,
  -0.06614425,
  0.035363737,
  0.03708245,
  -0.023306735,
  -0.056248654,
  0.042368777,
  0.019504743,
  0.03669183,
  0.0056769475,
  -0.024153069,
  0.048748836,
  -0.012447619,
  -0.01248668,
  -0.010142987,
  -0.031822

In [10]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [11]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}