In [2]:
# Environment setup: load_dotenv() runs before any other import or client construction.
# presumably it populates variables such as PINECONE_API_KEY from a local .env file — confirm
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [None]:
# Connect to Pinecone with the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it is missing. create_index fails for a name
# that already exists, which would break a Restart & Run All of this notebook.
# The index stores 1536-dimensional vectors compared with cosine similarity.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Read the review records. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review records.
data['reviews']

[{'professor': 'Dr. Emily Stone',
  'subject': 'Quantum Mechanics',
  'stars': 5,
  'review': 'An excellent professor who makes complex topics understandable. Her lectures are engaging and clear.'},
 {'professor': 'Dr. Michael Harris',
  'subject': 'Calculus II',
  'stars': 3,
  'review': 'Knows the material well, but the lectures can be a bit dry. Office hours are helpful.'},
 {'professor': 'Dr. Sarah Patel',
  'subject': 'Data Structures',
  'stars': 4,
  'review': 'Great professor! Assignments are challenging but fair, and she explains concepts very well.'},
 {'professor': 'Dr. Robert Jenkins',
  'subject': 'Operating Systems',
  'stars': 2,
  'review': 'The lectures were hard to follow, and the exams were tough. Not very approachable.'},
 {'professor': 'Dr. Laura Kim',
  'subject': 'Linear Algebra',
  'stars': 5,
  'review': 'Amazing instructor! She really cares about her students and makes the material accessible.'},
 {'professor': 'Dr. James Wong',
  'subject': 'Software Engineer

In [5]:
# Embed every review text and build the records that are later upserted into Pinecone.
# Each record has the shape {"values": <embedding>, "id": <professor>, "metadata": {...}}.
processed_data = []
client = OpenAI()

EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # number of review texts sent per embeddings request

reviews = data["reviews"]

for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    # One request per batch instead of one per review: the embeddings endpoint
    # accepts a list of inputs, which removes a network round trip per review.
    response = client.embeddings.create(
        input=[entry["review"] for entry in batch],
        model=EMBEDDING_MODEL,
    )
    # Every returned item carries the position of its input; sort on it so each
    # vector is paired with the review it was computed from.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        # NOTE(review): the professor name is used as the vector id, so two reviews
        # for the same professor would share an id — confirm names are unique.
        processed_data.append({
            "values": embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subjects": review["subject"],
                "stars": review["stars"]
            }
        })

In [6]:
# Sanity check: display the first prepared record (keys "values", "id", "metadata").
processed_data[0]

{'values': [-0.01335254,
  -0.0031866771,
  -0.038167424,
  0.025638817,
  -0.008711866,
  -0.005334352,
  -0.0068459036,
  0.043401815,
  0.0063794125,
  -0.021640325,
  0.033926602,
  0.008475591,
  -0.033223834,
  -0.012807292,
  0.012940574,
  0.019422978,
  -0.01858693,
  0.0059371553,
  0.032569535,
  0.03075204,
  0.037343495,
  -0.02609925,
  0.03681036,
  0.0070337113,
  -0.034992866,
  -0.051665366,
  0.0027459343,
  0.026317349,
  0.02573575,
  0.019968228,
  0.0684348,
  -0.0048315115,
  -0.013001158,
  -0.039476022,
  -0.010826221,
  0.025129918,
  -0.017241983,
  -0.016115135,
  -0.0019901586,
  0.022076523,
  0.017678183,
  0.018078031,
  -0.014842887,
  0.023373004,
  0.074735455,
  0.00772436,
  -0.043038316,
  0.0037803927,
  0.04735184,
  0.046624843,
  -0.06082555,
  -0.0053767604,
  0.044928513,
  0.022694472,
  -0.0706158,
  0.04475888,
  0.026317349,
  0.04240825,
  0.006991303,
  -0.033296537,
  0.028183311,
  -0.00682167,
  0.0030094713,
  0.010656588,
  -0.022

In [7]:
# Open a handle to the "rag" index, then write every prepared record into
# namespace "ns1". The upsert call stays last so the cell displays its response.
index = pc.Index("rag")
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [8]:
# Display the index statistics (dimension, per-namespace vector counts) after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}