In [2]:
from dotenv import load_dotenv
load_dotenv()

import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
# Pinecone client; the API key is read from the environment (populated by
# load_dotenv() in the first cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Only create the index when it does not exist yet, so this cell can be
# re-run (or the notebook restarted and run top to bottom) without failing
# on an "index already exists" error.
index_name = "rag"
if index_name not in pc.list_indexes().names():
  pc.create_index(
    name=index_name,
    dimension=1536,  # must equal the length of the embedding vectors upserted later
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
  )

In [10]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
  data = json.load(reviews_file)

# Last expression of the cell: display the list of review records.
data["reviews"]

[{'id': '1',
  'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'comment': 'Dr. Smith is an outstanding professor who makes complex topics easy to understand. Highly recommended!'},
 {'id': '2',
  'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 4,
  'comment': 'Dr. Johnson explains concepts clearly and provides great support, but her exams are quite challenging.'},
 {'id': '3',
  'professor': 'Dr. Alan Brown',
  'subject': 'History',
  'stars': 3,
  'comment': 'Dr. Brown is knowledgeable, but his lectures can be a bit dry. The course material is interesting, though.'},
 {'id': '4',
  'professor': 'Dr. Laura Williams',
  'subject': 'Physics',
  'stars': 5,
  'comment': 'Dr. Williams is an excellent instructor. Her passion for physics is contagious, and she makes learning fun!'},
 {'id': '5',
  'professor': 'Dr. Michael Davis',
  'subject': 'Chemistry',
  'stars': 4,
  'comment': 'Dr. Davis is very thorough and ensures everyone unde

In [15]:
# Build one upsert-ready record per review: the embedding of the comment as
# "values", the professor name as "id", and the remaining fields as metadata.
processed_data = []
client = OpenAI()

# Embed comments in batches rather than issuing one request per review: the
# embeddings endpoint accepts a list of inputs, so this cuts the number of
# network round-trips. An empty review list results in no request at all.
reviews = data["reviews"]
embed_batch_size = 100

for start in range(0, len(reviews), embed_batch_size):
  batch = reviews[start:start + embed_batch_size]
  response = client.embeddings.create(
    input = [review["comment"] for review in batch],
    model = "text-embedding-3-small",
  )
  # Each result carries the position of its input in `index`; sort on it so
  # the pairing with `batch` does not depend on the order of the response list.
  results = sorted(response.data, key=lambda item: item.index)
  for review, item in zip(batch, results):
    processed_data.append({
      "values": item.embedding,
      # NOTE(review): the professor name is the vector id, so two reviews for
      # the same professor would share an id — confirm that is intended.
      "id": review["professor"],
      "metadata": {
        "review": review["comment"],
        "subject": review["subject"],
        "stars": review["stars"]
      }
    })

In [16]:
# Inspect the first prepared record: a dict with "values" (the embedding),
# "id" and "metadata". The displayed output is long because the whole
# embedding list is printed.
processed_data[0]

{'values': [-0.018445065,
  -0.011218592,
  -0.07042885,
  0.015980793,
  0.025677698,
  -0.0011605173,
  -0.0008717357,
  0.026096625,
  0.016867932,
  -0.050813258,
  0.027649114,
  -0.0064748707,
  -0.02150076,
  0.01084279,
  0.030680168,
  0.015869902,
  -0.035115853,
  -0.00035828183,
  0.02395271,
  0.065007456,
  0.024112886,
  -0.022400219,
  0.03188766,
  -0.0109352,
  -0.0209463,
  -0.032380514,
  0.019800413,
  0.044529367,
  0.015709724,
  0.001812779,
  0.058551066,
  0.0009641458,
  -0.0051780483,
  -0.017077394,
  -0.041153315,
  0.016559897,
  -0.025899483,
  0.021968972,
  -0.0076638814,
  0.0034561392,
  0.027230188,
  0.011526626,
  -0.019837378,
  -0.0076392386,
  0.032553013,
  0.008064325,
  -0.011600553,
  -0.019603271,
  0.018087745,
  0.039822612,
  -0.06806315,
  -0.0056739827,
  0.034746215,
  -0.023730924,
  -0.063528895,
  0.062099617,
  0.019184345,
  0.040586535,
  0.0101096695,
  -0.027106976,
  0.05273539,
  0.032134086,
  -0.024396278,
  -0.006505674,

In [17]:
# Get a handle on the 'rag' index and write every prepared record into the
# "ns1" namespace.
index = pc.Index('rag')

# Trailing expression: the upsert response is shown as the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [18]:
# Display the index statistics (dimension, per-namespace vector counts) to
# check the result of the upsert above.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}