In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import json


  from tqdm.autonotebook import tqdm


In [2]:
# Connect to Pinecone using the API key from the environment (loaded from
# .env by load_dotenv() in the imports cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_spec = ServerlessSpec(cloud='aws', region='us-east-1')
index_name = "rag"

# Create the Pinecone index only if it does not exist yet, so this cell can be
# re-run (e.g. Restart Kernel -> Run All) without failing on an
# "index already exists" error.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=index_spec
    )

In [3]:
# Load the reviews from disk. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector, and the
# explicit encoding avoids depending on the platform default.
with open('reviews.json', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Physics',
  'star': 5,
  'review': 'Dr. Smith is an excellent professor who makes complex topics easy to understand.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'star': 4,
  'review': 'Her lectures are very detailed, but sometimes a bit fast-paced.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Computer Science',
  'star': 3,
  'review': 'Dr. Brown knows his material, but his teaching style can be dry.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Biology',
  'star': 5,
  'review': 'Amazing professor with a deep passion for biology!'},
 {'professor': 'Dr. Robert Miller',
  'subject': 'Chemistry',
  'star': 4,
  'review': 'Very knowledgeable, but his exams are quite challenging.'},
 {'professor': 'Dr. Jessica Wilson',
  'subject': 'History',
  'star': 5,
  'review': 'Dr. Wilson brings history to life with her engaging lectures.'},
 {'professor': 'Dr. Daniel Moore',
  'subject': 'Political Science',
  'sta

In [5]:
# Build one vector record per review: embed the review text with OpenAI and
# attach the remaining review fields as metadata.
processed_data = []
client = OpenAI()

for review in data['reviews']:
    # One embeddings request per review; the response carries a single
    # embedding at data[0] because a single input string is sent.
    response = client.embeddings.create(
        input = review['review'],
        model = "text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        # NOTE(review): the professor name is used as the vector id; if a
        # professor ever has more than one review the later record would
        # presumably overwrite the earlier one on upsert -- confirm ids are unique.
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            # Fixed key typo ("subjecct" -> "subject") so consumers that read
            # metadata["subject"] find the value.
            "subject": review["subject"],
            "star": review["star"]
        }
    })

In [6]:
# Sanity check: display the first processed record.
# NOTE(review): this prints the record's entire "values" list, which makes the
# cell output very long -- consider showing only a slice of it.
processed_data[0]

{'values': [-0.012872368,
  -0.002833021,
  -0.05378767,
  0.01159491,
  0.028703056,
  -0.008923864,
  0.011992206,
  0.02387439,
  0.022798637,
  -0.047235355,
  0.035915494,
  -0.0094495155,
  -0.027113875,
  0.021185007,
  0.028923098,
  0.0016854489,
  -0.031759176,
  -0.0048683956,
  0.009388394,
  0.046599682,
  0.034790844,
  -0.019925887,
  0.04136761,
  -0.018740114,
  -0.024436718,
  -0.028018488,
  0.027285019,
  0.035402067,
  0.010286892,
  0.013092408,
  0.06664781,
  -0.002651182,
  0.0041532638,
  -0.0027963477,
  -0.025989225,
  0.009058333,
  -0.030536728,
  -0.005504068,
  -0.012921265,
  0.0066623366,
  0.030634524,
  0.027138324,
  -0.024143329,
  -0.0101218615,
  0.04452152,
  0.0037559685,
  -0.014693813,
  -0.013288,
  0.02239523,
  0.05867746,
  -0.059313133,
  -0.0033311683,
  0.057943992,
  -0.019791419,
  -0.065718755,
  0.056917135,
  0.020940518,
  0.04207663,
  0.007695304,
  -0.04090308,
  0.05559689,
  0.02387439,
  -0.021233905,
  -0.021368375,
  -0.0

In [7]:
# Open a handle to the 'rag' index; `index` is reused by later cells.
index = pc.Index('rag')

# Push every prepared record into namespace "ns1". The call is the cell's last
# expression, so its response is displayed as the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 40}

In [8]:
# Display the index statistics (dimension, per-namespace vector counts, total
# vector count) to verify that the upsert above was recorded.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 40}},
 'total_vector_count': 40}