In [9]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec



In [6]:
# Connect to Pinecone using the API key loaded from the environment (.env).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index raises if the index already exists, so only create it when it
# is missing. This keeps the cell safe to re-run (Restart & Run All).
if 'rag' not in pc.list_indexes().names():
    pc.create_index(
        name='rag',
        dimension=1536,  # vector size of the text-embedding-3-small embeddings stored in this index
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [7]:
import json

# Load the professor reviews. The context manager closes the file handle
# deterministically (the bare open() call leaked it), and the explicit
# encoding avoids depending on the platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review records.
data['reviews']

[{'professor': 'Dr. Emily Carter',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Carter is incredibly knowledgeable and passionate about physics. The lectures are engaging, though sometimes the material can be challenging.'},
 {'professor': 'Prof. James Mitchell',
  'subject': 'Calculus',
  'stars': 3,
  'review': 'Prof. Mitchell is a good teacher, but his explanations can be a bit unclear at times. Office hours are helpful.'},
 {'professor': 'Dr. Linda Wong',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Dr. Wong makes chemistry fun and easy to understand. Her enthusiasm for the subject is contagious.'},
 {'professor': 'Dr. Richard Lee',
  'subject': 'Biology',
  'stars': 2,
  'review': "Dr. Lee's lectures are dry and not very engaging. The course material is interesting, but his teaching style is not very effective."},
 {'professor': 'Prof. Susan Clark',
  'subject': 'History',
  'stars': 4,
  'review': "Prof. Clark brings history to life with her storytelling. The as

In [12]:
# Build one Pinecone record per review: the embedding of the review text as
# 'values', the professor name as 'id', and the remaining fields as metadata.
processed_data = []
client = OpenAI()

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches instead of paying one HTTP round trip per review.
EMBED_BATCH_SIZE = 100
reviews = data['reviews']

for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model='text-embedding-3-small',
    )

    # Each returned item carries the index of the input it was computed from;
    # use it to pair embeddings with reviews and to keep the original order.
    for item in sorted(response.data, key=lambda entry: entry.index):
        review = batch[item.index]
        processed_data.append({
            'values': item.embedding,
            'id': review['professor'],
            'metadata': {
                'review': review["review"],
                'subject': review["subject"],
                'stars': review["stars"]
            }
        })

In [13]:
# Spot-check the first prepared record ('values', 'id', 'metadata') before upserting.
# Note: this displays the full embedding vector, so the output is long.
processed_data[0]

{'values': [-0.016342942,
  0.005649806,
  -0.054084923,
  0.023710033,
  0.04412594,
  0.050739728,
  -0.010029203,
  0.02354405,
  -0.0040761596,
  -0.0056019262,
  -0.013138193,
  0.024016464,
  -0.005435943,
  0.007807585,
  0.028549077,
  0.019483851,
  0.010010052,
  -0.02225449,
  0.011101709,
  0.016981337,
  0.0025376247,
  -0.026033795,
  0.045479342,
  -0.0042261826,
  -0.06940643,
  -0.015819456,
  0.06761892,
  0.033349816,
  0.022637527,
  0.018513488,
  0.096729785,
  -0.016151423,
  -0.014593736,
  -0.027629785,
  -0.044100408,
  0.071704656,
  -0.001163477,
  0.028804434,
  0.017006874,
  0.0063456576,
  0.026199779,
  0.034345713,
  -0.058426015,
  0.00029645523,
  0.010648448,
  -0.04604113,
  -0.028625684,
  -0.004682636,
  0.025050664,
  0.0021019194,
  -0.03674608,
  0.030719623,
  0.046373095,
  0.0017332456,
  -0.025318792,
  0.054442424,
  0.0022040626,
  0.0019534922,
  0.009971748,
  -0.027527642,
  0.027221212,
  0.012761539,
  -0.017109018,
  -0.05740458,
 

In [14]:
# Open a handle to the 'rag' index, then write every prepared record into
# the "ns1" namespace; the upsert response is shown as the cell output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 21}

In [15]:
# Query the index statistics to confirm the upsert: the per-namespace vector
# count should match the number of records written above.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}