In [4]:
#run this in case kernel doesn't have the required libraries
%pip install python_dotenv
%pip install openai
%pip install "pinecone-client[grpc]"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting pinecone-client[grpc]
  Using cached pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client[grpc])
  Using cached pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client[grpc])
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting urllib3>=1.26.0 (from pinecone-client[grpc])
  Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting googleapis-common-protos>=1.53.0 (from pinecone-client[grpc])
  Using cached googleapis_common_protos-1.63.2-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting grpcio>=1.44.0 (from pinecone-client[grpc])
  Downloading grpcio-1.66.0-cp310-cp310-win_amd64.whl.metadata (4.0 kB)
Collecting lz4>=3.1.3 (from pinecon

In [5]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import json

In [2]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create a Pinecone index
pc.create_index(
    name="rag",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [4]:
# Load the review data
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and clear explanations. Challenging but fair exams.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Excellent teacher! Makes complex concepts easy to understand.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Interesting content, but sometimes disorganized in class.'},
 {'professor': 'Prof. David Brown',
  'subject': 'History',
  'stars': 4,
  'review': 'Passionate about the subject. Assignments can be time-consuming.'},
 {'professor': 'Dr. Lisa Garcia',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Brilliant researcher and inspiring teacher. Highly recommended!'},
 {'professor': 'Prof. Robert Taylor',
  'subject': 'Mathematics',
  'stars': 2,
  'review': 'Difficult to follow in class. Office hours are helpful though.'},
 {'professor': 'Dr. Amanda Lee',
  'subject': 'English Liter

In [6]:
processed_data = []
client = OpenAI()

# Create embeddings for each review
for review in data["reviews"]:
    response = client.embeddings.create(
        input=review['review'], model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

In [7]:
processed_data[0]

{'values': [-0.01611612,
  0.0020509618,
  0.010668977,
  0.011643101,
  0.031861145,
  0.019455973,
  -0.03869989,
  0.011152726,
  0.011258753,
  0.053649712,
  0.042490363,
  -0.014499206,
  -0.053861767,
  -0.010993686,
  -0.0018124008,
  -0.006964655,
  0.0005069421,
  0.02860081,
  0.011106339,
  0.030138204,
  0.044266317,
  -0.012444931,
  0.05656546,
  0.017786046,
  -0.033743124,
  -0.073635824,
  -0.0070640557,
  -0.02347175,
  -0.013677496,
  0.028680332,
  0.08434456,
  -0.015029342,
  0.0049136933,
  -0.0137967765,
  -0.044822957,
  0.0794143,
  0.021311447,
  0.024479007,
  0.025419997,
  0.003308377,
  0.005500156,
  0.006692961,
  -0.028468277,
  -0.04800377,
  0.02088734,
  -0.020860832,
  -0.0017759539,
  -0.02699715,
  0.038673386,
  0.016500467,
  -0.014499206,
  0.03862037,
  0.09319782,
  0.011232246,
  -0.03936256,
  -0.0056857034,
  0.013637736,
  0.04643987,
  -0.017812554,
  0.00483086,
  0.06775132,
  -0.030482791,
  0.020635525,
  0.0044995253,
  -0.0145257

In [8]:
# Insert the embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

Upserted count: 20


In [9]:
# Print index statistics
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}
