In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

  from tqdm.autonotebook import tqdm


In [19]:
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

pc.create_index(
    name="rag",
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [3]:
import json

data = json.load(open('reviews.json'))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Excellent professor, explains complex topics very clearly and is always available for extra help.'},
 {'professor': 'Dr. Michael Smith',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Good lecturer, but sometimes goes too fast. Very knowledgeable.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Decent instructor, but her grading is quite tough.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Engaging lectures, but exams are tricky. Study hard!'},
 {'professor': 'Dr. Angela Davis',
  'subject': 'History',
  'stars': 5,
  'review': 'Incredible professor! Makes history come alive. Highly recommend.'},
 {'professor': 'Dr. James Wilson',
  'subject': 'Philosophy',
  'stars': 2,
  'review': 'Not the best communicator. Lectures can be hard to follow.'},
 {'professor': 'Dr. Karen Martinez',
  'subject': 'Economics',
  's

In [5]:
import requests

model_id = "sentence-transformers/all-MiniLM-L6-v2"
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}

def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
    return response.json()

In [6]:
processed_data = []

for review in data["reviews"]:
    embedding = query(review['review'])
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

In [8]:
processed_data[0]

{'values': [-0.031288567930459976,
  0.02630036137998104,
  -0.03574352711439133,
  -0.00034446644713170826,
  -0.036232393234968185,
  -0.021974170580506325,
  -0.03027976304292679,
  0.037551477551460266,
  -0.07941596955060959,
  -0.015423104166984558,
  -0.015869364142417908,
  0.043549664318561554,
  -0.0695163682103157,
  0.07969341427087784,
  0.0004543155082501471,
  -0.02378178760409355,
  -0.00823245383799076,
  -0.07009415328502655,
  -0.054169755429029465,
  -0.07645558565855026,
  0.05924540385603905,
  -0.015163403935730457,
  0.04527553170919418,
  -0.06860974431037903,
  -0.0007703079027123749,
  -0.03803900256752968,
  0.020474208518862724,
  -0.026106571778655052,
  0.10894818603992462,
  -0.00905967690050602,
  -0.043471843004226685,
  0.09151977300643921,
  0.009875892661511898,
  0.01409875974059105,
  -0.015470619313418865,
  0.05787762999534607,
  0.0010940900538116693,
  0.11129643023014069,
  0.029971672222018242,
  -0.0062044295482337475,
  -0.0120516810566186

In [20]:
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

Upserted count: 20


In [21]:
print(index.describe_index_stats())

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}
