In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from pinecone import Pinecone, ServerlessSpec

api_key = os.getenv("PINECONE_API_KEY")
print(f"API Key: {api_key}")  # This will print your API key - be careful not to share this output

pc = Pinecone(api_key=api_key)

if 'rag' in pc.list_indexes().names():
    print("Deleting existing 'rag' index...")
    pc.delete_index("rag")

print("Creating new 'rag' index...")
pc.create_index(
    name='rag',
    dimension=384,
    metric='cosine',
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

# Verify the new index
index = pc.Index("rag")
index_stats = index.describe_index_stats()
print(f"New index dimension: {index_stats.dimension}")

API Key: 5186cb06-98ae-4db6-9b16-6eb0e29cad83
Deleting existing 'rag' index...
Creating new 'rag' index...
New index dimension: 384


In [3]:
# Load your data
with open("reviews.json") as f:
    data = json.load(f)
    print(data)

# Initialize Hugging Face tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    # Tokenize input text and obtain model outputs
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Compute mean of the last hidden states to get embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()
    return embeddings

# Process data and generate embeddings
processed_data = []
for review in data["reviews"]:
    embedding = get_embedding(review['review'])
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

{'reviews': [{'professor': 'Dr. Sarah Johnson', 'subject': 'Physics', 'stars': 4, 'review': 'Dr. Johnson explains complex concepts clearly. Her enthusiasm for physics is contagious!'}, {'professor': 'Prof. Michael Chen', 'subject': 'Computer Science', 'stars': 5, 'review': "Brilliant teacher! Prof. Chen's practical approach to coding made the course incredibly valuable."}, {'professor': 'Dr. Emily Rodriguez', 'subject': 'Biology', 'stars': 3, 'review': 'Knowledgeable, but lectures can be dry. More interactive sessions would be helpful.'}, {'professor': 'Prof. David Kim', 'subject': 'Mathematics', 'stars': 4, 'review': 'Challenging course, but Prof. Kim is always available for extra help. Appreciate his patience.'}, {'professor': 'Dr. Lisa Patel', 'subject': 'Chemistry', 'stars': 5, 'review': "Dr. Patel's lab sessions are fantastic! She makes organic chemistry understandable and fun."}, {'professor': 'Prof. James Wilson', 'subject': 'History', 'stars': 2, 'review': 'Lectures are disorga



In [4]:
processed_data[0]

{'values': [-0.06525049358606339,
  -0.1752854287624359,
  0.14954109489917755,
  0.5691870450973511,
  -0.0316300205886364,
  -0.450634241104126,
  0.1190328374505043,
  0.08532914519309998,
  -0.006843383423984051,
  0.20580554008483887,
  -0.21674731373786926,
  0.08734722435474396,
  -0.23370248079299927,
  0.2023642510175705,
  -0.20096904039382935,
  -0.024719974026083946,
  -0.00418092543259263,
  0.17435552179813385,
  -0.5594282150268555,
  0.19380870461463928,
  0.017962424084544182,
  0.06267759948968887,
  0.31484082341194153,
  0.10961011797189713,
  -0.06372778117656708,
  0.2768474817276001,
  0.08036009967327118,
  0.0034096555318683386,
  0.1395070105791092,
  0.08986468613147736,
  0.08472255617380142,
  0.44486045837402344,
  -0.32222458720207214,
  0.21782149374485016,
  -0.16375066339969635,
  0.27343785762786865,
  -0.053464796394109726,
  0.13098804652690887,
  0.24649234116077423,
  -0.09748782962560654,
  -0.1599610596895218,
  -0.22449393570423126,
  0.2335939

In [5]:
import time
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Wait for a few seconds
time.sleep(5)

# Check stats again
print(index.describe_index_stats())

Upserted count: 20
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}


In [6]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}