In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai

  from tqdm.autonotebook import tqdm


In [16]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=768, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [3]:
import json
data = json.load(open("reviews.json"))
#data['reviews']
print(len(data['reviews']))

20


In [15]:
#Configure OpenAI client for OpenRouter
openrouter_client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.getenv("OPENROUTER_API_KEY")
)

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Prepare text for encoding
texts = [review['review'] for review in data['reviews']]

def get_embeddings_from_gemini(texts):
    embeddings = []
    for text in texts:
        result = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type="retrieval_document",
            title="Embedding of single string"
        )
        embeddings.append(result['embedding'])
    return embeddings

embeddings = get_embeddings_from_gemini(texts)

In [17]:
processed_data = []
for i, review in enumerate(data['reviews']):
    processed_data.append({
        "values": embeddings[i],  # Ensure this matches the format Pinecone expects
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })
    
processed_data[0]

{'values': [0.008236302,
  0.028782578,
  -0.08337074,
  -0.018050484,
  0.00010225229,
  0.009591954,
  0.03371242,
  0.06473548,
  -0.002184598,
  0.035684604,
  0.060402818,
  0.030195476,
  0.038476054,
  0.035516843,
  -0.016753005,
  -0.07358404,
  0.05685032,
  0.0002107283,
  -0.08202255,
  0.028928902,
  -0.006262331,
  -0.015678657,
  0.057114486,
  -0.041231256,
  -0.0088272765,
  0.014692902,
  0.00645545,
  -0.07881951,
  0.001603456,
  -0.025616718,
  0.07407584,
  0.039109956,
  -0.0135693895,
  -0.015271243,
  0.013257823,
  0.046034187,
  0.017455483,
  -0.015292365,
  0.04922341,
  -0.012967765,
  -0.03218361,
  0.015594818,
  -0.04017848,
  0.050880745,
  -0.043391824,
  -0.018515078,
  0.016485684,
  0.07141408,
  -0.000469389,
  0.047576766,
  0.060544465,
  0.06923844,
  -0.044396784,
  0.034980267,
  -0.017248048,
  -0.02541262,
  -0.034563057,
  -0.043338485,
  0.051655173,
  -0.005071949,
  -0.04574957,
  -0.020482473,
  -0.028233824,
  -0.03908326,
  0.0174467

In [18]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [19]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}