In [2]:
from dotenv import load_dotenv

load_dotenv()

import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai


  from tqdm.autonotebook import tqdm


In [20]:
# Connect to Pinecone using the API key loaded from the environment (.env).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the Pinecone index -- one separate database inside the Pinecone instance.
# NOTE: always make sure the index dimension matches the dimension of the
#       embedding model:
#         * 1536 is for OpenAI
#         * 768 seems to be for Gemini (deviation from the video, since we
#           use genai instead of openai)
# Creating an index whose name already exists raises an error, so only create
# it when it is missing. This keeps the cell safe to re-run on a fresh kernel.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [5]:
# Read the reviews file.
import json

# Use a context manager so the file handle is closed deterministically, and
# pin the encoding so the result does not depend on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data["reviews"]


[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'reviews': "Dr. Johnson's lectures are engaging and informative. She really knows how to break down complex concepts."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'reviews': 'Brilliant professor! His passion for coding is contagious. Challenging but rewarding classes.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Psychology',
  'stars': 3,
  'reviews': 'Interesting material, but sometimes the pace is too fast. Office hours are helpful.'},
 {'professor': 'Prof. Robert Taylor',
  'subject': 'History',
  'stars': 4,
  'reviews': 'Prof. Taylor brings history to life with his storytelling. Assignments can be tough but fair.'},
 {'professor': 'Dr. Lisa Martinez',
  'subject': 'Chemistry',
  'stars': 5,
  'reviews': "Best chemistry professor I've had. Clear explanations and fun lab experiments."},
 {'professor': 'Prof. David Wilson',
  'subject': 'Mathematics',
  'st

In [16]:
# NOTE: Deviation from the video -- embeddings are created with Gemini
# instead of OpenAI. Alternatives that were considered:
#   * OpenAI, as in the video: client = OpenAI(), then
#     client.embeddings.create(input=..., model="text-embedding-3-small")
#     and response.data[0].embedding.
#   * OpenRouter (OpenAI client with base_url="https://openrouter.ai/api/v1"
#     and OPENROUTER_API_KEY), model "meta-llama/llama-3.1-8b-instruct:free":
#     does not work, it seems to have no embeddings support. Feel free to
#     retry with another model.

# Gemini API setup
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel('gemini-1.5-flash')

# An embedding captures the semantic representation of a text in numerical
# form, i.e. how closely related texts are to each other. It is what we use
# to search for similar strings.
#
# Steps: take each review, convert its text into an embedding, and wrap it in
# a record. The metadata layout is written for the database used (Pinecone).
processed_data = []
for review in data["reviews"]:
    review_text = review["reviews"]

    embedding = genai.embed_content(
        model="models/text-embedding-004",
        content=review_text,
    )["embedding"]

    metadata = {
        "review": review_text,
        "subject": review["subject"],
        "stars": review["stars"],
    }
    record = {
        "values": embedding,
        "id": review["professor"],
        "metadata": metadata,
    }
    processed_data.append(record)

# Show the first processed record.
processed_data[0]

[{'values': [0.05190275,
   0.0016420824,
   -0.010754086,
   -0.010439071,
   0.006960306,
   0.0044909683,
   -0.001907013,
   0.02414344,
   -0.04980929,
   0.015840836,
   0.044957083,
   0.03198357,
   0.014761071,
   -0.016197536,
   -0.026853578,
   -0.06504145,
   0.01530239,
   0.019971944,
   -0.10869074,
   0.02205656,
   0.004697007,
   -0.034688674,
   0.017176881,
   -0.030688703,
   0.033590518,
   -0.021021698,
   0.02143417,
   -0.034391712,
   0.07420977,
   -0.056694224,
   0.052868854,
   -0.0020542867,
   0.014704116,
   -0.059394218,
   -0.053758748,
   0.030257113,
   -0.009222653,
   -0.020722078,
   0.05507583,
   -0.024433441,
   -0.03671736,
   -0.046570115,
   0.0017435743,
   0.010551587,
   -0.054158036,
   0.004883849,
   -0.010425051,
   0.0815218,
   0.025318211,
   0.06932546,
   0.04981287,
   0.036201835,
   -0.07837261,
   0.08197217,
   -0.01611879,
   -0.037976284,
   -0.029167624,
   -0.01800956,
   0.03570417,
   -0.030809874,
   -0.029349223,
 

In [21]:
# Add the processed records to our database.
# Analogy: an index is like a collection, and a namespace is like a document.
index = pc.Index('rag')
upsert_response = index.upsert(
    namespace="ns1",
    vectors=processed_data,
)
# Display the response returned by the upsert call.
upsert_response

{'upserted_count': 20}

In [22]:
# Fetch the statistics of the index and display them.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}