In [11]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before any
# cell below reads them (e.g. PINECONE_API_KEY via os.getenv).
load_dotenv()
import os
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec

In [9]:
# Pinecone client; the API key is read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an already-existing index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=768,  # must equal the length of the Gemini embedding vectors stored in it
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [10]:
import json

# Read the reviews file inside a context manager so the file handle is closed
# deterministically; JSON text is decoded explicitly as UTF-8 rather than with
# the platform default encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Physics 101',
  'stars': 4,
  'review': 'Dr. Smith explains concepts clearly, but her exams are quite challenging.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Mathematics 201',
  'stars': 5,
  'review': 'Excellent professor! Makes difficult topics easy to understand.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Chemistry 101',
  'stars': 3,
  'review': 'Dr. Johnson is knowledgeable, but sometimes goes too fast during lectures.'},
 {'professor': 'Dr. Mark Wilson',
  'subject': 'History 202',
  'stars': 2,
  'review': "His lectures are dry, and he doesn't engage the class much."},
 {'professor': 'Dr. Sarah Brown',
  'subject': 'English Literature 150',
  'stars': 5,
  'review': 'Her passion for literature is contagious, and she encourages great class discussions.'},
 {'professor': 'Dr. Robert Davis',
  'subject': 'Biology 101',
  'stars': 4,
  'review': 'Very knowledgeable, but his grading can be a bit harsh.'},
 {'professor': 'D

In [13]:
processed_data = []  # Pinecone-ready records, one per review
# NOTE(review): `client` is not used by the embedding calls in this cell; it is
# kept because other cells may rely on it.
client = genai.GenerativeModel("gemini-1.5-flash")


def to_pinecone_record(entry):
    """Embed one review and return it as a dict with `values`, `id` and `metadata`.

    `entry` is one item of data['reviews'] and must provide the keys
    'review', 'professor', 'subject' and 'stars'.
    """
    embed_result = genai.embed_content(
        model="models/text-embedding-004",
        content=entry['review'],
        task_type="retrieval_document",
        title=f"Embedding of review by {entry['professor']} on {entry['subject']}",
    )
    return {
        "values": embed_result['embedding'],
        # NOTE(review): the professor name doubles as the record id, so two reviews
        # for the same professor would share an id — confirm this is intended.
        "id": entry["professor"],
        "metadata": {
            "review": entry["review"],
            "subject": entry["subject"],
            "stars": entry["stars"],
        },
    }


# Embed the reviews one at a time, keeping the order of data['reviews'].
for entry in data['reviews']:
    processed_data.append(to_pinecone_record(entry))

I0000 00:00:1724129327.719605 3611536 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [14]:
# Inspect the first processed record: its embedding `values`, `id` and `metadata`.
processed_data[0]

{'values': [0.034112513,
  -0.023869386,
  -0.019667348,
  -0.02323828,
  -0.036649585,
  0.025689412,
  0.014374256,
  0.07977184,
  0.013841738,
  0.038079362,
  0.03924921,
  0.026629526,
  0.05551911,
  0.031355888,
  -0.009864938,
  -0.03350354,
  0.046322655,
  -0.015301733,
  -0.08566757,
  0.0038336895,
  -0.010693962,
  -0.029095262,
  -0.011987858,
  -0.022818845,
  -0.010984386,
  0.013846118,
  0.013125802,
  -0.030103395,
  0.0016787248,
  -0.024400707,
  0.06421225,
  0.010635858,
  -0.04202749,
  -0.024414664,
  0.02190823,
  0.05778763,
  0.035973985,
  -0.020531092,
  0.0110587375,
  -0.04255406,
  -0.045646127,
  0.063079044,
  0.0038625633,
  0.030923778,
  -0.041718643,
  -0.0118690105,
  0.019273646,
  0.12509301,
  -0.019782154,
  0.05967212,
  0.03426122,
  0.08600281,
  -0.07228912,
  0.039158504,
  -0.000103677456,
  -0.03757721,
  -0.056671225,
  -0.028839055,
  0.012785369,
  0.028560383,
  -0.026377503,
  0.038443808,
  -0.048432115,
  -0.029977664,
  0.0319

In [15]:
# Open a handle to the "rag" index and write every processed record into
# namespace "ns1"; the call's return value is shown as the cell output.
index = pc.Index("rag")
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [16]:
# Show index statistics (dimension and per-namespace vector counts) to confirm the upsert.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}