In [5]:
# %pip install python-dotenv
# %pip install openai
# %pip install "pinecone-client[grpc]"
# %pip install -q -U google-generativeai

In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [12]:
# Pinecone client handle; the API key is looked up in the PINECONE_KEY
# environment variable (load_dotenv() is called in the imports cell).
pc = Pinecone(
    api_key=os.environ.get("PINECONE_KEY"),
)

In [13]:
# Create the serverless "rag" index only when it does not exist yet, so this
# cell can be re-run (Restart Kernel -> Run All) without failing on an
# "index already exists" error.
index_name = "rag"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the embedding vectors upserted later
        # (the recorded index stats report dimension 768).
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [14]:
import json

# Load the dataset from the working directory. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection,
# and the explicit encoding avoids depending on the platform's locale default.
with open("linear_equations_dataset.json", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)
# display(data["reviews"])

[{'professor': 'Dr. Alice Johnson',
  'subject': 'Calculus',
  'stars': 5,
  'review': 'Dr. Johnson explains concepts very clearly and makes complex topics easy to understand.'},
 {'professor': 'Dr. Michael Smith',
  'subject': 'Linear Algebra',
  'stars': 4,
  'review': 'Great professor, but sometimes moves too fast through the material.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Statistics',
  'stars': 3,
  'review': 'Lectures are okay, but the exams are much harder than the examples in class.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Differential Equations',
  'stars': 2,
  'review': 'Dr. Brown is knowledgeable, but his teaching style is hard to follow.'},
 {'professor': 'Dr. Linda White',
  'subject': 'Discrete Mathematics',
  'stars': 5,
  'review': 'Excellent professor! Makes the class very engaging and fun.'},
 {'professor': 'Dr. William Taylor',
  'subject': 'Abstract Algebra',
  'stars': 3,
  'review': 'The course content is challenging, but Dr. Taylor is hel

In [15]:
import google.generativeai as genai

# Register the Gemini API key with the SDK. Indexing os.environ directly
# raises KeyError when GEMINI_API_KEY is not set.
genai.configure(
    api_key=os.environ["GEMINI_API_KEY"],
)

In [19]:
# Build one Pinecone record per dataset question.
#
# Embeddings capture the semantic meaning of a text in numerical form, so
# texts with related meanings end up with nearby vectors.
process_data = []

for question in data["questions"]:
    # Embed the question text itself. Every field below is read from the
    # current loop item (`question`), not from some other global.
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=question["question"],
    )

    embedding = response["embedding"]

    process_data.append({
        "values": embedding,
        "id": question["id"],
        "metadata": {
            "question": question["question"],
            "solution": question["solution"],
            "topic": question["topic"],
            "difficulty": question["difficulty"],
            # NOTE(review): the metadata key is "steps" but it is filled from
            # the record's "tags" field -- confirm this naming is intended.
            "steps": question["tags"],
        },
    })


In [22]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the call's return value is shown as the cell output.
index = pc.Index("rag")
index.upsert(vectors=process_data, namespace="ns1")

{'upserted_count': 20}

In [23]:
# Display the index statistics as the cell output; the recorded output lists
# the dimension, index fullness and per-namespace vector counts.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}