# ****Install Dependencies****

Uninstalls conflicting packages and installs required versions of google-genai, chromadb, and protobuf to ensure compatibility.

In [1]:
# Uninstall unused conflicting packages and install required versions
!pip uninstall -qqy jupyterlab kfp  
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"
!pip install protobuf==4.23.4 --quiet  # ✅ Compatibility fix


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4.9 MB/s[0

# ****Import Generative AI Modules****
Imports necessary classes and functions from the Google Generative AI SDK to interact with the Gemini model.

In [2]:
# Step 2: Import all necessary modules
from google import genai
from google.genai import types

from IPython.display import Markdown

genai.__version__



'1.7.0'

# ****Import Vector Store Tools****
Loads ChromaDB for handling document embeddings and vector database operations, along with display and OS utilities.

In [3]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

from IPython.display import display
import os


 # ****Install Google Generative AI****
Installs the google-generativeai package to enable interaction with Gemini models.

In [4]:
!pip install google-generativeai

Collecting protobuf (from google-generativeai)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.23.4
    Uninstalling protobuf-4.23.4:
      Successfully uninstalled protobuf-4.23.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.32.1 requires protobuf<6.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
tensorflow-metadata 1.16.1 requires protobuf<6.0.0dev,>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.
google-spark-connect 0.5.2 requires google-api-core>

****Load API Key Securely****

In [5]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")



# ****Install PDF Processing Library****
Installs PyMuPDF to extract text content from PDF files.

In [6]:
!pip install -q PyMuPDF


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# ****Extract Text from PDF****
Defines a function to extract full text from a PDF file using PyMuPDF and prints a preview.

In [7]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Example usage:
pdf_path = "/kaggle/input/gen-ai-notes/GENERATIVE_AI_WITH_LARGE_LANGUAGE_MODELS_1712321353.pdf"  # update with your actual file
full_text = extract_text_from_pdf(pdf_path)
print(full_text[:4000])  # Show a preview


Generative AI with Large
Language Models.
Course Notes : July, 2023
Generative AI, and LLMs specifically, is a General Purpose Technology that is useful for a variety of
applications. 
"LLMs can be, generally, thought of as a next word prediction model"
What is an LLM?
LLMs are machine learning models that have learned from massive datasets of human-generated
content, finding statistical patterns to replicate human-like abilities.
Foundation models, also known as base models, have been trained on trillions of words for weeks or
months using extensive compute power. These models have billions of parameters, which represent
their memory and enable sophisticated tasks.
Interacting with LLMs differs from traditional programming paradigms. Instead of formalized code
syntax, you provide natural language prompts to the models.
What is an LLM?
Page 1
What are the Use Cases for application of LLMs?
Page 2
What are Transformers? How was text generation done before Transformers? Transformer Archi

# ****Split Text into Chunks****
Breaks the extracted text into smaller chunks (~500 tokens) to make it suitable for embedding.

In [8]:
def split_text(text, max_tokens=500):
    # Rough split by sentence or paragraph
    import re
    sentences = re.split(r'\n|\.\s+', text)
    
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_tokens:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

text_chunks = split_text(full_text)
print(f"Total Chunks: {len(text_chunks)}")
print("Sample chunk:\n", text_chunks[0])


Total Chunks: 101
Sample chunk:
 Generative AI with Large. Language Models. Course Notes : July, 2023. Generative AI, and LLMs specifically, is a General Purpose Technology that is useful for a variety of. applications. "LLMs can be, generally, thought of as a next word prediction model". What is an LLM?. LLMs are machine learning models that have learned from massive datasets of human-generated. content, finding statistical patterns to replicate human-like abilities.


# ****Configure Gemini API****
Sets the API key to authenticate and enable the use of Gemini’s embedding and generation models.

In [9]:
from google.generativeai import configure
from google.generativeai import embed_content

# Already done by you, but ensure this is here
configure(api_key=GOOGLE_API_KEY)


 # ****Generate Embeddings****
Embeds each chunk of text using Gemini’s embedding model for storage and similarity search.

In [10]:
from google.generativeai import embed_content

# ✅ Use the newer model
EMBED_MODEL = "models/text-embedding-004"

def generate_embeddings(chunks):
    embeddings = []
    for i, chunk in enumerate(chunks):
        try:
            response = embed_content(
                model=EMBED_MODEL,
                content=chunk,
                task_type="retrieval_document",  # or "retrieval_query"
            )
            embeddings.append(response["embedding"])
        except Exception as e:
            print(f"❌ Error embedding chunk {i}: {e}")
            embeddings.append(None)
    return embeddings

# Assuming full_text is already loaded from your PDF extraction
text_chunks = split_text(full_text)  # Generate the chunks

# Call the function to generate embeddings
embeddings = generate_embeddings(text_chunks)

# Print the number of valid embeddings
valid_embeddings = [e for e in embeddings if e is not None]
print(f"✅ Total embeddings generated: {len(valid_embeddings)}")


✅ Total embeddings generated: 101


# ****Store Embeddings in ChromaD****
Initializes ChromaDB client and stores the text chunks with their embeddings into a named collection.

In [11]:
import chromadb

# Step 1: Initialize ChromaDB client
chroma_client = chromadb.Client()

# Step 2: Create a new collection (or get if it already exists)
collection = chroma_client.create_collection(name="study_notes2")

# Step 3: Prepare data for insertion
ids = [f"chunk-{i}" for i in range(len(text_chunks))]

collection.add(
    documents=text_chunks,     # Original chunks
    embeddings=embeddings,     # Generated embeddings
    ids=ids                    # Unique IDs
)

print(f"✅ Stored {len(text_chunks)} chunks in ChromaDB collection.")


✅ Stored 101 chunks in ChromaDB collection.


# ****List Available Gemini Models****
Prints the list of available Gemini models that support content generation.

In [12]:
from google.generativeai import list_models

for model in list_models():
    if "generateContent" in model.supported_generation_methods:
        print(f"✅ {model.name} supports content generation")


✅ models/gemini-1.0-pro-vision-latest supports content generation
✅ models/gemini-pro-vision supports content generation
✅ models/gemini-1.5-pro-latest supports content generation
✅ models/gemini-1.5-pro-001 supports content generation
✅ models/gemini-1.5-pro-002 supports content generation
✅ models/gemini-1.5-pro supports content generation
✅ models/gemini-1.5-flash-latest supports content generation
✅ models/gemini-1.5-flash-001 supports content generation
✅ models/gemini-1.5-flash-001-tuning supports content generation
✅ models/gemini-1.5-flash supports content generation
✅ models/gemini-1.5-flash-002 supports content generation
✅ models/gemini-1.5-flash-8b supports content generation
✅ models/gemini-1.5-flash-8b-001 supports content generation
✅ models/gemini-1.5-flash-8b-latest supports content generation
✅ models/gemini-1.5-flash-8b-exp-0827 supports content generation
✅ models/gemini-1.5-flash-8b-exp-0924 supports content generation
✅ models/gemini-2.5-pro-exp-03-25 supports con

# ****Ask Questions Using Gemini Pro****
Sends a context + question to the Gemini model and generates a relevant, concise answer.

In [13]:

import google.generativeai as genai


# 🔑 Fetch the API key securely

GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")


# ✅ Configure Gemini with your actual key
genai.configure(api_key=GOOGLE_API_KEY)

# Load the model
model = genai.GenerativeModel("gemini-1.5-pro-latest")

# Sample input
question = "What is an LLM?"
context = """Generative AI, and LLMs specifically, is a General Purpose Technology..."""

# Generate response
response = model.generate_content(
    f"""Answer the following question based on the context below:

Context:
{context}

Question:
{question}

Answer in a short and clear way."""
)

# Output
print("📘 Answer:", response.text.strip())


📘 Answer: LLM stands for Large Language Model.


Blog 
https://medium.com/@guravsiddhi683/studybuddy-a-rag-powered-ai-study-assistant-using-gemini-and-chromadb-071c170a0430[](http://)]
