In [1]:
from pypdf import PdfReader

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Passed unchanged to ``PdfReader``.

    Returns:
        The extracted text of each page, each followed by a newline
        character, concatenated in page order. A page whose extraction
        yields nothing contributes only its newline instead of raising.
    """
    reader = PdfReader(file_path)
    # `or ""` keeps a page with no extractable text from breaking the
    # concatenation; join() avoids rebuilding the string once per page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_text(text)`` call.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, instantiated when this cell runs.
# embed_chunks() below reads this module-level name rather than taking it as an argument.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks(chunks):
    """Encode ``chunks`` with the module-level ``embedder``.

    Args:
        chunks: The texts to embed; handed to ``embedder.encode`` as-is.

    Returns:
        The value returned by ``embedder.encode`` when called with
        ``convert_to_tensor=False``.
    """
    vectors = embedder.encode(chunks, convert_to_tensor=False)
    return vectors


  from .autonotebook import tqdm as notebook_tqdm





In [4]:
import faiss
import numpy as np

def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding every embedding vector.

    Args:
        embeddings: A non-empty collection of equal-length vectors that
            ``np.ascontiguousarray`` can turn into a 2-D array.

    Returns:
        A ``faiss.IndexFlatL2`` to which all rows of ``embeddings`` were added.

    Raises:
        ValueError: If ``embeddings`` is empty or does not form a 2-D matrix.
    """
    # FAISS operates on C-contiguous float32 matrices, so normalise the
    # input once instead of relying on whatever dtype the caller produced.
    matrix = np.ascontiguousarray(embeddings, dtype="float32")
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors, "
            f"got shape {matrix.shape}"
        )
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index


In [5]:
def retrieve_top_chunks(query, chunks, index, embedder, top_k=5):
    """Return the chunks the index reports as nearest to ``query``.

    Args:
        query: Text to embed and search for.
        chunks: Sequence indexed by the ids that ``index.search`` returns.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair, e.g. a FAISS index.
        embedder: Object exposing ``encode(list_of_texts)``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunks, in the order the index ranks them.
    """
    query_vec = np.asarray(embedder.encode([query])[0], dtype="float32")
    _distances, ids = index.search(query_vec[np.newaxis, :], top_k)
    # FAISS fills the id row with -1 when it holds fewer than top_k vectors.
    # Indexing chunks with -1 would silently return the last chunk, so drop
    # those placeholder ids.
    return [chunks[int(i)] for i in ids[0] if i >= 0]


In [9]:
import google.generativeai as genai

import os

# The API key is read from the environment so it never lives in the notebook.
# NOTE(review): a key string was previously pasted in a comment at this spot;
# treat that key as compromised and revoke/rotate it.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-2.5-flash")

def generate_answer_with_gemini(query, context):
    """Ask the module-level Gemini ``model`` to perform ``query`` over ``context``.

    Args:
        query: The question or task; inserted after ``Task:`` in the prompt.
        context: Document text; inserted under ``Context:`` in the prompt.

    Returns:
        The ``text`` attribute of the object returned by
        ``model.generate_content``.
    """
    gemini_prompt = f"""
You are a smart assistant. Based on the following document context, answer the question or generate a summary.

Context:
{context}

Task: {query}
"""
    return model.generate_content(gemini_prompt).text


In [10]:
# End-to-end run: extract -> chunk -> embed -> index -> retrieve -> generate.
# Every name assigned here is a notebook-level global.

# Load PDF (the path is relative to the kernel's working directory)
text = extract_text_from_pdf("Chapter+5+-+Join.pdf")

# Chunking with chunk_text's default chunk_size and overlap
chunks = chunk_text(text)

# Embeddings: one vector per chunk, produced by the module-level embedder
embeddings = embed_chunks(chunks)

# Indexing: flat L2 FAISS index over the chunk embeddings
index = create_faiss_index(embeddings)

# The task for the model; the same string is also used as the retrieval query below
query = "Summarize the document in bullet points"

# Retrieve the chunks nearest to the query (top_k left at its default)
top_chunks = retrieve_top_chunks(query, chunks, index, embedder)

# Generate answer: only the retrieved chunks are sent as context, not the full text
context = "\n\n".join(top_chunks)
output = generate_answer_with_gemini(query, context)

print(output)


Here's a summary of the document in bullet points:

*   **Steps to Create a College ER Diagram:**
    *   Change the name of the schema.
    *   Use the "new table tool" to add tables to the drawing area.
    *   Click on each table to rename it and add columns (e.g., `Students` table and its columns).
    *   Add relationships between the tables.
    *   The document shows a "Complete ER diagram."
*   **Converting College ER Diagram to Schema:**
    *   Select "Forward Engineer" from the "Database" tab.
*   **College Database Entities (Tables) and Attributes:**
    *   `Students`: St_Id, St_name, Phone, Nat_id, Dept_Id
    *   `Instructor`: Inst_Id, Inst_name, Phone, Nat_id
    *   `Courses`: Crs_id, Crs_name, Crs_description, Inst_Id
    *   `Departments`: Dept_Id, Dept_name, Dept_Location
    *   `NationalId`: Nat_Id
    *   `Stud_Courses`
*   **Chapter 5: Join (Database Concepts):**
    *   **Chapter Content:** Join, Aggregate Functions, Grouping, Order by.
    *   **Join:** Used t

In [13]:
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from pypdf import PdfReader

# Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Read PDF
def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF, separated by newlines.

    Args:
        file_path: Passed unchanged to ``PdfReader``.

    Returns:
        The per-page extracted text joined with a newline character; a page
        whose extraction is falsy contributes an empty string.
    """
    page_texts = []
    for page in PdfReader(file_path).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)

def extract_more_than_one_pdf(pdf_paths):
    """Concatenate the text of several PDFs, labelling each with its path.

    Args:
        pdf_paths: Iterable of paths, each handed to ``extract_text_from_pdf``.

    Returns:
        One string containing, for every PDF that produced non-empty text,
        a ``### Content from: <path> ###`` header followed by that text.
        PDFs with no extracted text are skipped.
    """
    sections = []
    for path in pdf_paths:
        pdf_text = extract_text_from_pdf(path)
        if not pdf_text:
            continue
        sections.append(f"\n\n### Content from: {path} ###\n\n{pdf_text}\n")
    return "".join(sections)

# Chunking
def split_text(text, chunk_size=500, overlap=100):
    """Cut ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of ``text``; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a zero step makes ``range`` fail with an unrelated
            message, a negative step silently yields no chunks, and a
            negative overlap silently skips text between chunks.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "split_text requires chunk_size > 0 and 0 <= overlap < chunk_size, "
            f"got chunk_size={chunk_size}, overlap={overlap}"
        )
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Main function with relevant chunk selection
def ask_pdf_question(pdf_path, question, top_k=5, api_key=None):
    """Answer ``question`` from one or more PDFs using retrieval plus Gemini.

    The PDF text is split with ``split_text``, every chunk and the question
    are embedded with the module-level ``embedder``, and the ``top_k`` chunks
    with the highest cosine similarity to the question are sent to the model
    as context.

    Args:
        pdf_path: A single path, or a list of paths (handled by
            ``extract_more_than_one_pdf``).
        question: The question to answer.
        top_k: Maximum number of chunks to include in the prompt.
        api_key: Gemini API key. When omitted, the ``GOOGLE_API_KEY``
            environment variable is used.

    Returns:
        The ``text`` attribute of the model's response.

    Raises:
        RuntimeError: If no API key is supplied or found in the environment.
        ValueError: If no text could be extracted from the PDF(s).
    """
    # Local import: this cell's import list does not bring in os.
    import os

    # Never keep the key in the notebook; accept it explicitly or from the environment.
    resolved_key = api_key or os.environ.get("GOOGLE_API_KEY")
    if not resolved_key:
        raise RuntimeError(
            "No Gemini API key: pass api_key=... or set the GOOGLE_API_KEY environment variable."
        )
    genai.configure(api_key=resolved_key)
    model = genai.GenerativeModel("gemini-2.5-flash")

    if isinstance(pdf_path, list):
        pdf_text = extract_more_than_one_pdf(pdf_path)
    else:
        pdf_text = extract_text_from_pdf(pdf_path)

    chunks = split_text(pdf_text)
    if not chunks:
        # Without this guard the similarity step fails on an empty embedding
        # batch with an error that does not mention the real cause.
        raise ValueError(f"No text could be extracted from: {pdf_path}")

    # Compute relevance
    chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
    question_embedding = embedder.encode(question, convert_to_tensor=True)

    cosine_scores = util.cos_sim(question_embedding, chunk_embeddings)[0]
    # Convert to plain ints before indexing the Python list of chunks.
    top_indices = cosine_scores.argsort(descending=True)[:top_k].tolist()
    top_chunks = [chunks[i] for i in top_indices]

    # Build prompt
    context = "\n\n".join(top_chunks)
    prompt = f"""You are an intelligent assistant. Answer the question strictly based on the document context provided below.

Do not use any external knowledge.

If the answer is not in the context, respond with "The answer is not available in the provided document."

Be concise and clear.

Document Context:
{context}

Question:
{question}

"""

    response = model.generate_content(prompt)
    return response.text


In [16]:
# Example call: a single PDF path (not a list) with top_k left at its default.
answer = ask_pdf_question("Chapter+5+-+Join.pdf","what are Joins ?")
print(answer)

Join is used in case of retrieving information from more than one table.
