In [2]:
import os
import fitz  # PyMuPDF
import docx
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from dotenv import load_dotenv

# ================================
# Step 1: Load environment variables and configure API
# ================================
# Load variables from a .env file (via python-dotenv) before reading the key,
# then register the key with the Gemini client. Fail fast when it is missing
# so later API calls do not fail with a less obvious error.
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")
genai.configure(api_key=api_key)

# ================================
# Step 2: Text Extraction Functions
# ================================
def read_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The text of each page, in page order, joined by newlines.
    """
    # Open the document as a context manager so it (and its underlying file
    # handle) is closed even if text extraction raises on some page.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the Word document, passed to ``docx.Document``.

    Returns:
        str: The ``text`` of every entry in ``document.paragraphs``, joined
        by newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def read_txt(file_path):
    """Return the entire contents of a text file decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        str: The file contents.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def read_document(file_path):
    """Read a document, picking the reader from the file extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.txt`` are handled by ``read_pdf``, ``read_docx`` and ``read_txt``
    respectively.

    Args:
        file_path: Path of the document to read.

    Returns:
        str: The extracted text.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return read_pdf(file_path)
    if suffix == ".docx":
        return read_docx(file_path)
    if suffix == ".txt":
        return read_txt(file_path)
    raise ValueError("Unsupported file type")

# ================================
# Step 3: Chunking Function
# ================================
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks, each with its words joined by single spaces.
        Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would give a non-positive step, which made the
    # original while-loop spin forever.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the last word, any further window would hold
        # only words already present in this chunk, so stop here.
        if end >= len(words):
            break
    return chunks

# ================================
# Step 4: Load & Chunk Document
# ================================
# Use forward slashes (or a raw string) for Windows paths: in a normal string
# literal, "C:\Users" starts a \U unicode escape and is a SyntaxError.
file_path = "C:/Users/aitha/OneDrive/Desktop/Interesting_Topics.pdf"  # Change path here
text = read_document(file_path)
chunks = chunk_text(text)
# A document with no extractable words yields no chunks; stop with a clear
# message instead of failing on chunks[0] below.
if not chunks:
    raise ValueError(f"No text could be extracted from {file_path!r}; nothing to index.")
print(f"✅ Total Chunks: {len(chunks)}")
print(f"🧩 Preview Chunk 1:\n{chunks[0][:300]}...")

# ================================
# Step 5: Generate Embeddings
# ================================
# Embedding model used both to embed the chunks here and, later, the queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embeddings(chunks):
    """Embed a list of text chunks with the module-level ``model``.

    Args:
        chunks: List of strings to embed.

    Returns:
        Whatever ``model.encode`` returns for the list; the code below relies
        on it exposing ``len()`` and a 2-D ``.shape``.
    """
    return model.encode(chunks, show_progress_bar=True)

embeddings = get_embeddings(chunks)
# Print the vector dimension (second axis); the previous message was labelled
# "dimension" but showed the whole shape tuple.
print(f"✅ Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")

# ================================
# Step 6: Build FAISS Index
# ================================
# Convert the embeddings to a float32 matrix, then build an exact
# (brute-force) L2 index sized to the embedding dimension and add every
# chunk vector to it.
embedding_array = np.array(embeddings, dtype="float32")
index = faiss.IndexFlatL2(embedding_array.shape[1])
index.add(embedding_array)
print(f"✅ FAISS index created with {index.ntotal} vectors.")

# ================================
# Step 7: Search Function
# ================================
def search_similar_chunks(query, index, chunks, model, k=5):
    """Return the chunks whose vectors are nearest to the embedded query.

    Args:
        query: Query text to embed.
        index: Object with a ``search(vectors, k)`` method returning
            ``(distances, indices)``, such as a FAISS index. Vector ``i`` in
            the index is assumed to correspond to ``chunks[i]``.
        chunks: Sequence of chunk strings that were indexed.
        model: Object with an ``encode(list_of_str)`` method returning an
            array that supports ``.astype``.
        k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``k`` chunks, in the order the index returned them.
        Empty when there are no chunks or ``k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(k, len(chunks))
    if k <= 0:
        return []

    query_embedding = model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, k)
    # FAISS pads missing results with -1; indexing chunks[-1] would silently
    # return the last chunk, so drop any id outside the valid range.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Example query: retrieve the five nearest chunks and show them, numbered
# from 1 in the order the search returned them.
user_query = "What is SmartDocQ and how does it work?"
top_chunks = search_similar_chunks(user_query, index, chunks, model, k=5)

print("\n=== Top Retrieved Chunks ===")
for rank, retrieved in enumerate(top_chunks, start=1):
    print(f"🔹 Chunk {rank}:\n{retrieved}\n")

# ================================
# Step 8: Ask Gemini with Context
# ================================
def ask_gemini_with_context(query, context_chunks=None, model_name="gemini-2.0-flash"):
    """Ask a Gemini model a question, optionally grounded in context chunks.

    Args:
        query: The question to ask.
        context_chunks: Optional list of strings. When non-empty they are
            joined with newlines and placed in the prompt ahead of the
            question; otherwise the bare query is sent as the prompt.
        model_name: Name passed to ``genai.GenerativeModel``. Defaults to
            "gemini-2.0-flash" (e.g. "gemini-1.5-pro" is an alternative).

    Returns:
        str: ``response.text`` from the model. Any exception raised while
        building the prompt or calling the API is caught and returned as a
        string starting with "An error occurred in Gemini API: " rather than
        raised.
    """
    try:
        # Named gemini_model so it does not shadow the module-level
        # embedding `model`.
        gemini_model = genai.GenerativeModel(model_name=model_name)

        if context_chunks:
            context_text = "\n".join(context_chunks)
            prompt = f"""You are a helpful assistant. Use the following context to answer the question.

Context:
{context_text}

Question:
{query}

Answer:"""
        else:
            prompt = query

        response = gemini_model.generate_content(prompt)
        return response.text

    except Exception as e:
        # Best-effort by design: report the failure as the answer text so the
        # calling cell keeps running.
        return f"An error occurred in Gemini API: {str(e)}"

# Example Q&A
# Answer the example query using the chunks retrieved above as context.
response = ask_gemini_with_context(user_query, context_chunks=top_chunks)
print("\n=== Gemini Response ===", response, sep="\n")


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (2129640420.py, line 62)

In [None]:
# Smoke-test the pipeline: for each sample question, retrieve the three
# nearest chunks and ask Gemini with them as context. Relies on `index`,
# `chunks`, `model` and the helper functions defined in the cell above.
sample_questions = [
    "What is the main topic of this document?",
    # "Can you summarize the key points?",
    # "What are the important concepts mentioned?"
]

print("🧪 Testing with sample questions:\n")
for question in sample_questions:
    print(f"❓ Question: {question}")
    top_chunks = search_similar_chunks(question, index, chunks, model, k=3)
    response = ask_gemini_with_context(question, top_chunks)
    print(f"🤖 Answer: {response}\n")
    print("-" * 50)


In [None]:
# import os
# import fitz  # PyMuPDF
# import docx
# import faiss
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from openai import OpenAI
# from dotenv import load_dotenv

# # Step 1: Load environment variables and configure API
# load_dotenv()
# api_key = os.getenv("OPENAI_API_KEY")
# if not api_key:
#     raise ValueError("OPENAI_API_KEY not found in environment variables.")
# client = OpenAI(api_key=api_key)


# # Step 2: Text Extraction Functions
# def read_pdf(file_path):
#     doc = fitz.open(file_path)
#     return "\n".join([page.get_text() for page in doc])


# def read_docx(file_path):
#     doc = docx.Document(file_path)
#     return "\n".join([para.text for para in doc.paragraphs])


# def read_txt(file_path):
#     with open(file_path, "r", encoding="utf-8") as f:
#         return f.read()


# def read_document(file_path):
#     ext = os.path.splitext(file_path)[1].lower()
#     if ext == ".pdf":
#         return read_pdf(file_path)
#     elif ext == ".docx":
#         return read_docx(file_path)
#     elif ext == ".txt":
#         return read_txt(file_path)
#     else:
#         raise ValueError("Unsupported file type")


# # Step 3: Chunking Function (by words)
# def chunk_text(text, chunk_size=500, overlap=50):
#     words = text.split()
#     chunks = []
#     start = 0
#     while start < len(words):
#         end = min(start + chunk_size, len(words))
#         chunk = " ".join(words[start:end])
#         chunks.append(chunk)
#         start += chunk_size - overlap
#     return chunks


# # Load and chunk document
# file_path = "C:/Users/aitha/OneDrive/Desktop/api_key_test.docx"   # Change path as required
# text = read_document(file_path)
# chunks = chunk_text(text)
# print(f"✅ Total Chunks: {len(chunks)}")
# print(f"🧩 Preview Chunk 1:\n{chunks[0][:300]}...")


# # Step 4: Embeddings using SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L6-v2")


# def get_embeddings(chunks):
#     return model.encode(chunks, show_progress_bar=True)


# embeddings = get_embeddings(chunks)
# print(f"✅ Generated {len(embeddings)} embeddings, dimension: {embeddings.shape}")


# # Step 5: Build FAISS index
# embedding_array = np.array(embeddings).astype("float32")
# index = faiss.IndexFlatL2(embedding_array.shape[1])
# index.add(embedding_array)
# print(f"✅ FAISS index created with {index.ntotal} vectors.")


# # Search function
# def search_similar_chunks(query, index, chunks, model, k=5):
#     query_embedding = model.encode([query]).astype("float32")
#     distances, indices = index.search(query_embedding, k)
#     return [chunks[i] for i in indices[0]]


# user_query = "What is SmartDocQ and how does it work?"
# top_chunks = search_similar_chunks(user_query, index, chunks, model, k=5)
# for i, chunk in enumerate(top_chunks):
#     print(f"🔹 Chunk {i+1}:\n{chunk}\n")


# # Step 6: OpenAI Q&A with context
# def ask_openai_with_context(query, context_chunks=None):
#     try:
#         if context_chunks:
#             context_text = "\n".join(context_chunks)
#             prompt = f"""You are a helpful assistant. Use the following context to answer the question.

# Context:
# {context_text}

# Question:
# {query}

# Answer:"""
#         else:
#             prompt = query

#         response = client.chat.completions.create(
#             model="gpt-4o-mini",  # or "gpt-4o"
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant."},
#                 {"role": "user", "content": prompt},
#             ],
#         )
#         return response.choices[0].message.content

#     except Exception as e:
#         return f"An error occurred in OpenAI API: {str(e)}"


# response = ask_openai_with_context("What is SmartDocQ?", top_chunks)
# print("\n=== OpenAI Response ===")
# print(response)

# # User asks a question:
# user_query = "What is SmartDocQ and how does it work?"  # <-- You can edit this question

# # Find the most relevant chunks for the question
# top_chunks = search_similar_chunks(user_query, index, chunks, model, k=5)

# # Display top matching chunks (optional for inspection)
# for i, chunk in enumerate(top_chunks):
#     print(f"🔹 Chunk {i+1}:\n{chunk}\n")

# # Ask OpenAI API with selected context chunks
# response = ask_openai_with_context(user_query, top_chunks)
# print("\n= OpenAI Response =")
# print(response)
