In [1]:
import fitz
import os
from dotenv import load_dotenv
import numpy as np
import json
from google import generativeai as genai
from vector_store import VectorStore
from sentence_transformers import SentenceTransformer



In [2]:
pdf_path = "pdfs/sustainability_concept_v1.pdf"

In [3]:


# Läs in .env-filen
load_dotenv()

# Hämta API-nyckeln från miljövariabler
api_key = os.getenv("API_KEY")

# Skapa klient
genai.configure(api_key=api_key)


## Extracting Text from a PDF File
To implement RAG, we first need a source of textual data. In this case, we extract text from a PDF file using the fitz library. Once we have the extracted text, we divide it into smaller, sentence chunks to improve retrieval accuracy then creating enbeddings and saving them to vector store.

In [4]:
# Extracting the text, chankin it and create embeddings to save them in vectore store 
def extract_text_from_pdf(pdf_path, max_pages=None):
    mypdf = fitz.open(pdf_path)
    all_text = ""
    total_pages = mypdf.page_count
    pages_to_read = min(max_pages, total_pages) if max_pages is not None else total_pages

    for page_num in range(pages_to_read):
        page = mypdf[page_num]
        text = page.get_text("text")
        all_text += text

    return all_text

    # Load model (CPU mode)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
store = VectorStore()

# Extract and chunk PDF

text = extract_text_from_pdf(pdf_path, max_pages=None)
sentences = [s.strip() for s in text.split(". ") if s.strip()]

# Create data directory if not exists
os.makedirs("data", exist_ok=True)

# Encode with batching and progress bar
batch_size = 8
print(f"Encoding {len(sentences)} sentences (CPU, batch size = {batch_size})...")

embeddings = embedding_model.encode(
    sentences,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True
)

# Store embeddings + metadata
for i, (sentence, embedding) in enumerate(zip(sentences, embeddings)):
    metadata = {"sentence_index": i}
    store.add_item(sentence, embedding, metadata)

# Save vector store
store.save("data/embeddings.parquet")
print(f"✅ Stored {len(sentences)} embeddings to 'data/embeddings.parquet'.")


Encoding 323 sentences (CPU, batch size = 8)...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

✅ Stored 323 embeddings to 'data/embeddings.parquet'.


In [6]:
import textwrap

# Load vector store
store = VectorStore()
store.load("data/embeddings.parquet")

# Set up Gemini model and chat session
model = genai.GenerativeModel("gemini-1.5-flash")
chat = model.start_chat()

# System prompt
system_prompt = """I will ask you a question, and I want you to respond 
based only on the context I provide — do not use any other information.
Do not write any poems or songs as responce.
If there isn't enough information in the context to answer the question,
say "I don't know." Do not try to guess.
Respond clearly and structure your answer into well-organized paragraphs."""

# Chat loop
if __name__ == "__main__":
    print("*** Gemini RAG Chat ***")
    print("Type <q> to exit chat.")

    while True:
        user_query = input("\nUser: ")
        if user_query.strip().lower() == "q":
            break

        # Embed query and retrieve top-k context
        query_embedding = embedding_model.encode(user_query)
        results = store.semantic_search(query_embedding, k=5)
        context = "\n".join([item["text"] for item in results])

        # Build final prompt
        full_prompt = f"{system_prompt}\n\nThe question is:\n{user_query}\n\nHere is the context:\n{context}"

        # Get response from Gemini
        response = chat.send_message(full_prompt)
        wrapped_response = textwrap.fill(response.text, width=100)
        
        print(f"\nQuestion: {user_query}")
        print("\nGemini:")
        print(wrapped_response)


*** Gemini RAG Chat ***
Type <q> to exit chat.

Question: why we think about sustainability?

Gemini:
We think about sustainability because technologies exist that could better satisfy societal demands
using alternative resources.  Our societal response to dwindling natural resources depends on our
current knowledge.  Past extraction decisions, along with predictions of future costs and prices,
influence current choices.  Additionally, pressure from strained natural resources and environmental
laws are driving society, and manufacturers in particular, to adopt environmentally friendly
solutions and transition to more sustainable production methods without sacrificing competitiveness.
Finally, global initiatives like the Sustainable Development Goals emphasize more sustainable
resource extraction, and research focuses on models like the circular economy that minimize resource
extraction and prioritize reuse, recycling, and sustainably managed renewable resources.
