In [1]:
# -------------------------------
# PDF to RAG-ready embeddings
# -------------------------------

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# 1️⃣ Load PDF and extract text
pdf_path = r"C:\Users\user\Downloads\dokumen.pub_complete-cat-care-manual-the-essential-practical-guide-to-all-aspects-of-caring-for-your-cat-illustrated-0756617421-9780756617424.pdf"  # Replace with your PDF
doc = fitz.open(pdf_path)

full_text = ""
for page in doc:
    full_text += page.get_text()

print(f"Extracted {len(full_text)} characters from PDF.")

# 2️⃣ Split text into chunks (to fit LLM input limits)
def chunk_text(text, max_words=200):
    sentences = text.split(". ")
    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk.split()) + len(sentence.split()) <= max_words:
            chunk += sentence + ". "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks

chunks = chunk_text(full_text)
print(f"Total chunks: {len(chunks)}")

# 3️⃣ Generate embeddings for each chunk
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(chunks)
embeddings = np.array(embeddings).astype("float32")
print(f"Embeddings shape: {embeddings.shape}")

# 4️⃣ Store embeddings in FAISS vector database
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(embeddings)
print(f"FAISS index contains {index.ntotal} vectors.")

# 5️⃣ Example: retrieve top 3 relevant chunks for a query
query = "What feed is best for cat with low milk production?"
query_emb = embed_model.encode([query]).astype("float32")

k = 3  # top 3
distances, indices = index.search(query_emb, k)

print("\nTop chunks relevant to query:\n")
for i, idx in enumerate(indices[0]):
    print(f"Chunk {i+1}:")
    print(chunks[idx][:500], "...")  # show first 500 chars
    print("-" * 80)


Extracted 308926 characters from PDF.
Total chunks: 269




Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embeddings shape: (269, 384)
FAISS index contains 269 vectors.

Top chunks relevant to query:

Chunk 1:
You will 
not find a fussy eater in a 
household where there is 
more than one cat. 
'' vA. 
Overfeeding 
Feed your cat two to three 
small meals a day, following the 
manufacturer’s recommendations. Do 
not give many snacks between meals. 
Do not 
give a cat 
too many 
scraps 
WATER AND MILK 
A cat gets most of the moisture that 
it requires from its food, and many 
felines seem to drink little. However, 
you should make sure that fresh water 
is available at all times. If a cat is fed 
dry foo ...
--------------------------------------------------------------------------------
Chunk 2:
Sometimes a cat supplements 
its diet by catching and eating 
small prey animals, but this 
does not mean that it is hungry 
or that you can prevent it from 
hunting by feeding it more. A 
cat hunts through instinct, and 
even a well-fed pet may catch 
mice given the opportunity. 
Protein is essentia

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Combine top chunks into a single prompt
context = " ".join([chunks[idx] for idx in indices[0]])
prompt = f"summarize or answer the question based on context: {context} Question: {query}"

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
output_ids = model.generate(inputs["input_ids"], max_length=150)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nAnswer:\n", answer)


Loading weights:   0%|          | 0/190 [00:00<?, ?it/s]




Answer:
 57 FEEDING EQUIPMENT


In [1]:
# ====== PDF → Embeddings + Chunks.txt (EXTREME low-memory version - FIXED) ======
# For ~1–4 GB RAM environments

import os
import re
import numpy as np
import fitz  # PyMuPDF - pip install pymupdf
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # pip install tqdm (optional but recommended)

def clean_text(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r'\n{2,}', '\n\n', text)
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+( of \d+)?', '', text, flags=re.I)
    text = re.sub(r'^\s*[\dIVXLCDM]+\s*$', '', text, flags=re.M)
    text = text.replace('•', ' - ').replace('', ' - ')
    return text.strip()


def chunk_generator(text: str, max_chars: int = 500, overlap: int = 80):
    """
    Safe, forward-only chunking. No infinite loop risk.
    Skips tiny fragments from pdfplumber-style extraction noise.
    """
    if not text:
        return
    n = len(text)
    start = 0
    while start < n:
        end = min(start + max_chars, n)
        chunk = text[start:end].strip()
        if len(chunk) >= 60:  # skip very short junk
            yield chunk
        # Always advance by fixed amount
        start += max_chars - overlap
        if (max_chars - overlap) <= 0:
            break


# ────────────────────────────────────────────────
# MAIN PIPELINE
# ────────────────────────────────────────────────

pdf_path = r"C:\Users\user\Downloads\dokumen.pub_complete-cat-care-manual-the-essential-practical-guide-to-all-aspects-of-caring-for-your-cat-illustrated-0756617421-9780756617424.pdf"

if not os.path.isfile(pdf_path):
    print(f"File not found: {pdf_path}")
    exit(1)

# 1. Extract text (using faster PyMuPDF)
print("Extracting text...")
full_text = ""
try:
    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc, 1):
        txt = page.get_text("text")
        if txt.strip():
            full_text += txt + "\n"
        else:
            print(f"Warning: page {page_num} no meaningful text")
    doc.close()
except Exception as e:
    print(f"PDF error: {e}")
    exit(1)

print(f"Extracted {len(full_text):,} characters")

cleaned_text = clean_text(full_text)
print(f"Cleaned: {len(cleaned_text):,} characters")

# 2. Quick sanity check: how many chunks will we actually get?
print("\nCounting chunks (sanity check)...")
chunk_count = sum(1 for _ in chunk_generator(cleaned_text))
print(f"Expected total chunks: {chunk_count}  ← should be ~400–800 for this book")

if chunk_count > 2000:
    print("WARNING: Chunk count too high → something still wrong with text. Aborting.")
    exit(1)

# 3. Load embedding model
print("\nLoading model...")
try:
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
except Exception as e:
    print(f"Model load failed: {e}")
    print("Try: pip install --upgrade sentence-transformers torch transformers")
    exit(1)

# 4. Prepare output
save_dir = "cat_care_manual_models"
os.makedirs(save_dir, exist_ok=True)

embed_path = os.path.join(save_dir, "embeddings.npy")
chunks_path = os.path.join(save_dir, "chunks.txt")

print("\nStarting low-memory encoding + streaming save...")

all_emb_parts = []
batch = []
BATCH_SIZE = 64           # Increased — safe with low chunk count
chunk_idx = 0

pbar = tqdm(desc="Encoding chunks", unit="chunk", total=chunk_count)

with open(chunks_path, "w", encoding="utf-8") as txt_file:
    for chunk in chunk_generator(cleaned_text, max_chars=500, overlap=80):
        chunk_idx += 1
        batch.append(chunk)

        # Stream write chunk
        txt_file.write(f"─ Chunk {chunk_idx} ({len(chunk)} chars) ─{'─'*40}\n")
        txt_file.write(chunk + "\n\n")

        if len(batch) >= BATCH_SIZE:
            try:
                batch_emb = embed_model.encode(
                    batch,
                    batch_size=len(batch),
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    show_progress_bar=False
                )
                all_emb_parts.append(batch_emb)
                print(f"Processed {chunk_idx} chunks")
            except MemoryError:
                print("OOM → reduce BATCH_SIZE to 32 and retry")
                exit(1)
            batch = []  # free memory
            pbar.update(len(batch))  # wait — actually update after append

    # Final partial batch
    if batch:
        batch_emb = embed_model.encode(
            batch,
            batch_size=len(batch),
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        all_emb_parts.append(batch_emb)
        print(f"Processed final partial batch ({chunk_idx} total chunks)")
        pbar.update(len(batch))

pbar.close()

print(f"\nTotal chunks written: {chunk_idx}")

# 5. Save embeddings
if all_emb_parts:
    embeddings = np.vstack(all_emb_parts)
    np.save(embed_path, embeddings)
    print(f"Embeddings saved: {embed_path}  shape = {embeddings.shape}")
    print(f"Size on disk ≈ {embeddings.nbytes / 1e6:.1f} MB")
else:
    print("No embeddings created (empty text?)")
    embeddings = np.array([])

# 6. Info file
info_path = os.path.join(save_dir, "info.txt")
with open(info_path, "w", encoding="utf-8") as f:
    f.write(f"PDF: {os.path.basename(pdf_path)}\n")
    f.write(f"Chunks: {chunk_idx}\n")
    f.write(f"Embedding dim: {embeddings.shape[1] if embeddings.size > 0 else 'N/A'}\n")
    f.write("Model: all-MiniLM-L6-v2\n")
    f.write("Chunk size: ~500 chars + 80 overlap\n")
    f.write("Extraction: PyMuPDF\n")

print(f"\nDone! Files saved in:\n{os.path.abspath(save_dir)}")
print("   • embeddings.npy")
print("   • chunks.txt")
print("   • info.txt")

Extracting text...
Extracted 309,116 characters
Cleaned: 296,856 characters

Counting chunks (sanity check)...
Expected total chunks: 707  ← should be ~400–800 for this book

Loading model...




Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



Starting low-memory encoding + streaming save...


Encoding chunks:   0%|          | 0/707 [00:02<?, ?chunk/s]

Processed 64 chunks


Encoding chunks:   0%|          | 0/707 [00:04<?, ?chunk/s]

Processed 128 chunks


Encoding chunks:   0%|          | 0/707 [00:07<?, ?chunk/s]

Processed 192 chunks


Encoding chunks:   0%|          | 0/707 [00:09<?, ?chunk/s]

Processed 256 chunks


Encoding chunks:   0%|          | 0/707 [00:12<?, ?chunk/s]

Processed 320 chunks


Encoding chunks:   0%|          | 0/707 [00:15<?, ?chunk/s]

Processed 384 chunks


Encoding chunks:   0%|          | 0/707 [00:18<?, ?chunk/s]

Processed 448 chunks


Encoding chunks:   0%|          | 0/707 [00:20<?, ?chunk/s]

Processed 512 chunks


Encoding chunks:   0%|          | 0/707 [00:22<?, ?chunk/s]

Processed 576 chunks


Encoding chunks:   0%|          | 0/707 [00:25<?, ?chunk/s]

Processed 640 chunks


Encoding chunks:   0%|          | 3/707 [00:29<1:54:39,  9.77s/chunk]

Processed 704 chunks
Processed final partial batch (707 total chunks)

Total chunks written: 707
Embeddings saved: cat_care_manual_models\embeddings.npy  shape = (707, 384)
Size on disk ≈ 1.1 MB

Done! Files saved in:
c:\Users\user\ML_Projects\RAG_BASED_LLM\cat_care_manual_models
   • embeddings.npy
   • chunks.txt
   • info.txt



