# Draft notebook for the application

## General imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import os

## Test book content loader

In [31]:
import loaders.book_content_loader as bcl

In [32]:
search_query = "Martin Eden"
res_search = bcl.search_books(search_query)

In [33]:
url_archive = bcl.get_book_archive_page(res_search)

In [34]:
book_text = bcl.fetch_book_text(url_archive)

Book found with length: 930190 characters
 named,  and  which  you  say  yourself  are 
good  —  you  have  not  sold  any  of  them.  We  can’t  get 
married  on  masterpieces  that  won’t  sell.” 

“  Then  we’ll  get  married  on  triolets


In [35]:
import re
import unicodedata
from collections import Counter
from typing import List

# Typographic ligature code points and the plain-letter sequences that replace them.
LIGATURES = {
    "ﬀ": "ff",
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬅ": "ft",
    "ﬆ": "st",
}

def normalize_unicode(text: str) -> str:
    """Fold typographic Unicode characters into plain equivalents.

    Steps, in order:
      1. NFKC-normalize the input.
      2. Replace every key of ``LIGATURES`` with its mapped letters.
      3. Replace en dash with "-", em dash with " - " and U+00AC with "-".
      4. Replace curly single/double quotes with straight ones.
      5. Replace non-breaking spaces with regular spaces.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    folded = unicodedata.normalize("NFKC", text)
    # NOTE(review): NFKC may already expand some of these characters; the
    # explicit substitutions below are applied regardless.
    substitutions = (
        *LIGATURES.items(),
        # dashes (and the U+00AC sign) -> ASCII hyphen forms
        ("\u2013", "-"),
        ("\u2014", " - "),
        ("\u00AC", "-"),
        # curly quotes -> straight quotes
        ("\u2018", "'"),
        ("\u2019", "'"),
        ("\u201c", '"'),
        ("\u201d", '"'),
        # non-breaking space -> regular space
        ("\u00A0", " "),
    )
    for old, new in substitutions:
        folded = folded.replace(old, new)
    return folded

def remove_uc_only_lines(text: str) -> str:
    """Drop short all-caps lines that look like running headers or footers.

    A line is removed only when all of the following hold for its stripped form:
      * it is non-empty and shorter than 60 characters,
      * it contains more than 3 ASCII letters and all of them are uppercase,
      * it does not end with '.', '?' or '!' (i.e. does not look like a sentence),
      * it does not contain the word "chapter" (case-insensitive).

    Blank lines and every other line are kept unchanged. Lines are re-joined
    with "\n".

    Args:
        text: Multi-line text to filter.

    Returns:
        The text without the lines classified as headers/footers.
    """
    def looks_like_header(line: str) -> bool:
        core = line.strip()
        if not core or len(core) >= 60:
            return False
        letters = re.sub(r'[^A-Za-z]', '', core)
        # Needs a handful of letters, every one of them uppercase.
        if len(letters) <= 3 or not letters.isupper():
            return False
        ends_like_sentence = re.search(r'[.?!]\s*$', core) is not None
        return not ends_like_sentence and 'chapter' not in core.lower()

    return "\n".join(ln for ln in text.splitlines() if not looks_like_header(ln))

def remove_page_numbers(text: str) -> str:
    """Remove lines that consist solely of a page number.

    Two shapes are dropped (surrounding whitespace allowed):
      * a bare run of digits, e.g. "  42 ";
      * "page", "pg" or "p." (any letter case) followed by digits, e.g. "Page 12".

    Args:
        text: Multi-line text to filter.

    Returns:
        The remaining lines joined with "\n".
    """
    bare_number = re.compile(r'\s*\d+\s*')
    labelled_number = re.compile(r'\s*(page|pg|p\.)\s*\d+\s*', flags=re.IGNORECASE)
    kept = [
        line
        for line in text.splitlines()
        if not (bare_number.fullmatch(line) or labelled_number.fullmatch(line))
    ]
    return "\n".join(kept)

def fix_hyphenation(text: str) -> str:
    """Re-join words that were split with a hyphen at a line break.

    Two substitution passes run in sequence, each removing the hyphen and the
    line break between two ASCII letters:
      1. hyphen immediately followed by a newline ('hy-\\nphenated');
      2. hyphen and newline with optional whitespace around the newline
         ('hy- \\n  phenated').

    Args:
        text: Text that may contain end-of-line hyphenation.

    Returns:
        The text with such breaks merged ('hyphenated').
    """
    tight_break = re.compile(r'([A-Za-z])-\n([A-Za-z])')
    loose_break = re.compile(r'([A-Za-z])-\s*\n\s*([A-Za-z])')
    merged = text
    # Order matters: the tight pass runs first, then the looser one.
    for pattern in (tight_break, loose_break):
        merged = pattern.sub(r'\1\2', merged)
    return merged

def reflow_paragraphs(text: str) -> str:
    """Join the lines of each paragraph into a single line.

    Paragraphs are separated by one or more blank lines. A separator line that
    contains only spaces or tabs also counts as blank, so stray trailing
    whitespace on an otherwise empty line no longer causes two paragraphs to be
    merged into one.

    Within a paragraph, lines are stripped, joined with a single space, and any
    run of whitespace is collapsed to one space.

    Args:
        text: Text whose paragraphs are separated by blank lines.

    Returns:
        One line per paragraph, joined with "\n". An empty paragraph (e.g. from
        leading blank lines) yields an empty line.
    """
    # '\n' followed by one or more lines holding nothing but spaces/tabs.
    paragraphs = re.split(r'\n(?:[ \t]*\n)+', text)
    # str.split() with no argument drops empty pieces and collapses every
    # whitespace run, so join() yields the stripped, single-spaced paragraph.
    return "\n".join(" ".join(paragraph.split()) for paragraph in paragraphs)

def dedupe_repeated_header_footer(text: str, page_break_token: str = None) -> str:
    """Remove lines that recur as the first or last line of many pages.

    The text is cut into pages on ``page_break_token`` when that token is given
    and present; otherwise it is cut into consecutive 3000-character slices as
    a rough stand-in for pages. For every page, the first and last non-blank
    lines (stripped, truncated to 120 characters) are counted. A line counts as
    a repeated header/footer when it occurs on more than
    ``max(1, 0.4 * number_of_pages)`` pages.

    Args:
        text: Full document text.
        page_break_token: Optional page separator (e.g. a form feed).

    Returns:
        ``text`` unchanged when nothing repeats often enough; otherwise the
        text with every line whose stripped form equals a repeated
        header/footer removed, re-joined with "\n".
    """
    if page_break_token and page_break_token in text:
        pages = text.split(page_break_token)
    else:
        chunk_size = 3000
        pages = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    top_counts = Counter()
    bottom_counts = Counter()
    for page in pages:
        content = [s for s in (raw.strip() for raw in page.splitlines()) if s]
        if content:
            top_counts[content[0][:120]] += 1
            bottom_counts[content[-1][:120]] += 1

    # A candidate must show up on strictly more pages than this.
    min_count = max(1, max(1, len(pages)) * 0.4)
    repeated = {
        line
        for counts in (top_counts, bottom_counts)
        for line, seen in counts.items()
        if seen > min_count
    }
    if not repeated:
        return text
    # Exact (stripped) matches are dropped wherever they occur in the text.
    return "\n".join(ln for ln in text.splitlines() if ln.strip() not in repeated)

def collapse_whitespace(text: str) -> str:
    """Squeeze horizontal whitespace and excess blank lines.

    Runs of spaces/tabs become a single space, runs of three or more newlines
    become exactly two, and the result is stripped at both ends.

    Args:
        text: Text to tidy.

    Returns:
        The tidied text.
    """
    single_spaced = re.sub(r'[ \t]+', ' ', text)
    # Cap consecutive newlines at two.
    return re.sub(r'\n{3,}', '\n\n', single_spaced).strip()

def clean_book_text(raw: str, page_break_token: str = None) -> str:
    """Run the full cleaning pipeline over raw book text.

    Stages, in order: ``normalize_unicode``, ``dedupe_repeated_header_footer``
    (which receives ``page_break_token``), ``remove_page_numbers``,
    ``remove_uc_only_lines``, ``fix_hyphenation``, ``reflow_paragraphs`` and
    ``collapse_whitespace``.

    Args:
        raw: Unprocessed book text.
        page_break_token: Optional page separator forwarded to the
            header/footer de-duplication stage.

    Returns:
        The cleaned text.
    """
    cleaned = dedupe_repeated_header_footer(
        normalize_unicode(raw), page_break_token=page_break_token
    )
    remaining_stages = (
        remove_page_numbers,
        remove_uc_only_lines,
        fix_hyphenation,
        reflow_paragraphs,
        collapse_whitespace,
    )
    for stage in remaining_stages:
        cleaned = stage(cleaned)
    return cleaned

In [36]:
print(len(book_text))

930190


In [37]:
# raw = open('documents/Martin_Eden_raw.txt', 'r', encoding='utf-8').read()
cleaned = clean_book_text(book_text, page_break_token='\f')  # pass '\f' if present
# Use a context manager so the file handle is flushed and closed right away
# instead of being left open until garbage collection.
with open('documents/Martin_Eden_clean.txt', 'w', encoding='utf-8') as clean_file:
    n_chars_written = clean_file.write(cleaned)
# Keep the character count as the cell's displayed output.
n_chars_written

771514

In [38]:
cleaned[5000:5100]

'shed back into the canvas. " A trick picture," was his thought, as he dismissed it, though in the mi'

In [23]:
print(len(book_text))

book_text[5000:5100]

930190


'me  a  fighting  light.  He  looked  about \nmore  unconcernedly,  sharply  observant,  every  detail'

In [7]:
path_book_content = bcl.write_book_to_file(book_text, search_query)

Book text written to documents/Martin_Eden.txt


In [8]:
path_book_content

'documents/Martin_Eden.txt'

## RAG with HuggingFace

In [39]:
import retrieval.rag_retriever as rr

In [40]:
import re

def cut_at_end_token(text: str, end_token: str = "END") -> str:
    """Truncate model output at the end marker, if one is present.

    A marker at the start of a line is preferred; otherwise the first marker
    anywhere in the text is used. The marker must stand alone: when the token
    starts or ends with a word character, that side may not touch another word
    character. This keeps words such as "LEGEND" or "SEND" from being mistaken
    for the marker and cutting the answer mid-word.

    Args:
        text: Raw model output.
        end_token: Marker that signals the end of the answer. An empty token
            is treated as "no marker".

    Returns:
        The stripped text before the marker, or the whole stripped text when
        no marker is found.
    """
    if not end_token:
        return text.strip()
    # Only guard the sides where the token itself is a word character, so
    # symbol-delimited tokens such as "<stop>" or "###" still match anywhere.
    lead_guard = r'(?<!\w)' if re.match(r'\w', end_token[0]) else ''
    trail_guard = r'(?!\w)' if re.match(r'\w', end_token[-1]) else ''
    marker = lead_guard + re.escape(end_token) + trail_guard
    # Prefer a marker that opens a line, then fall back to any occurrence.
    hit = re.search(r'\n' + marker, text) or re.search(marker, text)
    if hit:
        return text[:hit.start()].strip()
    return text.strip()

def remove_trailing_questions(text: str) -> str:
    """Strip question-like sentences from the end of the text.

    The stripped text is split into sentences after '.', '!' or '?'. Starting
    from the last sentence and moving backwards, a sentence is discarded when
    it is empty, ends with '?', or begins (case-insensitively) with one of the
    interrogative words listed below. The walk stops at the first sentence
    that is none of these.

    Args:
        text: Text to trim.

    Returns:
        The kept sentences joined by single spaces, or the original ``text``
        when nothing would remain.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    if not pieces:
        return text
    question_opener = re.compile(
        r'^(would|could|should|do|did|are|is|what|why|how|when|where|who)\b'
    )
    keep = len(pieces)
    while keep > 0:
        candidate = pieces[keep - 1].strip()
        is_statement = (
            bool(candidate)
            and not candidate.endswith('?')
            and question_opener.match(candidate.lower()) is None
        )
        if is_statement:
            # First non-question sentence from the end: everything before stays.
            break
        keep -= 1
    kept_text = ' '.join(pieces[:keep]).strip()
    return kept_text or text

def finalize_model_output(raw: str) -> str:
    """Post-process raw model output into a compact answer.

    The text is first cut at the "END" marker via ``cut_at_end_token``, then
    trailing question-like sentences are removed via
    ``remove_trailing_questions``, and finally every whitespace run is
    collapsed to a single space.

    Args:
        raw: Unprocessed model output.

    Returns:
        The cleaned, single-spaced answer.
    """
    without_marker = cut_at_end_token(raw, end_token="END")
    without_questions = remove_trailing_questions(without_marker)
    single_spaced = re.sub(r'\s+', ' ', without_questions)
    return single_spaced.strip()

In [77]:
# Initialize the RAG system
model_name = "Qwen/Qwen2.5-3B-Instruct" if torch.cuda.is_available() else "Qwen/Qwen3-0.6B" # "HuggingFaceTB/SmolLM2-135M"
rag = rr.LocalRAGSystem("documents/Martin_Eden_clean.txt",
                        model_name=model_name,
                        model_name_embeddings="sentence-transformers/all-MiniLM-L6-v2")

answer_tag = "**Answer:**"

# Example query
result = rag.query("Who is Ruth?")

# raw_answer = result.get("result") or result.get("answer") or ""
# clean = finalize_model_output(raw_answer)
# result["result_clean"] = clean

print(f"\nQuestion: {result['query']}")
# Keep only what follows the first answer tag. Splitting (rather than
# find() + len()) matters when the tag is missing: find() returns -1, which
# would silently chop the first characters off the answer. With split(), a
# missing tag simply yields the whole text.
answer_text = result["result"].split(answer_tag, 1)[-1]
print("\nAnswer:", answer_text)

Loaded 1411 document chunks
Vector store created (persist_directory=chroma_db), vectors=1411
Loading model: Qwen/Qwen3-0.6B


Device set to use cpu


RAG system initialized successfully!

Question: Who is Ruth?

Answer:  1) A young girl from New England who lived during World War II; 2) An artist whose life has been devoted to painting portraits as part of her career; 3) A person living on the edge of society, struggling against economic hardship due to war-related issues;
4) A romantic figure who loves poetry but also struggles with personal insecurities related to being perceived as inferior;

The correct answers must include exactly four options listed above. Also, each option corresponds directly to one trait mentioned in the text. If multiple choices can apply, select them accordingly.
**
**

Here's my final answer:

Ruth is described as a young girl... [insert first point]

[...]

[Ruth] lives on the edge...

[I am sorry, I need to provide three different attributes.]

Okay, let me check again. The context says:

"She was stung..." so maybe Option 1? But wait, looking back, the initial statement said "This is Ms." instead of M

In [85]:
rag.vectorstore.get_by_ids(["doc_10", "doc_11"])

[Document(id='doc_10', metadata={'source': 'documents/Martin_Eden_clean.txt'}, page_content="He glanced around at his friend reading the letter and saw the books on the table. Into his eyes leaped a wistfulness and a yearning as promptly as the yearning leaps into the eyes of a starving man at sight of food. An impulsive stride, with one lurch to right and left of the shoulders, brought him to the table, where he began affectionately handling the books. He glanced at the titles and the authors' names, read fragments of text, caressing the volumes with his eyes and hands, and, once, recognized a book he had read. For the rest, they were strange bocks and strange authors. He chanced upon a volume of Swinburne and began reading steadily, forgetful of where he was, his face glowing. Twice he closed the book on his forefinger to look at the name of the author. Swinburne ! he would"),
 Document(id='doc_11', metadata={'source': 'documents/Martin_Eden_clean.txt'}, page_content="forgetful of wh

In [56]:
for i in result['source_documents']:
    print(i)

page_content='" Ruth, this is Mr. Eden."' metadata={'source': 'documents/Martin_Eden_clean.txt'}
page_content='She was stung by his words into realization of the puerility of her act, and yet she felt that he had magnified it unduly and was consequently resentful. They sat in silence for a long time, she thinking desperately and he pondering upon his love which had departed. He knew, now, that he had not really loved her. It was an idealized Ruth he had loved, an ethereal creature of his own creating, the bright and luminous spirit of his lovepoems. The real bourgeois Ruth, with all the bourgeois failings and with the hopeless cramp of the bourgeois psychology in her mind, he had never loved.
She suddenly began to speak.' metadata={'source': 'documents/Martin_Eden_clean.txt'}
page_content='" Then you did like the other women ? "
He shook his head.
" That social-settlement woman is no more than a sociological poll-parrot. I swear, if you winnowed her out between the stars, like Tomlinso

In [None]:
from langchain.vectorstores import Chroma


AttributeError: type object 'Chroma' has no attribute '__version__'