# Draft notebook for the application: book loading, text cleaning and RAG retrieval

## General imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import os

## Test book content loader

In [3]:
import loaders.book_content_loader as bcl

In [4]:
# Title to look up; the same string is reused later when the raw text is written to disk.
search_query = "Martin Eden"
# Query the book loader module; the result is passed to get_book_archive_page in the next cell.
res_search = bcl.search_books(search_query)

In [5]:
# Resolve the search result into the value that fetch_book_text consumes
# (presumably an archive page URL, going by the variable name — confirm in book_content_loader).
url_archive = bcl.get_book_archive_page(res_search)

In [6]:
# Fetch the full book text; the recorded output reports 930190 characters and a short excerpt.
book_text = bcl.fetch_book_text(url_archive)

Book found with length: 930190 characters
 usually  a  pot  of  them,  cooked  and  ready 
at  hand,  for  they  took  the  place  of  butter  on  his  bread. 
Occasionally  he  graced  his  table  with  a  piece  of  round- 
steak,  or  with


In [7]:
import re
import unicodedata
from collections import Counter
from typing import List

# Typographic ligatures and the plain-letter sequences that replace them.
LIGATURES = {
    "ﬀ": "ff",
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬅ": "ft",
    "ﬆ": "st",
}

# Single-character punctuation substitutions applied after ligature expansion:
# en dash / em dash / not sign, curly quotes, and the non-breaking space.
_PUNCTUATION_MAP = str.maketrans({
    "\u2013": "-",
    "\u2014": " - ",
    "\u00AC": "-",
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
    "\u00A0": " ",
})

def normalize_unicode(text: str) -> str:
    """Bring the text to a plain, predictable character set.

    Steps, in order: NFKC normalization, expansion of every entry in
    ``LIGATURES``, then replacement of typographic dashes, the not sign,
    curly quotes and non-breaking spaces with ASCII equivalents.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    normalized = unicodedata.normalize("NFKC", text)
    for ligature, expansion in LIGATURES.items():
        normalized = normalized.replace(ligature, expansion)
    # None of the replacement strings contains a mapped character, so a single
    # translate pass gives the same result as replacing one character at a time.
    return normalized.translate(_PUNCTUATION_MAP)

def remove_uc_only_lines(text: str) -> str:
    """Drop short all-caps lines that look like running headers or footers.

    A line is removed when, after stripping, it is shorter than 60 characters,
    contains more than three ASCII letters which are all uppercase, does not
    end in sentence punctuation (``.``, ``?`` or ``!``) and does not contain
    the word "chapter" in any case. Every other line, blank lines included,
    is kept exactly as it was.

    Args:
        text: Text to filter, one candidate per line.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    def looks_like_header(line: str) -> bool:
        core = line.strip()
        if not core or len(core) >= 60:
            return False
        letters = re.sub(r'[^A-Za-z]', '', core)
        if len(letters) <= 3 or letters != letters.upper():
            return False
        # Lines ending like a sentence are treated as body text, e.g. shouted dialogue.
        if re.search(r'[.?!]\s*$', core):
            return False
        return 'chapter' not in core.lower()

    return "\n".join(ln for ln in text.splitlines() if not looks_like_header(ln))

def remove_page_numbers(text: str) -> str:
    """Remove lines that consist solely of a page number.

    Two shapes are recognised, both allowing surrounding whitespace: a bare
    run of digits, and digits prefixed by "page", "pg" or "p." (case
    insensitive).

    Args:
        text: Text to filter.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    page_marker_patterns = (
        re.compile(r'\s*\d+\s*'),
        re.compile(r'\s*(page|pg|p\.)\s*\d+\s*', flags=re.IGNORECASE),
    )
    kept = [
        ln for ln in text.splitlines()
        if not any(pattern.fullmatch(ln) for pattern in page_marker_patterns)
    ]
    return "\n".join(kept)

def fix_hyphenation(text: str) -> str:
    """Re-join words that were split with a hyphen at the end of a line.

    Two substitutions run in sequence: first a hyphen immediately followed by
    a newline, then a hyphen followed by any whitespace that contains a
    newline (this also spans blank lines, because ``\\s`` matches newlines).
    In both cases the hyphen and the whitespace between the two letters are
    removed, so ``'hy- \\nphenated'`` becomes ``'hyphenated'``.

    Args:
        text: Text whose line-end hyphenation should be undone.

    Returns:
        The text with split words merged.
    """
    line_break_hyphen_patterns = (
        r'([A-Za-z])-\n([A-Za-z])',
        r'([A-Za-z])-\s*\n\s*([A-Za-z])',
    )
    for pattern in line_break_hyphen_patterns:
        text = re.sub(pattern, r'\1\2', text)
    return text

def reflow_paragraphs(text: str) -> str:
    """Join the lines of each paragraph into a single line.

    Paragraphs are delimited by one or more blank lines. A line that holds
    only whitespace counts as blank, and CRLF line endings are recognised, so
    OCR output with stray trailing spaces is still split into paragraphs
    instead of being merged into one block.

    Within a paragraph, lines are stripped, joined with a single space, and
    runs of whitespace are collapsed to one space.

    Args:
        text: Text with hard line breaks inside paragraphs.

    Returns:
        One line per paragraph, joined with a single ``"\\n"``. An empty
        segment (for example before a leading blank line) yields an empty line.
    """
    # '\n\s*\n' also matches separators such as '\n \n' and '\n\r\n',
    # which a plain '\n{2,}' would miss.
    paragraphs = re.split(r'\n\s*\n', text)
    reflowed = []
    for para in paragraphs:
        lines = [ln.strip() for ln in para.splitlines() if ln.strip()]
        if not lines:
            reflowed.append("")
            continue
        joined = re.sub(r'\s+', ' ', " ".join(lines)).strip()
        reflowed.append(joined)
    return "\n".join(reflowed)

def dedupe_repeated_header_footer(text: str, page_break_token: str = None) -> str:
    """Remove lines that recur as the first or last line of many pages.

    The text is split into pages on ``page_break_token`` when that token is
    given and occurs in the text; otherwise it is cut into fixed 3000-character
    slices as a rough stand-in for pages. For each page the first and last
    non-blank lines (stripped, truncated to 120 characters) are counted. A line
    is treated as a running header/footer when its count exceeds both 1 and
    40% of the number of pages.

    Every line of the text whose stripped form equals such a candidate is
    removed, wherever it occurs, not only at page boundaries.

    Args:
        text: Full document text.
        page_break_token: Optional page separator such as ``'\\f'``.

    Returns:
        The text without the repeated lines, re-joined with ``"\\n"``; the
        input is returned unchanged when no candidate qualifies.
    """
    approx_chars = 3000
    if page_break_token and page_break_token in text:
        pages = text.split(page_break_token)
    else:
        pages = [text[start:start + approx_chars] for start in range(0, len(text), approx_chars)]

    header_cands = Counter()
    footer_cands = Counter()
    for page in pages:
        content = [ln.strip() for ln in page.splitlines() if ln.strip()]
        if content:
            header_cands[content[0][:120]] += 1
            footer_cands[content[-1][:120]] += 1

    # Blank pages still count towards the page total used for the threshold.
    min_count = max(1, max(1, len(pages)) * 0.4)
    repeated = {
        line
        for candidates in (header_cands, footer_cands)
        for line, count in candidates.items()
        if count > min_count
    }
    if not repeated:
        return text
    return "\n".join(ln for ln in text.splitlines() if ln.strip() not in repeated)

def collapse_whitespace(text: str) -> str:
    """Squeeze horizontal whitespace and cap consecutive newlines at two.

    Runs of spaces/tabs become a single space, three or more consecutive
    newlines become exactly two, and leading/trailing whitespace is stripped.

    Args:
        text: Text to tidy.

    Returns:
        The tidied text.
    """
    single_spaced = re.sub(r'[ \t]+', ' ', text)
    squeezed = re.sub(r'\n{3,}', '\n\n', single_spaced)
    return squeezed.strip()

def clean_book_text(raw: str, page_break_token: str = None) -> str:
    """Run the full cleaning pipeline over raw book text.

    Order of stages: unicode normalization, repeated header/footer removal
    (the only stage that uses ``page_break_token``), page-number removal,
    all-caps line removal, hyphenation repair, paragraph reflow and finally
    whitespace collapsing.

    Args:
        raw: Raw book text.
        page_break_token: Optional page separator forwarded to
            ``dedupe_repeated_header_footer``.

    Returns:
        The cleaned text.
    """
    text = dedupe_repeated_header_footer(
        normalize_unicode(raw), page_break_token=page_break_token
    )
    remaining_stages = (
        remove_page_numbers,
        remove_uc_only_lines,
        fix_hyphenation,
        reflow_paragraphs,
        collapse_whitespace,
    )
    for stage in remaining_stages:
        text = stage(text)
    return text

In [36]:
# Character count of the raw fetched text, before any cleaning.
print(len(book_text))

930190


In [8]:
# raw = open('documents/Martin_Eden_raw.txt', 'r', encoding='utf-8').read()
cleaned = clean_book_text(book_text, page_break_token='\f')  # pass '\f' if present
# Write through a context manager so the file handle is flushed and closed right away;
# a bare open(...).write(...) leaves the handle open until garbage collection.
with open('documents/Martin_Eden_clean.txt', 'w', encoding='utf-8') as clean_file:
    n_chars_written = clean_file.write(cleaned)
n_chars_written  # number of characters written, shown as the cell output

771514

In [7]:
# Save the raw (uncleaned) text via the loader module; per the recorded output the helper
# prints the destination and returns its path.
path_book_content = bcl.write_book_to_file(book_text, search_query)

Book text written to documents/Martin_Eden.txt


In [8]:
# Display the path returned by write_book_to_file.
path_book_content

'documents/Martin_Eden.txt'

## RAG with HuggingFace

In [3]:
import retrieval.rag_retriever as rr

In [6]:
# Model identifier handed to LocalRAGSystem.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Build the RAG system over the cleaned text file written earlier in this notebook.
# The recorded output shows chapter detection, chunking, embedding and model loading
# all happening during construction.
rag = rr.LocalRAGSystem("documents/Martin_Eden_clean.txt",
                        model_name=model_name)

Identified 49 chapters/sections.
Created 1122 chunks from 49 chapters.
Creating embeddings and vector store...
Vector store created with 1122 chunks.
Loading model: meta-llama/Meta-Llama-3-8B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded successfully with 4-bit quantization!
RAG system initialized successfully!


In [7]:
# Total characters per chapter. Chunks are grouped by consecutive runs of the same
# 'chapter_index' (assumes each chapter's chunks are contiguous in rag.docs — TODO confirm).
chapter_totals = [[rag.docs[0].metadata['chapter_index'], 0]]
for chunk in rag.docs:
    chapter_id = chunk.metadata['chapter_index']
    if chapter_id != chapter_totals[-1][0]:
        # A new chapter starts: open a fresh running total for it.
        chapter_totals.append([chapter_id, 0])
    chapter_totals[-1][1] += len(chunk.page_content)

for chapter_id, total in chapter_totals:
    print(f"Total char for chapter {chapter_id}: {total}")

Total char for chapter 0: 673
Total char for chapter 1: 25523
Total char for chapter 2: 22391
Total char for chapter 3: 15606
Total char for chapter 4: 1639
Total char for chapter 5: 9251
Total char for chapter 6: 12479
Total char for chapter 7: 16988
Total char for chapter 8: 23548
Total char for chapter 9: 18517
Total char for chapter 10: 20406
Total char for chapter 11: 13656
Total char for chapter 12: 15697
Total char for chapter 13: 10918
Total char for chapter 14: 22309
Total char for chapter 15: 26319
Total char for chapter 16: 17250
Total char for chapter 17: 16768
Total char for chapter 18: 14733
Total char for chapter 19: 8747
Total char for chapter 20: 1789
Total char for chapter 21: 12943
Total char for chapter 22: 15396
Total char for chapter 23: 13125
Total char for chapter 24: 14528
Total char for chapter 25: 13856
Total char for chapter 26: 19089
Total char for chapter 27: 20131
Total char for chapter 28: 23267
Total char for chapter 29: 30427
Total char for chapter 30:

In [7]:
# Inspect the vector index held by the RAG system (a FAISS IndexFlatL2 in the recorded output).
rag.index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x00000279592D13E0> >

In [10]:
# Ask a question against the book. chapter_max presumably restricts retrieval to the
# first chapters — TODO confirm against retrieval.rag_retriever.
result = rag.query("Who is Ruth?", chapter_max=5)
# result = rag.query("What is Ruth's full name?")
# Print the whole result object.
print(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Retrieved indices: [[ 12  95  88  94  73  30  72  29 106  16]]
Relevant context for question:
" Ruth, this is Mr. Eden."

...

"Rath." He had not thought a simple sound could be so beautiful. It delighted his ear, and he grew intoxicated with the repetition of it. "Ruth." It was a talisman, a magic word to conjure with. Each time he murmured it, her face shimmered before him, suffusing the foul wall with a golden radiance. This radiance did not stop at the wall. It extended on into infinity, and through its golden depths his soul went questing after hers. The best that was in him was pouring out in splendid flood. The very thought of her ennobled and purified him, made him better, and made him want to be better. This was new to him. He had never known women who had made him better. They had always had the counter effect of making him beastly. He did not

...

She nodded her head resignedly.
"His eyes was pretty shiny," she confessed; "and he didn't have no collar, though he went away w

In [6]:
# Print only the value stored under the 'result' key (the generated answer in the recorded output).
print(result['result'])

Based on the provided context, Ruth's full name is not explicitly mentioned. However, it is mentioned that Mr. Eden addresses her as "Ruth, this is Mr. Eden."
