# Draft notebook for the application

## General imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import torch


In [3]:
# Make the parent of the notebook's directory the working directory and put it
# on sys.path, so the `src.*` imports and the relative `src/...` paths used by
# later cells resolve.
# The starting directory is captured only once per kernel: calling os.getcwd()
# again on a re-run would return the already-changed directory and the cell
# would climb one more level up the tree each time it is executed.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
current_dir = _NOTEBOOK_START_DIR
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
# Avoid stacking duplicate sys.path entries when the cell is re-run.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

## Test book content loader

In [4]:
import src.loaders.book_content_loader as bcl

In [11]:
# Queries used while trying out the loader. They are kept in one place so that
# switching books is a one-token change instead of a stack of reassignments
# where only the last line takes effect.
SAMPLE_QUERIES = (
    "The Murder of Roger Ackroyd",
    "The Big Sleep",
    "Martin Eden",
    "Jules Verne",
)
search_query = SAMPLE_QUERIES[-1]  # "Jules Verne"

# Raw search results; consumed by the candidate and archive-page cells below.
res_search = bcl.search_books(search_query)

In [12]:
# Derive the selectable book candidates from the raw search results and echo
# them. In the recorded run this is a list of dicts with 'label', 'ia_id',
# 'title' and 'value' keys.
candidates = bcl.get_book_candidates(res_search)
print(f'candidates: {candidates}')

candidates: [{'label': 'Le Tour du Monde en Quatre-Vingts Jours (1872) by Jules Verne', 'ia_id': 'tourofworldineig00vern_1', 'title': 'Le Tour du Monde en Quatre-Vingts Jours', 'value': 'tourofworldineig00vern_1'}, {'label': 'Vingt mille lieues sous les mers (1870) by Jules Verne', 'ia_id': 'cihm_77498', 'title': 'Vingt mille lieues sous les mers', 'value': 'cihm_77498'}, {'label': 'De la terre à la lune (1865) by Jules Verne', 'ia_id': 'baltimoregunclub00vern', 'title': 'De la terre à la lune', 'value': 'baltimoregunclub00vern'}, {'label': 'Voyage au Centre de la Terre (1867) by Jules Verne', 'ia_id': 'voyageaucentrede0000vern_k5b6', 'title': 'Voyage au Centre de la Terre', 'value': 'voyageaucentrede0000vern_k5b6'}, {'label': "L'Île mystérieuse (1870) by Jules Verne", 'ia_id': 'mysteriousisland1884vern', 'title': "L'Île mystérieuse", 'value': 'mysteriousisland1884vern'}, {'label': 'Cinq semaines en ballon (1867) by Jules Verne', 'ia_id': 'fiveweeksinballo00vern', 'title': 'Cinq semain

In [13]:
# Bare expression: lets the notebook render the candidates list as the cell's
# output value, which is easier to read than the single-line print.
candidates

[{'label': 'Le Tour du Monde en Quatre-Vingts Jours (1872) by Jules Verne',
  'ia_id': 'tourofworldineig00vern_1',
  'title': 'Le Tour du Monde en Quatre-Vingts Jours',
  'value': 'tourofworldineig00vern_1'},
 {'label': 'Vingt mille lieues sous les mers (1870) by Jules Verne',
  'ia_id': 'cihm_77498',
  'title': 'Vingt mille lieues sous les mers',
  'value': 'cihm_77498'},
 {'label': 'De la terre à la lune (1865) by Jules Verne',
  'ia_id': 'baltimoregunclub00vern',
  'title': 'De la terre à la lune',
  'value': 'baltimoregunclub00vern'},
 {'label': 'Voyage au Centre de la Terre (1867) by Jules Verne',
  'ia_id': 'voyageaucentrede0000vern_k5b6',
  'title': 'Voyage au Centre de la Terre',
  'value': 'voyageaucentrede0000vern_k5b6'},
 {'label': "L'Île mystérieuse (1870) by Jules Verne",
  'ia_id': 'mysteriousisland1884vern',
  'title': "L'Île mystérieuse",
  'value': 'mysteriousisland1884vern'},
 {'label': 'Cinq semaines en ballon (1867) by Jules Verne',
  'ia_id': 'fiveweeksinballo00ver

In [10]:
# Ask the loader for the archive page that corresponds to the search results.
# NOTE(review): presumably a URL string, and how one result is chosen among
# several is decided inside bcl.get_book_archive_page — confirm in the loader.
url_archive = bcl.get_book_archive_page(res_search)

In [11]:
# Fetch the book's text for that archive page through the loader.
book_text = bcl.fetch_book_text(url_archive)

In [12]:
# Sanity check on the fetched text: print its length before cleaning.
print(len(book_text))

930190


In [13]:
# Clean the raw text. '\f' is given as the page-break token; pass it when the
# fetched text contains form-feed characters.
cleaned = bcl.clean_book_text(book_text, page_break_token='\f')

# Output file is named after the search query, with spaces replaced by
# underscores, relative to the current working directory.
path_cleaned_book = f'src/documents/{search_query.replace(" ", "_")}_clean.txt'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open(path_cleaned_book, 'w', encoding='utf-8') as cleaned_file:
    nb_chars_written = cleaned_file.write(cleaned)
# Keep the number of characters written as the cell's displayed value.
nb_chars_written

771514

## RAG with HuggingFace

In [14]:
import src.retrieval.rag_retriever as rr
from src.utils.config_loader import load_config

# Load the project configuration; later cells read its 'paths' and 'models'
# sections.
config = load_config()

In [16]:
import os

# File stem of the cleaned book to index. To load the file written by the
# cleaning cell instead, use `search_query.replace(' ', '_')`.
BOOK_FILE_STEM = "baltimoregunclub00vern"

# Resolve the file inside the configured documents directory rather than
# through a machine-specific absolute path, so the notebook runs on any
# checkout.
# NOTE(review): assumes config['paths']['documents'] points at the project's
# src/documents directory — confirm in the config file.
path_cleaned_book = os.path.join(config['paths']['documents'],
                                 f"{BOOK_FILE_STEM}_clean.txt")

In [8]:
# Build the local RAG system over the selected cleaned book. Every file
# location and model identifier comes from the loaded configuration.
cfg_paths = config['paths']
cfg_models = config['models']
rag = rr.LocalRAGSystem(
    path_documents=path_cleaned_book,
    path_token_hf=cfg_paths['hf_token'],
    path_custom_prompt=cfg_paths['custom_prompt'],
    model_name=cfg_models['llm'],
    model_name_embeddings=cfg_models['embeddings'],
    model_name_reranker=cfg_models['reranker'],
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [13]:
# Ad-hoc retrieval checks: keep exactly one `rag.query(...)` line active. The
# commented lines are alternative questions for the other books tried here.
# NOTE(review): `chapter_max` presumably caps the chapters the retriever may
# draw from, and `debug_print` presumably enables verbose tracing — confirm
# both in LocalRAGSystem.query.

# Martin Eden tests
result = rag.query("Who is Ruth?", chapter_max=5)
# result = rag.query("What is Ruth's full name?")
# result = rag.query("What is Martin final destination and what happened to him?", chapter_max=50, debug_print=True)
# result = rag.query("Did Martin drown to death?", chapter_max=50, debug_print=True)

# The Murder of Roger Ackroyd tests
# result = rag.query("What event prompts Dr. Sheppard's involvement in the story?", chapter_max=2)
# result = rag.query("Who is Roger Ackroyd?", chapter_max=3)
# result = rag.query("What unique role does Hercule Poirot play in the village at the beginning?", chapter_max=4)

# The Big Sleep tests
# NOTE(review): this question duplicates the first Roger Ackroyd one above and
# is not specific to The Big Sleep — replace it with a relevant question.
# result = rag.query("What event prompts Dr. Sheppard's involvement in the story?", chapter_max=2)

# The generated answer text is stored under the 'result' key.
print(result['result'])

Based on the provided context, Ruth is a woman who is being talked about by Mr. Eden. She is described as a pale, ethereal creature with wide, spiritual blue eyes and a wealth of golden hair.


### Utilities

In [None]:
def chapter_char_totals(docs):
    """Sum the characters of ``page_content`` per run of same-chapter documents.

    Documents are grouped in iteration order on ``metadata['chapter_index']``:
    consecutive documents sharing an index form one group, and an index that
    reappears after a different one starts a new group.

    Args:
        docs: Iterable of objects exposing ``metadata['chapter_index']`` and a
            sized ``page_content``.

    Returns:
        A list of ``(chapter_index, total_chars)`` tuples, one per group, in
        encounter order. An empty ``docs`` yields an empty list instead of
        raising ``IndexError``.
    """
    from itertools import groupby  # local import keeps the cell self-contained

    return [
        (chapter_index, sum(len(doc.page_content) for doc in chapter_docs))
        for chapter_index, chapter_docs in groupby(
            docs, key=lambda doc: doc.metadata['chapter_index'])
    ]


# Report the size of every chapter held by the RAG system.
for chapter_index, nb_char in chapter_char_totals(rag.docs):
    print(f"Total char for chapter {chapter_index}: {nb_char}")