In [69]:
import re
def separate_chapters(text):

    #remove the chuck after the story
    index = text.index("*** END OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***")
    text = text[:index]

    # Define a regular expression pattern to match chapter titles
    # This pattern matches the word "CHAPTER" followed by any roman numeral and any title.
    chapter_pattern = re.compile(r'CHAPTER\s+[IVXLCDM]+\.\s+.*', re.IGNORECASE)
    
    # Find all matches of the chapter pattern in the text
    chapters = re.split(chapter_pattern, text)
    
    # The first item in the list will be the text before the first chapter, which we don't need
    if chapters:
        chapters = chapters[1:]


    # Optionally, we can also retrieve the chapter titles if needed
    chapter_titles = chapter_pattern.findall(text)

    # If there's a table of contents, let's cut the list in half to remove duplicates
    contents_pattern = re.compile(r'(Contents|CONTENTS)\s*(.*?)(CHAPTER I|CHAPTER 1|THE PREFACE)', re.DOTALL)
    contents_match = contents_pattern.search(text)
    if contents_match:
        chapters = chapters[len(chapters)//2:]
        chapter_titles = chapter_titles[len(chapter_titles)//2:]
    
    # Return a list of tuples where each tuple contains the chapter title and the chapter text
    return list(zip(chapter_titles, chapters))

In [70]:
#Test with alice 
with open("../books/alice.txt", 'r', encoding='utf-8') as fp:
    alice_text = fp.read()

In [6]:
len(alice_text)

163945

In [71]:
# Separate the text into chapters
chapters = separate_chapters(alice_text)

In [72]:
# Display each chapter separately
for i, (title, text) in enumerate(chapters, start=1):
    print(f"Chapter {i}: {title}")
    print(f"Text:{text}")

Chapter 1: CHAPTER I.
Down the Rabbit-Hole
Text:


Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”

So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
picking the daisies, when suddenly a White Rabbit with pink eyes ran
close by her.

There was nothing so _very_ remarkable in that; nor did Alice think it
so _very_ much out of the way to hear the Rabbit say to itself, “Oh
dear! Oh dear! I shall be late!” (when she thought it over afterwards,
it occurred to her that she ought to have wondered at this, but at the
time it all seemed quite natural); but when the Rabbit actually _took a
w

In [2]:
import sqlite3

# Database setup
def setup_database(db_path):
    
    conn = sqlite3.connect(db_path, timeout = 10)
    cursor = conn.cursor()
    cursor.execute( '''CREATE TABLE IF NOT EXISTS Books(
                id INTEGER PRIMARY KEY AUTOINCREMENT, 
                author TEXT NOT NULL,
                title TEXT NOT NULL,
                pages INTEGER NOT NULL
                );''')
    cursor.execute('''
            CREATE TABLE IF NOT EXISTS Chapters(
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                chapter_number INTEGER NOT NULL,
                chapter_title TEXT NOT NULL,
                chapter_text TEXT NOT NULL,
                book_id INTEGER NOT NULL,
                FOREIGN KEY(book_id) REFERENCES Books(id)
            );
            ''')
    conn.commit()
    conn.close()
    return conn

In [73]:
import sqlite3
import os
import re

# Function to process books and store in the database
def process_books(books_path, db_path):
    for filename in os.listdir(books_path):
        if filename.endswith(".txt"):
            book_path = os.path.join(books_path, filename)
            with open(book_path, 'r', encoding='utf-8') as file:
                print(book_path)
                text = file.read()
                title = text.split('\n')[0].replace("The Project Gutenberg eBook of ", "").strip()
                title_match = re.search(r'Title:(.*?)\n', text, re.DOTALL)
                if title_match:
                    title = title_match.group(1).strip()
                else:
                    title = "Unknown Title"
                print(f"{book_path}:{title}")

                author_pattern = re.compile(r'^Author: (.*)$', re.MULTILINE)
                author_match = author_pattern.search(text)

                chapters = separate_chapters(text)

                for i, (title, text) in enumerate(chapters, start=1):
                    print(f"Chapter {i}: {title}")

                if author_match:
                    author = author_match.group(1)
                else:
                    author = "Unknown author"

                # Insert chapters into the database
                insert_book_and_chapters(db_path, title, author, chapters)
                

# Function to insert book and chapters into the database
def insert_book_and_chapters(db_path, title, author, chapters):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()

    # Insert book into the database
    c.execute('INSERT OR IGNORE INTO Books (title, author, pages) VALUES (?, ?, ?)', (title, author, 300))
    book_id = c.lastrowid

    # Insert chapters into the database
    for i, (chapter_title, text) in enumerate(chapters, start = 1):
        c.execute("INSERT INTO Chapters(book_id, chapter_number, chapter_title, chapter_text) VALUES (?, ?, ?, ?)",
                  (book_id, i, chapter_title, text))
        print("chapter number:", i, "chapter title:", chapter_title, "author:", author, "text:", text )

    conn.commit()
    conn.close()




In [60]:
db_path = 'bookclub.db'
books_path = "../books"
chapters_path = "../alice-chapters"
conn = setup_database(db_path)
process_books(books_path, db_path)

../books\alice.txt
../books\alice.txt:Alice's Adventures in Wonderland
Chapter 1: CHAPTER I.
Down the Rabbit-Hole
Chapter 2: CHAPTER II.
The Pool of Tears
Chapter 3: CHAPTER III.
A Caucus-Race and a Long Tale
Chapter 4: CHAPTER IV.
The Rabbit Sends in a Little Bill
Chapter 5: CHAPTER V.
Advice from a Caterpillar
Chapter 6: CHAPTER VI.
Pig and Pepper
Chapter 7: CHAPTER VII.
A Mad Tea-Party
Chapter 8: CHAPTER VIII.
The Queen’s Croquet-Ground
Chapter 9: CHAPTER IX.
The Mock Turtle’s Story
Chapter 10: CHAPTER X.
The Lobster Quadrille
Chapter 11: CHAPTER XI.
Who Stole the Tarts?
Chapter 12: CHAPTER XII.
Alice’s Evidence
Author Lewis Carroll
chapter number: 1 chapter title: CHAPTER I.
Down the Rabbit-Hole author: Lewis Carroll text: 


Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought 

Archived Code Blocks

In [None]:
import sqlite3
import os
import re
#300 is a hardcoded page number for now
# Process books and store in database
def process_books(books_path, db_path):
    for filename in os.listdir(books_path):
        if filename.endswith(".txt"):
            book_path = os.path.join(books_path, filename)
            with open(book_path, 'r', encoding='utf-8') as file:
                text = file.read()
                lines = text.split('\n')
                index = lines.index("*** END OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***")
                text = '\n'.join(lines[:index])
                title = text.split('\n')[0].replace("The Project Gutenberg eBook of ", "").strip()
                title = title.replace("*** START OF THE PROJECT GUTENBERG EBOOK", "").strip()
                print(f"{book_path}:{title}")
                # Parse the "Contents" section to get titles
                contents_pattern = re.compile(r'(Contents|CONTENTS)\s*(.*?)(CHAPTER I|CHAPTER 1|THE PREFACE)', re.DOTALL)
                contents_match = contents_pattern.search(text)
                if contents_match:
                    contents_block = contents_match.group(2)
                    # Extract titles from the "Contents" block
                    titles = re.findall(r'(CHAPTER [IVXLCDM]+\.|THE PREFACE|EPILOGUE|PROLOGUE)', contents_block, re.IGNORECASE)

                    # Prepare regex pattern to match all titles in the text
                    titles_regex = '|'.join(re.escape(title) for title in titles)
                for title in titles:
                    print(title)
                author_pattern = re.compile(r'^Author: (.*)$', re.MULTILINE)
                author_match = author_pattern.search(text)
                if author_match:
                    author = author_match.group(1)
                # Split the text based on titles
                sections = re.split(titles_regex, text)[1:]  # Skip the part before the first title
                for i, section in enumerate(sections):
                    # Assuming titles list is aligned with sections
                    if i < len(titles):
                        chapter_title = titles[i]
                    else:
                        chapter_title = f"Chapter {i+1}"
                    chapter_number = str(i + 1)
                    print("title:", title, "author:", author, "chapter_number", chapter_number, "chapter_title", chapter_title, 
                    "section", section)
                    insert_book_and_chapters(db_path, title, author, 300, [(chapter_number, chapter_title, section)])


In [None]:
import sqlite3
import os
import re

# Function to insert a book and its chapters into the database
def insert_book_and_chapters(db_path, title, author, pages, chapters):
    conn = sqlite3.connect(db_path, timeout = 10)
    cursor = conn.cursor()
    # Insert book
    cursor.execute('INSERT OR IGNORE INTO Books (title, author, pages) VALUES (?, ?, ?)', (title, author, pages))
    # Get book_id
    cursor.execute('SELECT id FROM Books WHERE title = ?', (title,))
    book_id = cursor.fetchone()[0]
    # Insert chapters
    for chapter_number, chapter_title, chapter_text in chapters:
        cursor.execute('INSERT INTO Chapters (book_id, chapter_number, chapter_title, chapter_text) VALUES (?, ?, ?, ?)',
                       (book_id, chapter_number, chapter_title, chapter_text))
    conn.commit()
    conn.close()