In [1]:
import re
def separate_chapters(text):
    # Define a regular expression pattern to match chapter titles
    # This pattern matches the word "CHAPTER" followed by any roman numeral and any title.
    chapter_pattern = re.compile(r'CHAPTER\s+[IVXLCDM]+\.\s+.*', re.IGNORECASE)
    
    # Find all matches of the chapter pattern in the text
    chapters = re.split(chapter_pattern, text)
    
    # The first item in the list will be the text before the first chapter, which we don't need
    if chapters:
        chapters = chapters[1:]
    
    # Optionally, we can also retrieve the chapter titles if needed
    chapter_titles = chapter_pattern.findall(text)
    
    # Return a list of tuples where each tuple contains the chapter title and the chapter text
    return list(zip(chapter_titles, chapters))

In [15]:
#Test with alice 
with open("../books.txt", 'r', encoding='utf-8') as fp:
    alice_text = fp.read()

In [16]:
len(alice_text)

11471

In [17]:
# Separate the text into chapters
chapters = separate_chapters(alice_text)

In [18]:
# Display each chapter separately
for i, (title, text) in enumerate(chapters, start=1):
    print(f"Chapter {i}: {title}")

Chapter 1: CHAPTER I.
Down the Rabbit-Hole


In [4]:
import sqlite3
import os
import re

# Function to insert a book and its chapters into the database
def insert_book_and_chapters(db_path, title, author, pages, chapters):
    conn = sqlite3.connect(db_path, timeout = 10)
    cursor = conn.cursor()
    # Insert book
    cursor.execute('INSERT OR IGNORE INTO Books (title, author, pages) VALUES (?, ?, ?)', (title, author, pages))
    # Get book_id
    cursor.execute('SELECT id FROM Books WHERE title = ?', (title,))
    book_id = cursor.fetchone()[0]
    # Insert chapters
    for chapter_number, chapter_title, chapter_text in chapters:
        cursor.execute('INSERT INTO Chapters (book_id, chapter_number, chapter_title, chapter_text) VALUES (?, ?, ?, ?)',
                       (book_id, chapter_number, chapter_title, chapter_text))
    conn.commit()
    conn.close()

In [3]:
import sqlite3

# Database setup
def setup_database(db_path):
    
    conn = sqlite3.connect(db_path, timeout = 10)
    cursor = conn.cursor()
    cursor.execute( '''CREATE TABLE IF NOT EXISTS Books(
                id INTEGER PRIMARY KEY AUTOINCREMENT, 
                author TEXT NOT NULL,
                title TEXT NOT NULL,
                pages INTEGER NOT NULL
                );''')
    cursor.execute('''
            CREATE TABLE IF NOT EXISTS Chapters(
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                chapter_number INTEGER NOT NULL,
                chapter_title TEXT NOT NULL,
                chapter_text TEXT NOT NULL,
                book_id INTEGER NOT NULL,
                FOREIGN KEY(book_id) REFERENCES Books(id)
            );
            ''')
    conn.commit()
    conn.close()
    return conn

In [2]:
import sqlite3
import os
import re
#300 is a hardcoded page number for now
# Process books and store in database
def process_books(books_path, db_path):
    for filename in os.listdir(books_path):
        if filename.endswith(".txt"):
            book_path = os.path.join(books_path, filename)
            with open(book_path, 'r', encoding='utf-8') as file:
                text = file.read()
                title = text.split('\n')[0].replace("The Project Gutenberg eBook of ", "").strip()
                title = title.replace("*** START OF THE PROJECT GUTENBERG EBOOK", "").strip()
                print(f"{book_path}:{title}")
                # Parse the "Contents" section to get titles
                contents_pattern = re.compile(r'(Contents|CONTENTS)\s*(.*?)(CHAPTER I|CHAPTER 1|THE PREFACE)', re.DOTALL)
                contents_match = contents_pattern.search(text)
                if contents_match:
                    contents_block = contents_match.group(2)
                    # Extract titles from the "Contents" block
                    titles = re.findall(r'(CHAPTER [IVXLCDM]+\.|THE PREFACE|EPILOGUE|PROLOGUE)', contents_block, re.IGNORECASE)

                    # Prepare regex pattern to match all titles in the text
                    titles_regex = '|'.join(re.escape(title) for title in titles)

                author_pattern = re.compile(r'^Author: (.*)$', re.MULTILINE)
                author_match = author_pattern.search(text)
                if author_match:
                    author = author_match.group(1)
                # Split the text based on titles
                sections = re.split(titles_regex, text)[1:]  # Skip the part before the first title
                for i, section in enumerate(sections):
                    # Assuming titles list is aligned with sections
                    if i < len(titles):
                        chapter_title = titles[i]
                    else:
                        chapter_title = f"Section {i+1}"
                    chapter_number = str(i + 1)
                    print("title:", title, "author:", author, "chapter_number", chapter_number, "chapter_title", chapter_title, 
                    "section", section)
                    insert_book_and_chapters(db_path, title, author, 300, [(chapter_number, chapter_title, section)])


In [5]:
db_path = 'bookclub.db'
books_path = "../books/"
conn = setup_database(db_path)
process_books(books_path, db_path)

../books/alice.txt:Alice's Adventures in Wonderland
Alice's Adventures in Wonderland Lewis Carroll 1 Section 1 T
Alice's Adventures in Wonderland Lewis Carroll 2 Section 2 h
Alice's Adventures in Wonderland Lewis Carroll 3 Section 3 e
Alice's Adventures in Wonderland Lewis Carroll 4 Section 4  
Alice's Adventures in Wonderland Lewis Carroll 5 Section 5 P
Alice's Adventures in Wonderland Lewis Carroll 6 Section 6 r
Alice's Adventures in Wonderland Lewis Carroll 7 Section 7 o
Alice's Adventures in Wonderland Lewis Carroll 8 Section 8 j
Alice's Adventures in Wonderland Lewis Carroll 9 Section 9 e
Alice's Adventures in Wonderland Lewis Carroll 10 Section 10 c
Alice's Adventures in Wonderland Lewis Carroll 11 Section 11 t
Alice's Adventures in Wonderland Lewis Carroll 12 Section 12  
Alice's Adventures in Wonderland Lewis Carroll 13 Section 13 G
Alice's Adventures in Wonderland Lewis Carroll 14 Section 14 u
Alice's Adventures in Wonderland Lewis Carroll 15 Section 15 t
Alice's Adventures in

KeyboardInterrupt: 