In [2]:
import re
def separate_chapters(text):
    # Define a regular expression pattern to match chapter titles
    # This pattern matches the word "CHAPTER" followed by any roman numeral and any title.
    chapter_pattern = re.compile(r'CHAPTER\s+[IVXLCDM]+\.\s+.*', re.IGNORECASE)
    
    # Find all matches of the chapter pattern in the text
    chapters = re.split(chapter_pattern, text)
    
    # The first item in the list will be the text before the first chapter, which we don't need
    if chapters:
        chapters = chapters[1:]
    
    # Optionally, we can also retrieve the chapter titles if needed
    chapter_titles = chapter_pattern.findall(text)
    
    # Return a list of tuples where each tuple contains the chapter title and the chapter text
    return list(zip(chapter_titles, chapters))

In [1]:
with open("books/11.txt", 'r') as fp:
    alice_text = fp.read()

In [3]:
len(alice_text)

164047

In [4]:
# Separate the text into chapters
chapters = separate_chapters(alice_text)

In [7]:
# Display each chapter separately
for i, (title, text) in enumerate(chapters, start=1):
    print(f"Chapter {i}: {title}")

Chapter 1: CHAPTER I.     Down the Rabbit-Hole
Chapter 2: CHAPTER II.    The Pool of Tears
Chapter 3: CHAPTER III.   A Caucus-Race and a Long Tale
Chapter 4: CHAPTER IV.    The Rabbit Sends in a Little Bill
Chapter 5: CHAPTER V.     Advice from a Caterpillar
Chapter 6: CHAPTER VI.    Pig and Pepper
Chapter 7: CHAPTER VII.   A Mad Tea-Party
Chapter 8: CHAPTER VIII.  The Queen’s Croquet-Ground
Chapter 9: CHAPTER IX.    The Mock Turtle’s Story
Chapter 10: CHAPTER X.     The Lobster Quadrille
Chapter 11: CHAPTER XI.    Who Stole the Tarts?
Chapter 12: CHAPTER XII.   Alice’s Evidence
Chapter 13: CHAPTER I.
Down the Rabbit-Hole
Chapter 14: CHAPTER II.
The Pool of Tears
Chapter 15: CHAPTER III.
A Caucus-Race and a Long Tale
Chapter 16: CHAPTER IV.
The Rabbit Sends in a Little Bill
Chapter 17: CHAPTER V.
Advice from a Caterpillar
Chapter 18: CHAPTER VI.
Pig and Pepper
Chapter 19: CHAPTER VII.
A Mad Tea-Party
Chapter 20: CHAPTER VIII.
The Queen’s Croquet-Ground
Chapter 21: CHAPTER IX.
The Mock 

In [21]:
import sqlite3
import os
import re

# Database setup
def setup_database(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS books
                      (book_id INTEGER PRIMARY KEY, title TEXT UNIQUE)''')
    cursor.execute('''CREATE TABLE IF NOT EXISTS chapters
                      (chapter_id INTEGER PRIMARY KEY, book_id INTEGER, chapter_number TEXT, chapter_title TEXT, chapter_text TEXT,
                      FOREIGN KEY(book_id) REFERENCES books(book_id))''')
    conn.commit()
    return conn

# Function to insert a book and its chapters into the database
def insert_book_and_chapters(conn, title, chapters):
    cursor = conn.cursor()
    # Insert book
    cursor.execute('INSERT OR IGNORE INTO books (title) VALUES (?)', (title,))
    book_id = cursor.lastrowid
    if book_id == 0:  # If the book was already in the database, get its id
        cursor.execute('SELECT book_id FROM books WHERE title = ?', (title,))
        book_id = cursor.fetchone()[0]
    # Insert chapters
    for chapter_number, chapter_title, chapter_text in chapters:
        cursor.execute('INSERT INTO chapters (book_id, chapter_number, chapter_title, chapter_text) VALUES (?, ?, ?, ?)',
                       (book_id, chapter_number, chapter_title, chapter_text))
    conn.commit()

# Process books and store in database
def process_books(books_path, conn):
    for filename in os.listdir(books_path):
        if filename.endswith(".txt"):
            book_path = os.path.join(books_path, filename)
            with open(book_path, 'r', encoding='utf-8') as file:
                text = file.read()
                title = text.split('\n')[0].replace("The Project Gutenberg eBook of ", "").strip()
                title = title.replace("*** START OF THE PROJECT GUTENBERG EBOOK", "").strip()
                print(f"{book_path}:{title}")
                # Parse the "Contents" section to get titles
                contents_pattern = re.compile(r'(Contents|CONTENTS)\s*(.*?)(CHAPTER I|CHAPTER 1|THE PREFACE)', re.DOTALL)
                contents_match = contents_pattern.search(text)
                if contents_match:
                    contents_block = contents_match.group(2)
                    # Extract titles from the "Contents" block
                    titles = re.findall(r'(CHAPTER [IVXLCDM]+\.|THE PREFACE|EPILOGUE|PROLOGUE)', contents_block, re.IGNORECASE)

                    # Prepare regex pattern to match all titles in the text
                    titles_regex = '|'.join(re.escape(title) for title in titles)

                # Split the text based on titles
                sections = re.split(titles_regex, text)[1:]  # Skip the part before the first title
                for i, section in enumerate(sections):
                    # Assuming titles list is aligned with sections
                    if i < len(titles):
                        chapter_title = titles[i]
                    else:
                        chapter_title = f"Section {i+1}"
                    chapter_number = str(i + 1)
                    insert_book_and_chapters(conn, title, [(chapter_number, chapter_title, section)])


In [22]:
db_path = 'books_database.db'
books_path = "./books/"
conn = setup_database(db_path)
process_books(books_path, conn)

./books/100.txt:﻿ THE COMPLETE WORKS OF WILLIAM


UnboundLocalError: cannot access local variable 'titles_regex' where it is not associated with a value