In [1]:
# !python -m spacy download en_core_web_sm

In [2]:
import os
import spacy
import pandas as pd
from tqdm.notebook import tqdm
import json
import glob

In [3]:
import re

def preprocess_entity(entity):
    """
    Preprocess the extracted person entity by:
    - Stripping leading/trailing whitespace
    - Removing surrounding quotes and other non-alphanumeric characters
    - Replacing newline characters and excessive spaces
    - Removing non-alphabetic characters except spaces and hyphens
    - Standardizing capitalization to title case
    """
    if not isinstance(entity, str):
        return ""
    
    # Remove leading/trailing whitespace
    entity = entity.strip()
    
    # Remove surrounding quotes and other enclosing characters
    entity = re.sub(r'^[\"\'\`]+|[\"\'\`]+$', '', entity)
    
    # Replace newline characters and excessive spaces with a single space
    entity = re.sub(r'\s+', ' ', entity)
    
    # Remove non-alphabetic characters except spaces and hyphens
    entity = re.sub(r'[^\w\s-]', '', entity)
    
    # Standardize to title case
    entity = entity.title()
    
    return entity


In [4]:
def is_valid_person(entity, min_length=2, max_length=100):
    """
    Validates whether the preprocessed entity is a likely person name.

    Parameters:
    - entity (str): The preprocessed person entity.
    - min_length (int): Minimum length of the name.
    - max_length (int): Maximum length of the name.

    Returns:
    - bool: True if valid, False otherwise.
    """
    if not entity:
        return False
    
    # Length-based filtering
    if len(entity) < min_length or len(entity) > max_length:
        return False
    
    # Exclude entities with digits
    if re.search(r'\d', entity):
        return False
    
    # Exclude entities that consist solely of uppercase letters (likely acronyms)
    if entity.isupper() and len(entity) > 1:
        return False
    
    # Exclude entities with two or more consecutive hyphens
    if re.search(r'-{2,}', entity):
        return False
    
    # Exclude entities containing 'And' or 'Or' as separate words
    if re.search(r'\b(?:And|Or)\b', entity):
        return False
    
    # Exclude entities prefixed with honorifics
    honorifics = ['Miss', 'Mr', 'Mrs', 'Ms', 'Dr', 'Sir', 'Lady', 'Lord', 'Prof', 'Rev']
    # Create a regex pattern to match any honorific at the start followed by a space
    honorific_pattern = r'^(?:' + '|'.join(honorifics) + r')\s+'
    if re.match(honorific_pattern, entity):
        return False
    
    # Additional heuristic: At least one uppercase character followed by lowercase (simple pattern)
    if not re.search(r'[A-Z][a-z]+', entity):
        return False
    
    return True


In [5]:
import spacy

def extract_valid_person_entities(text, nlp_model, chunk_size=5000):
    """
    Extracts and validates unique person entities from the input text using spaCy by processing in chunks.

    Parameters:
    - text (str): The text to analyze.
    - nlp_model: The loaded spaCy NLP model.
    - chunk_size (int): Number of characters per chunk.

    Returns:
    - List of validated unique person entities.
    """
    person_entities = []
    doc = nlp_model(text)
    
    # Use spaCy's sentence segmentation to split the text into sentences
    sentences = list(doc.sents)
    
    current_chunk = ""
    for sentence in sentences:
        sentence_text = sentence.text.strip()
        # Check if adding this sentence exceeds the chunk size
        if len(current_chunk) + len(sentence_text) + 1 > chunk_size:
            # Process the current chunk
            if current_chunk:
                chunk_doc = nlp_model(current_chunk)
                entities = [ent.text for ent in chunk_doc.ents if ent.label_ == 'PERSON']
                person_entities.extend(entities)
                current_chunk = ""
        # Add the sentence to the current chunk
        current_chunk += " " + sentence_text
    
    # Process any remaining text in the current chunk
    if current_chunk:
        chunk_doc = nlp_model(current_chunk)
        entities = [ent.text for ent in chunk_doc.ents if ent.label_ == 'PERSON']
        person_entities.extend(entities)
    
    # Preprocess entities
    cleaned_entities = [preprocess_entity(entity) for entity in person_entities]
    
    # Validate entities
    valid_entities = [entity for entity in cleaned_entities if is_valid_person(entity)]
    
    # Remove duplicates
    unique_valid_entities = list(set(valid_entities))
    
    return unique_valid_entities


In [6]:
def save_entities_to_json(book_title, person_entities, output_dir):
    """
    Saves the list of person entities for a book into a JSON file.
    
    Parameters:
    - book_title (str): The title of the book.
    - person_entities (list): List of validated person entities.
    - output_dir (str): Directory where the JSON file will be saved.
    
    Returns:
    - None
    """
    # Sanitize the book title to create a valid filename
    sanitized_title = re.sub(r'[\\/*?:"<>|]', "", book_title)
    sanitized_title = sanitized_title.replace(' ', '_')  # Replace spaces with underscores for readability
    
    # Define the path for the individual JSON file
    individual_json_path = os.path.join(output_dir, f"{sanitized_title}_person_entities.json")
    
    # Structure the data as a dictionary
    book_data = {
        'book_title': book_title,
        'person_entities': person_entities
    }
    
    # Save the person entities to the individual JSON file
    try:
        with open(individual_json_path, 'w', encoding='utf-8') as json_file:
            json.dump(book_data, json_file, indent=4)
        print(f"Saved person entities for '{book_title}' to '{individual_json_path}'.")
    except Exception as e:
        print(f"Failed to save '{book_title}' due to error: {e}")

In [7]:
def process_single_book(book_path, nlp_model, output_dir, chunk_size=1000000):
    """
    Processes a single book: loads content, extracts and cleans person entities, and saves them.

    Parameters:
    - book_path (str): Path to the book's text file.
    - nlp_model: The loaded spaCy NLP model.
    - output_dir (str): Directory where the JSON file will be saved.
    - chunk_size (int): Number of characters per chunk for processing.

    Returns:
    - None
    """
    book_title = os.path.basename(book_path).replace('.txt', '').replace('_', ' ')
    
    try:
        with open(book_path, 'r', encoding='utf-8') as file:
            book_text = file.read()
        
        print(f"Successfully loaded '{book_title}'.")
        
        # Determine processing strategy based on text length
        if len(book_text) <= nlp_model.max_length:
            # Process the entire text
            valid_persons = extract_valid_person_entities(book_text, nlp_model)
        else:
            # Process in chunks
            print(f"Text length ({len(book_text)}) exceeds max_length ({nlp_model.max_length}). Processing in chunks.")
            valid_persons = extract_valid_person_entities(text=book_text, nlp_model=nlp_model, chunk_size=chunk_size)
        
        print(f"Number of valid person entities found in '{book_title}': {len(valid_persons)}")
        
        # Save the valid entities to a JSON file
        save_entities_to_json(book_title, valid_persons, output_dir)
        
    except Exception as e:
        print(f"Error processing '{book_title}': {e}")

In [8]:
CLEANED_DIR = '../data/selected_100_books/'
OUTPUT_DIR = '../data/processed/ner_results/'
single_book_path = os.path.join(CLEANED_DIR, 'Agatha Christie___The Secret Adversary.txt')
process_single_book(single_book_path, spacy.load('en_core_web_sm'), OUTPUT_DIR)

Successfully loaded 'Agatha Christie   The Secret Adversary'.
Number of valid person entities found in 'Agatha Christie   The Secret Adversary': 156
Saved person entities for 'Agatha Christie   The Secret Adversary' to '../data/processed/ner_results/Agatha_Christie___The_Secret_Adversary_person_entities.json'.


In [9]:
def process_all_books(cleaned_dir, output_dir):
    """
    Processes all books in the specified directory to extract and save person entities.
    
    Parameters:
    - cleaned_dir (str): Directory containing the cleaned book text files.
    - output_dir (str): Directory where individual JSON files will be saved.
    
    Returns:
    - None
    """
    # Get list of all cleaned text files in the selected 100 books directory
    cleaned_files = glob.glob(os.path.join(cleaned_dir, '*.txt'))
    
    print(f"Number of books to process: {len(cleaned_files)}")
    
    # Initialize spaCy's English model
    nlp = spacy.load('en_core_web_sm')
    # Iterate through each book and process
    for book_path in tqdm(cleaned_files, desc="Processing Books"):
        process_single_book(book_path, nlp, output_dir)

In [10]:
# Define the directory containing the selected 100 books
CLEANED_DIR = '../data/selected_100_books/'

# Define the directory where individual JSON files will be saved
OUTPUT_DIR = '../data/processed/ner_results/individual_books/'

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Process all books
process_all_books(cleaned_dir=CLEANED_DIR, output_dir=OUTPUT_DIR)

Number of books to process: 101


Processing Books:   0%|          | 0/101 [00:00<?, ?it/s]

Successfully loaded 'Agatha Christie   The Secret Adversary'.
Number of valid person entities found in 'Agatha Christie   The Secret Adversary': 156
Saved person entities for 'Agatha Christie   The Secret Adversary' to '../data/processed/ner_results/individual_books/Agatha_Christie___The_Secret_Adversary_person_entities.json'.
Successfully loaded 'Alfred Russel Wallace   Island Life'.
Text length (1057182) exceeds max_length (1000000). Processing in chunks.
Error processing 'Alfred Russel Wallace   Island Life': [E088] Text of length 1057182 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
Successfully loaded 'Andrew Lang   A Short History o

KeyboardInterrupt: 

In [21]:
# Define the directory containing the selected 100 books
CLEANED_DIR = '../data/selected_100_books/'

def calculate_max_length(book_dir):
    """
    Calculates the maximum character length among all books in the directory.

    Parameters:
    - book_dir (str): The directory containing the books.

    Returns:
    - max_length (int): The length of the longest book.
    """
    # Get list of all text files in the directory
    book_files = glob.glob(os.path.join(book_dir, '*.txt'))
    
    max_length = 0
    longest_book = ""
    
    # Iterate through each book and calculate character count
    for book_path in tqdm(book_files, desc="Calculating book lengths"):
        try:
            with open(book_path, 'r', encoding='utf-8') as file:
                text = file.read()
                char_count = len(text)
                # Update max_length if the current book is longer
                if char_count > max_length:
                    max_length = char_count
                    longest_book = os.path.basename(book_path)
        except Exception as e:
            print(f"Error reading {book_path}: {e}")
    
    print(f"\nThe longest book is '{longest_book}' with {max_length} characters.")
    return max_length

# Calculate the maximum character length for all books
max_length = calculate_max_length(CLEANED_DIR)

# Print the suggested nlp.max_length based on the longest book
print(f"\nSuggested nlp.max_length: {max_length + 100000}")  # Adding some buffer space

Calculating book lengths:   0%|          | 0/101 [00:00<?, ?it/s]


The longest book is 'William Makepeace Thackeray   The Newcomes.txt' with 2036713 characters.

Suggested nlp.max_length: 2136713
