In [None]:
import numpy
print(numpy.__version__)

In [39]:
import os
import csv
import re  # Ensure this line is included for regex operations
from bs4 import BeautifulSoup
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [41]:
# Specify the directory with the HTML files
html_directory = "HTML_BOOK_LIST"
output_txt_directory = "TXT_FILES"
output_csv_file = "BannedBooks_metadata.csv"

# Ensure output directories exist
os.makedirs(output_txt_directory, exist_ok=True)

# Initialize a list to store metadata
Banned_Books_metadata = []

import re

# Function to clean the author name by removing years or other metadata in parentheses
def clean_author_name(author_name):
    # Remove any text inside parentheses, including the parentheses themselves
    return re.sub(r'\s?\(.*?\)', '', author_name).strip()

# Function to extract the author
def extract_author_from_preamble(preamble):
    # Search for "Author:" pattern in the preamble
    author_match = re.search(r"^\s*Author[:\s]+(.+)$", preamble, re.IGNORECASE | re.MULTILINE)
    if author_match:
        author_name = author_match.group(1).strip()
        return clean_author_name(author_name)

    # Fallback: Look for lines with "by" indicating the author
    lines = preamble.splitlines()
    for line in lines:
        line = line.strip()
        # If "by" is in the line, treat the text after it as the author
        if "by" in line.lower():
            author_name = line.lower().split("by")[-1].strip()  # Split after "by" and strip the author name
            if author_name:
                return clean_author_name(author_name)

    return "Unknown Author"

# Function to extract the title
def extract_title_from_preamble(preamble):
    # Search for "Title:" pattern in the preamble
    title_match = re.search(r"^\s*Title[:\s]+(.+)$", preamble, re.IGNORECASE | re.MULTILINE)
    if title_match:
        return title_match.group(1).strip()

    # Fallback: Look for uppercase lines that could be the title, excluding "by" lines
    lines = preamble.splitlines()
    for line in lines:
        line = line.strip()
        # We want to ensure that the title is an uppercase line and does not contain "by"
        # Titles are typically uppercase and are not followed by "by"
        if len(line) > 3 and line.isupper() and "by" not in line:
            # This should work for lines like "ANTIC HAY"
            return line.strip()

    # If no title was found, return "Unknown Title"
    return "Unknown Title"

# Function to clean the text (remove excessive whitespace and non-alphanumeric characters)
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace excessive whitespace with a single space
    text = re.sub(r'[^\w\s.,;!?\'"-]', '', text)  # Keep basic punctuation
    return text.strip()

# Example usage in your existing function
def extract_metadata_and_clean_text(text):
    # Markers for the Project Gutenberg eBook
    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    # Extract the preamble (everything before the start marker)
    start_idx = text.find(start_marker)
    preamble = text[:start_idx].strip() if start_idx != -1 else text.strip()

    # Debug: Print the preamble to verify its structure
    #print("Preamble Preview:", preamble[:1000])

    # Extract the title and author from the preamble
    title = extract_title_from_preamble(preamble)
    author = extract_author_from_preamble(preamble)

    # Debug: Print the extracted title and author
    print("Extracted Title:", title)
    print("Extracted Author:", author)

    # Extract the main book text (everything between start and end markers)
    if start_idx != -1:
        text = text[start_idx + len(start_marker):]
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]

    # Clean the text
    text = clean_text(text)  # Use the clean_text function here
    return text, title, author


# Process each HTML file
for filename in os.listdir(html_directory):
    if filename.endswith(".html"):
        file_path = os.path.join(html_directory, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")
            book_text = soup.get_text(separator="\n")
            cleaned_text, title, author = extract_metadata_and_clean_text(book_text)

            # Save cleaned text
            txt_filename = f"{os.path.splitext(filename)[0]}.txt"
            txt_filepath = os.path.join(output_txt_directory, txt_filename)
            with open(txt_filepath, "w", encoding="utf-8") as txt_file:
                txt_file.write(cleaned_text)

            # Append metadata
            Banned_Books_metadata.append({"Title": title, "Author": author, "FileName": txt_filename})

# Write metadata to a CSV file
with open(output_csv_file, "w", encoding="utf-8", newline="") as csv_file:
    fieldnames = ["Title", "Author", "FileName"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(Banned_Books_metadata)

print("Processing complete.")

Extracted Title: Lady Chatterley's lover
Extracted Author: D. H. Lawrence
Extracted Title: Uncle Tom's Cabin
Extracted Author: Harriet Beecher Stowe
Extracted Title: The Sun Also Rises
Extracted Author: Ernest Hemingway
Extracted Title: The Great Gatsby
Extracted Author: F. Scott Fitzgerald
Extracted Title: The Origin of Species by Means of Natural Selection
Extracted Author: Charles Darwin
Extracted Title: Three Weeks
Extracted Author: Elinor Glyn
Extracted Title: Unknown Title
Extracted Author: aldous huxley
Extracted Title: Oil!
Extracted Author: Upton Sinclair
Extracted Title: Jude the Obscure
Extracted Author: Thomas Hardy
Extracted Title: Ulysses
Extracted Author: James Joyce
Extracted Title: Narrative of the Life of Frederick Douglass, an American Slave
Extracted Author: Frederick Douglass
Extracted Title: The Scarlet Letter
Extracted Author: Nathaniel Hawthorne
Extracted Title: Strange Interlude (1928)
Extracted Author: Eugene O'Neill
Extracted Title: Incidents in the Life of a

In [93]:
ns_txt_file = 'non_scrapables.csv'

# Read metadata from non-scrapable books
df = pd.read_csv(output_csv_file)
df_2 = pd.read_csv(ns_txt_file)

# Merge metadata and original DataFrames into a new, complete one 
final_df = pd.concat([df, df_2], ignore_index=True)
final_df.to_csv(output_csv_file)
final_df


Unnamed: 0,Title,Author,FileName
0,Lady Chatterley's lover,D. H. Lawrence,Lady Chatterley's Lover_D.H. Lawrence.txt
1,Uncle Tom's Cabin,Harriet Beecher Stowe,Uncle Tom's Cabin_ Harriet Beecher Stowe.txt
2,The Sun Also Rises,Ernest Hemingway,The Sun Also Rises_ Ernest Hemingway.txt
3,The Great Gatsby,F. Scott Fitzgerald,The Great Gatbsy.txt
4,The Origin of Species by Means of Natural Sele...,Charles Darwin,The Origin of Species by Means of Natural Sele...
5,Three Weeks,Elinor Glyn,Three Weeks_Elinor Glyn.txt
6,Unknown Title,aldous huxley,Antic Hay_Aldous Huxley.txt
7,Oil!,Upton Sinclair,Oil!_ Upton Sinclair.txt
8,Jude the Obscure,Thomas Hardy,Jude the Obscure_ Thomas Hardy.txt
9,Ulysses,James Joyce,Ulysses_James Joyce.txt


In [95]:
# The CSV file contains special characters and might have a BOM (Byte Order Mark),
# so we use 'utf-8-sig' encoding to ensure proper reading of the file without encoding issues.
df = pd.read_csv(output_csv_file, encoding='utf-8-sig')

# Save it again as UTF-8-sig
df.to_csv('Banned_Books_Metadata.csv', encoding='utf-8-sig', index=False)
os.remove(output_csv_file)

In [103]:
# Specify the directory with the non scrapables txt files
ns_txt_directory = "NON_SCRAPABLES_TXT"
output_txt_directory = "TXT_FILES"

# Ensure output directories exist
os.makedirs(output_txt_directory, exist_ok=True)

# Process each TXT file
for filename in os.listdir(ns_txt_directory):
    if filename.endswith(".txt"):
        file_path = os.path.join(ns_txt_directory, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            # Save text
            txt_filename = f"{os.path.splitext(filename)[0]}.txt"
            txt_filepath = os.path.join(output_txt_directory, txt_filename)
            with open(txt_filepath, "w", encoding="utf-8") as txt_file:
                txt_file.write(clean_text(file.read()))

print("Processing complete.")

Processing complete.


In [None]:
#Load and Read Text Files// Organize your text files in a directory and load them programmatically for processing.
import os

# Directory containing your text files
text_dir = "TXT_FILES"

# Load all text files
text_data = {}
for filename in os.listdir(text_dir):
    if filename.endswith(".txt"):
        with open(os.path.join(text_dir, filename), 'r', encoding='utf-8') as file:
            text_data[filename] = file.read()

In [None]:
import spacy
from nltk.corpus import stopwords
import string

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Increase the max length to 2 million characters
nlp.max_length = 2000000

# Define custom stopwords
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Preprocessing function
def preprocess_text(text):
    doc = nlp(text)
    tokens = []
    
    for token in doc:
        # Filter tokens: stopwords, punctuation, non-alphabetic words
        if token.text.lower() not in stop_words and token.text not in punctuation and token.is_alpha:
            tokens.append(token.lemma_.lower())  # Lemmatized, lowercase words
    return " ".join(tokens)

# Process all text files
preprocessed_texts = {}

for filename, content in text_data.items():
    preprocessed_texts[filename] = preprocess_text(content)

# Save preprocessed texts
output_dir = "FINAL_TXT"
os.makedirs(output_dir, exist_ok=True)

for filename, processed_text in preprocessed_texts.items():
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(processed_text)