In [3]:
import os
import glob
from bs4 import BeautifulSoup
import re

In [30]:
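# Merge all .html files in each volume folder under the "Epub source" directory (one level up) into a single .txt file per volume, saved in the "Output" directory (also one level up)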

def get_text_from_html(html):
    """Return the text content of an HTML document.

    Args:
        html: The HTML markup to parse.

    Returns:
        The string produced by BeautifulSoup's ``get_text()`` after parsing
        ``html`` with the built-in ``'html.parser'`` backend.
    """
    return BeautifulSoup(html, features='html.parser').get_text()

# Define the input and output folders.
# The paths are assembled with os.path.join/os.pardir so the separator matches
# the host OS; on Windows this yields the same '..\Epub source' and '..\Output'.
root_folder = os.path.relpath(os.path.join(os.pardir, 'Epub source'))
output_folder = os.path.relpath(os.path.join(os.pardir, 'Output'))

# Make sure the destination exists so the first write cannot fail on a missing folder
os.makedirs(output_folder, exist_ok=True)

# Iterate over the subfolders within the root folder (sorted for a stable order)
for folder in sorted(os.listdir(root_folder)):
    folder_path = os.path.join(root_folder, folder)

    # Only subfolders are processed; loose files in the root folder are skipped
    if not os.path.isdir(folder_path):
        continue

    # Find all .html files within the subfolder. The folder path is escaped so
    # that glob metacharacters in a folder name ('[', '*', '?') are taken literally.
    html_files = glob.glob(os.path.join(glob.escape(folder_path), "*.html"))

    # Extract the text of each HTML file in sorted filename order
    pieces = []
    for html_path in sorted(html_files):
        # NOTE(review): no encoding is passed, so the platform default is used
        # for decoding -- confirm it matches the encoding of the source files.
        with open(html_path, "r") as html_file:
            pieces.append(get_text_from_html(html_file.read()))
    text = "".join(pieces)

    # Write the accumulated text to <folder>.txt in the output folder
    # (platform default encoding, the same default the cleaning step reads with)
    output_file = os.path.join(output_folder, folder + ".txt")
    with open(output_file, "w") as out_file:
        out_file.write(text)

In [31]:
def clean_text(txt):
    """Strip boilerplate and annotations from the merged text of one volume.

    The substitutions below are applied in order; the result is then stripped
    of leading and trailing whitespace.

    Args:
        txt: The raw text of a volume.

    Returns:
        The cleaned text, with each removed 注释 (notes) section replaced by a
        '-----' delimiter line.
    """
    # (pattern, replacement) pairs. The order matters: the 注释 patterns rely on
    # the "[n]" prefix of each note line, so the generic "[...]" marker removal
    # has to run after them.
    substitutions = (
        # Remove stand-alone "未知" entries together with the whitespace around
        # them. The lookarounds require whitespace (or the string boundary) on
        # both sides, so "未知" occurring inside ordinary prose is left intact.
        (r'\s*(?<!\S)未知(?!\S)\s*', ''),

        # Remove the "Chapter_<number>\n" lines
        (r'Chapter_\d+\n', ''),

        # Remove a 注释 section that follows a line of eighty hyphens, along with
        # its "[n] ..." note lines, and delimit the passage with '-----'
        (r'--------------------------------------------------------------------------------\n*?\s*注释\s*(\[.*?\].*\s*)*\n*', r'-----\n'),

        # Handle the anomaly where 注释 is not preceded by the eighty-hyphen line.
        # NOTE(review): this matches "注释" anywhere, so an occurrence inside body
        # text would also be replaced by a delimiter -- confirm against the data.
        (r'\s*注释\s*(\[.*?\].*\s*)*\n*', r'\n\n\n\n-----\n'),

        # Remove the '毛泽东文集 第?卷' header and the whitespace after it
        (r'毛泽东文集 第.*?卷\s*', ''),

        # Remove all remaining in-text annotation markers such as "[1]"
        (r'\[.*?\]', ''),
    )

    cleaned_txt = txt
    for pattern, replacement in substitutions:
        cleaned_txt = re.sub(pattern, replacement, cleaned_txt)

    return cleaned_txt.strip()

# Define the folder holding the merged .txt files.
# Built with os.path.join/os.pardir so the separator matches the host OS;
# on Windows this yields the same '..\Output'.
root_folder = os.path.relpath(os.path.join(os.pardir, 'Output'))

# Clean every .txt file within the root folder in place (sorted for a stable order)
for file in sorted(glob.glob(os.path.join(root_folder, "*.txt"))):
    # Read the whole file and close it before reopening the same path for
    # writing, so a read handle and a truncating write handle never coexist.
    # No encoding is passed: the platform default is used for both the read
    # and the write, matching how the merge step wrote these files.
    with open(file, "r") as f:
        txt = f.read()

    cleaned_txt = clean_text(txt)

    # Write the cleaned text back to the .txt file
    with open(file, "w") as f:
        f.write(cleaned_txt)