In [69]:
import os
import glob
from bs4 import BeautifulSoup
import re

In [70]:
# Merge all .html files inside each volume subfolder of the epub_source folder into a single .txt file per volume, stored in the temp folder

def get_text_from_html(html):
    """Return the text content of an HTML document with the markup removed.

    The markup is parsed with BeautifulSoup using Python's built-in
    ``html.parser`` backend, and the document's text is extracted with
    ``get_text()``.

    Args:
        html: The HTML markup to parse.

    Returns:
        The text extracted from ``html``.
    """
    return BeautifulSoup(html, 'html.parser').get_text()

# Define the root folder (one subfolder of .html files per volume).
# The path is assembled with os.path.join so the separators are right on
# every platform; a literal '..\..\texts\...' only works on Windows.
root_folder = os.path.relpath(os.path.join('..', '..', 'texts', '毛泽东文集', 'epub_source'))

# Create the temp output folder (computed once and reused for every volume)
output_folder = os.path.relpath(os.path.join('..', '..', 'texts', '毛泽东文集', 'temp'))
os.makedirs(output_folder, exist_ok=True)

# Iterate over the subfolders within the root folder
for folder in sorted(os.listdir(root_folder)):
    folder_path = os.path.join(root_folder, folder)

    # Only subfolders are processed; anything else in the root is skipped
    if not os.path.isdir(folder_path):
        continue

    # Collect the text of every .html file, in sorted file-name order
    texts = []
    for file in sorted(glob.glob(os.path.join(folder_path, "*.html"))):
        try:
            # Try UTF-8 first instead of depending on the machine's locale.
            # NOTE(review): assumes the EPUB HTML is UTF-8 -- confirm.
            with open(file, "r", encoding="utf-8") as f:
                html_content = f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to the platform default encoding
            with open(file, "r") as f:
                html_content = f.read()
        texts.append(get_text_from_html(html_content))

    # Write the accumulated text to <folder>.txt in the temp folder.
    # The platform default encoding is kept on purpose: the later cells open
    # these temp files without an explicit encoding and must read back
    # exactly what is written here.
    output_file = os.path.join(output_folder, folder + ".txt")
    with open(output_file, "w") as f:
        f.write("".join(texts))

In [71]:
def clean_text(txt):
    """Strip conversion artefacts and footnote sections from a volume's text.

    Every (pattern, replacement) pair in the table below is applied to the
    whole text with ``re.sub``, strictly in the order listed. The order
    matters: for example, full-width brackets are normalised before the
    patterns that look for ``[...]``.

    Args:
        txt: The raw text of one volume.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
        Passages are separated by lines containing ``-----``.
    """
    substitutions = (
        # Every "未知" together with the whitespace/newlines around it
        (r'\n*\s*未知\s*\n*', ''),

        # "Chapter_<digits>" immediately followed by a newline
        (r'Chapter_\d+\n', ''),

        # A literal backslash plus any 's' characters and newlines that
        # directly follow it.
        # NOTE(review): possibly meant to be r'\\\s*\n*' (backslash followed
        # by whitespace) -- confirm against the data before changing.
        (r'\\s*\n*', ''),

        # Full-width square brackets become ASCII square brackets
        ('［', r'['),
        ('］', r']'),

        # A line that starts with “ and ends with ” with at least 245
        # characters in between is collapsed to a single newline
        (r'\n“.{245,}”\n', '\n'),

        # Two specific passages that are deleted verbatim
        (r'\n毛泽东主席说，目前的形势对全世界争取和平的人民有利。总的趋势是东风压倒西风。毛主席说，美帝国主义九年来侵占了我国领土台湾，不久以前又派遣它的武装部队侵占了黎巴嫩。美国在全世界许多国家建立了几百个军事基地。中国领土台湾、黎巴嫩以及所有美国在外国的军事基地，都是套在美帝国主义脖子上的绞索。不是别人而是美国人自己制造这种绞索，并把它套在自己的脖子上，而把绞索的另一端交给了中国人民、阿拉伯各国人民和全世界一切爱和平反侵略的人民。美国侵略者在这些地方停留得越久，套在它的头上的绞索就将越紧。\n毛泽东主席又说，美帝国主义在全世界到处制造紧张局势。以期达到它侵略和奴役各国人民的目的。美帝国主义自以为紧张局势总是对它自己有利，但是事实是，美国制造的这些紧张局势走向了美国人愿望的反面，它起了动员全世界人民起来反对美国侵略者的作用。毛主席说，美国垄断资本集团如果坚持推行它的侵略政策和战争政策，势必有一天要被全世界人民处以绞刑。其他美国帮凶也将是这样。\n毛主席对于中美两国在华沙即将开始的大使级代表的谈判寄予希望。他说：如果双方具有解决问题的诚意的话，谈判可能会取得某些成果。现在全世界人民都在注视着两国代表将要进行的谈判。\n', ''),
        (r'\n一九五二年，德意志联邦共和国于一九五五年，西班牙于一九八二年，波兰、捷克和匈牙利于一九九九年，正式加入该组织。\n', ''),

        # A rule of eighty hyphens followed by a 注释 section and its
        # "[n] ..." entries becomes the passage delimiter '-----'
        (r'--------------------------------------------------------------------------------\n*?\s*注释\s*(\[.*?\].*\s*)*\n*', r'-----\n'),

        # Same for a 注释 section that is not preceded by the eighty-hyphen
        # rule; the delimiter is then padded with blank lines in front
        (r'\s*注释\s*(\[.*?\].*\s*)*\n*', r'\n\n\n\n-----\n'),

        # The "毛泽东文集 第…卷" heading and the whitespace after it
        (r'毛泽东文集 第.*?卷\s*', ''),

        # All remaining in-text annotation markers such as [1]
        (r'\[.*?\]', ''),
    )

    result = txt
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip()

# Define the root folder: the temp folder holding one merged .txt per volume.
# Built with os.path.join so the separators are right on every platform.
root_folder = os.path.relpath(os.path.join('..', '..', 'texts', '毛泽东文集', 'temp'))

# Iterate over the .txt files within the root folder
for file in sorted(glob.glob(os.path.join(root_folder, "*.txt"))):
    # Read the whole file and close it *before* reopening it for writing,
    # so the file is never truncated while a read handle is still open.
    with open(file, "r") as f:
        txt = f.read()

    cleaned_txt = clean_text(txt)

    # Write the cleaned text back to the same .txt file (in place)
    with open(file, "w") as f:
        f.write(cleaned_txt)

In [72]:
import os

# Define the root folder (one output subfolder per volume is created in it).
# Built with os.path.join so the separators are right on every platform;
# on Windows the result is identical to the former backslash literal.
root_folder = os.path.relpath(os.path.join('..', '..', 'texts', '毛泽东文集'))

# Define the temp folder (holds the cleaned, merged .txt file of each volume)
temp_folder = os.path.relpath(os.path.join('..', '..', 'texts', '毛泽东文集', 'temp'))


# Define a function to strip invalid characters from a file name
def sanitize_file_name(file_name):
    """Return ``file_name`` with the characters ``/ : * ? " < > |`` removed.

    The backslash is not removed: inside the character class, ``\\/`` is
    just an escaped forward slash.

    Args:
        file_name: The string to sanitize.

    Returns:
        A copy of ``file_name`` without any of the characters listed above.
    """
    invalid_characters = re.compile(r'[\/:*?"<>|]')
    return invalid_characters.sub('', file_name)



# Iterate over the .txt files within the temp folder
for file in sorted(os.listdir(temp_folder)):
    if not file.endswith(".txt"):
        continue

    # Read the content of the file
    file_path = os.path.join(temp_folder, file)
    with open(file_path, "r") as f:
        content = f.read()

    # Create a directory with the same name as the file in the root folder
    folder_name = os.path.splitext(file)[0]
    folder_path = os.path.join(root_folder, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # Split the content by '-----' and write each passage to its own file
    for i, passage in enumerate(content.split('-----\n')):
        # Skip empty passages (they still consume an index, so the numbering
        # of the written files can have gaps)
        if not passage.strip():
            continue

        # The first line of the passage is used as its title
        title = passage.strip().split('\n', 1)[0]

        # Sanitize only the title, never the whole path, and do it up front
        # rather than after an OSError. Sanitizing the full path would also
        # delete '/' path separators, and a ':' in a title does not raise on
        # Windows (it is taken as an NTFS alternate-data-stream separator).
        # sanitize_file_name leaves backslashes in place, so drop them here.
        safe_title = sanitize_file_name(title).replace('\\', '')

        # Create the file path
        passage_file_path = os.path.join(folder_path, f"{i+1}_{safe_title}.txt")

        # Write the passage to the file
        with open(passage_file_path, "w") as f:
            f.write(passage)