In [15]:
import os
import re
from datetime import datetime

# Define the paths, relative to the current working directory.
# They are assembled from components instead of a backslash literal so the
# script also resolves them on POSIX systems, where "\" is an ordinary
# filename character rather than a separator. On Windows the result is the
# same string as before.
input_folder = os.path.relpath(
    os.path.join(os.pardir, os.pardir, 'texts', '毛泽东文集')
)
output_folder = os.path.relpath(
    os.path.join(os.pardir, os.pardir, 'corpus', 'mao')
)

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Define a function to convert Chinese date to "dd-mm-yyyy"
def convert_chinese_date(chinese_date):
    """Convert a Chinese-numeral date to a ``dd-mm-yyyy`` string.

    The input is expected to look like ``一九四九年十月一日``: a year written
    digit by digit, then ``年``, a month, ``月``, a day and ``日``. Besides
    ``零``, the zero glyphs ``〇`` and ``○`` are accepted, because years such
    as 1950 are commonly printed as ``一九五〇``.

    Args:
        chinese_date: The date text. Anything after the first ``日`` that
            follows the month is ignored.

    Returns:
        The date formatted as ``dd-mm-yyyy`` (zero-padded day and month).

    Raises:
        ValueError: If one of the markers 年/月/日 is missing or out of
            order, if a numeral group has an unsupported shape, or if the
            numbers do not form a valid calendar date with a 4-digit year.
        KeyError: If a numeral group contains a character that is not a
            supported numeral.
    """
    # Single digits only; '十' is handled structurally in convert_part.
    digits = {
        '零': 0, '〇': 0, '○': 0,
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9,
    }

    def convert_part(part):
        """Return the decimal string for one numeral group."""
        if '十' not in part:
            # Positional notation, e.g. 一九四九 -> "1949", 五 -> "5".
            return ''.join(str(digits[char]) for char in part)

        # Tens notation: 十 -> 10, 十二 -> 12, 二十 -> 20, 三十一 -> 31.
        tens, _, units = part.partition('十')
        if len(tens) > 1 or len(units) > 1:
            raise ValueError(f"unsupported numeral group: {part!r}")
        tens_value = digits[tens] if tens else 1
        units_value = digits[units] if units else 0
        return str(tens_value * 10 + units_value)

    # Locate the markers in order so a stray character cannot produce
    # negative or crossed slice bounds.
    year_end = chinese_date.find('年')
    month_end = chinese_date.find('月', year_end + 1)
    day_end = chinese_date.find('日', month_end + 1)
    if year_end == -1 or month_end == -1 or day_end == -1:
        raise ValueError(f"not a 年/月/日 date: {chinese_date!r}")

    year = convert_part(chinese_date[:year_end])
    month = convert_part(chinese_date[year_end + 1:month_end])
    day = convert_part(chinese_date[month_end + 1:day_end])

    # strptime validates the calendar date (e.g. rejects month 13 or day 32).
    date = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d")
    return date.strftime("%d-%m-%Y")

# Loop through subfolders "1" to "8" and copy every text file into the output
# folder, named after the first date found in its content.

# Matches dates such as 一九四九年十月一日. Compiled once, outside the loops.
# The year also accepts the zero glyphs 〇 and ○ (e.g. 一九五〇年).
date_pattern = re.compile(r'([零〇○一二三四五六七八九]{4}年[零一二三四五六七八九十]{1,2}月[零一二三四五六七八九十]{1,3}日)')
# Normalise the alternative zero glyphs to 零 before conversion.
zero_variants = str.maketrans('〇○', '零零')
# Output filename -> source path, used only to report name collisions.
written = {}

for i in range(1, 9):
    subfolder = os.path.join(input_folder, str(i))
    print(subfolder)
    if not os.path.exists(subfolder):
        continue

    # Loop through all text files in the subfolder
    for filename in os.listdir(subfolder):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(subfolder, filename)

        with open(filepath, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        # Find the first instance of the date. The result is evaluated per
        # file, so a file without a date can never reuse the date of the
        # previously processed file.
        match = next(filter(None, map(date_pattern.search, lines)), None)
        if match is None:
            print(f"Skipping {filepath}: no date found")
            continue

        chinese_date = match.group(1)
        try:
            roman_date = convert_chinese_date(chinese_date.translate(zero_variants))
        except (KeyError, ValueError) as error:
            # The pattern can match text that is not a real date (e.g. a
            # 13th month); skip this file instead of aborting the run.
            print(f"Skipping {filepath}: cannot convert {chinese_date!r} ({error})")
            continue

        # Remove empty rows at the end plus one more row
        while lines and not lines[-1].strip():
            lines.pop()
        if lines:
            lines.pop()

        # Write the modified content to the new file
        new_filename = f"{roman_date}.txt"
        new_filepath = os.path.join(output_folder, new_filename)

        # Two sources with the same date map to the same output name. The
        # later one still overwrites the earlier one, but now it is reported.
        earlier_source = written.get(new_filename)
        if earlier_source is not None:
            print(f"Warning: {filepath} overwrites {new_filename} (previously written from {earlier_source})")
        written[new_filename] = filepath

        with open(new_filepath, 'w', encoding='utf-8') as new_file:
            new_file.writelines(lines)

..\..\texts\毛泽东文集\1
..\..\texts\毛泽东文集\2
..\..\texts\毛泽东文集\3
..\..\texts\毛泽东文集\4
..\..\texts\毛泽东文集\5
..\..\texts\毛泽东文集\6
..\..\texts\毛泽东文集\7
..\..\texts\毛泽东文集\8
