In [1]:
import pandas as pd
import os
from pathlib import Path
from datetime import datetime
from tqdm import tqdm

# 定义文件夹路径
folder_path = './NamedReports'
# 定义输出文件路径
output_file = './combined_results.csv'
# 定义处理日志Excel文件路径
log_file = './processing_log.xlsx'
# 定义当前时间和用户
CURRENT_TIME = "2025-02-28 21:41:05"
CURRENT_USER = "AdaJSY"

def extract_site_id(df):
    site_id = df.iloc[1, 1]
    return site_id

def create_processing_log(processed_files):
    """创建处理日志Excel文件"""
    log_df = pd.DataFrame(processed_files)
    log_df = log_df.sort_values('Processing_Time', ascending=False)
    log_df.to_excel(log_file, index=False)

def is_valid_file(file_path):
    """检查文件是否为有效的Excel文件（非隐藏文件）"""
    return (not file_path.name.startswith('.') and 
            file_path.suffix.lower() == '.xls')

def process_single_file(file_path):
    try:
        # 读取 HTML 文件中的所有表格
        data_to_extract_html = pd.read_html(file_path)
        site_id = extract_site_id(data_to_extract_html[0])
        
        # 获取第5个表格
        original_df = data_to_extract_html[5]
        
        # 获取列的多级索引值
        level0_values = original_df.columns.get_level_values(0)
        level1_values = original_df.columns.get_level_values(1)
        level2_values = original_df.columns.get_level_values(2)
        
        # 创建新的DataFrame
        new_data = [level0_values, level1_values, level2_values]
        new_data.extend(original_df.values)
        df = pd.DataFrame(new_data)
        
        # 检查第一列，找出以 "All" 开头且以 "bound" 结尾的行的行索引
        all_bound_rows_indices = df[df.iloc[:, 0].str.startswith('All', na=False) & 
                                  df.iloc[:, 0].str.endswith('bound', na=False)].index
        
        # 提取bound_texts
        bound_texts = df.iloc[all_bound_rows_indices, 0].values
        
        # 拆分表格
        dfs = {}
        for i in range(len(all_bound_rows_indices) - 1):
            start_idx = all_bound_rows_indices[i]
            end_idx = all_bound_rows_indices[i + 1]
            df_section = df.iloc[start_idx:end_idx].reset_index(drop=True)
            dfs[bound_texts[i]] = df_section
        
        # 处理最后一个部分
        df_section_last = df.iloc[all_bound_rows_indices[-1]:].reset_index(drop=True)
        dfs[bound_texts[-1]] = df_section_last
        
        # 提取时间行及其上一行
        for name, table in dfs.items():
            table.iloc[:, 0] = table.iloc[:, 0].fillna('')
            time_rows = table[table.iloc[:, 0].str.match(r'^\d{2}:\d{2}:\d{2}$')].index
            
            if min(time_rows) > 0:
                prev_row = [min(time_rows) - 1]
            else:
                prev_row = []
            
            rows_to_keep = sorted(list(set(time_rows) | set(prev_row)))
            updated_table = table.iloc[rows_to_keep].reset_index(drop=True)
            updated_table.iloc[0, 0] = "Time"
            dfs[name] = updated_table
        
        # 删除特定列
        for name, table in dfs.items():
            for col in table.columns:
                if table[col].iloc[0] in ["Workday", "7 Day", "Count"]:
                    table = table.drop(columns=[col])
            dfs[name] = table
        
        # 转换为长格式
        final_dfs = []
        for name, table in dfs.items():
            table.columns = table.iloc[0]
            table = table.iloc[1:]
            table = table.reset_index(drop=True)
            
            time_column = table.columns[0]
            long_format_table = pd.melt(table,
                                      id_vars=[time_column],
                                      var_name='Date',
                                      value_name='MeltedValue')
            
            # 添加标识列
            long_format_table.insert(0, 'Bound_Category', name)
            long_format_table.insert(0, 'Site_ID', site_id)
            
            final_dfs.append(long_format_table)
        
        # 合并该文件的所有数据
        return pd.concat(final_dfs, ignore_index=True), True, "Success"
    
    except Exception as e:
        return None, False, str(e)

def main():
    start_time = datetime.now()
    
    # 创建一个空的列表来存储所有文件的数据
    processing_log = []
    
    # 获取文件夹中的所有有效.xls文件
    xls_files = [f for f in Path(folder_path).glob('*.xls') if is_valid_file(f)]
    total_files = len(xls_files)
    
    print(f"Starting to process {total_files} files...")
    
    # 首先创建CSV文件并写入头部
    first_file = True
    
    # 使用tqdm创建进度条
    for file_path in tqdm(xls_files, desc="Processing files", unit="file"):
        processing_time = CURRENT_TIME
        
        result_df, success, message = process_single_file(str(file_path))
        
        # 记录处理信息
        log_entry = {
            'File_Name': file_path.name,
            'Processing_Time': processing_time,
            'Status': 'Success' if success else 'Failed',
            'Message': message,
            'File_Size_KB': round(file_path.stat().st_size / 1024, 2),
            'Processed_By': CURRENT_USER
        }
        processing_log.append(log_entry)
        
        if success and result_df is not None:
            # 将处理后的数据直接写入CSV文件
            result_df.to_csv(output_file, 
                           mode='w' if first_file else 'a',
                           header=first_file,
                           index=False)
            first_file = False
    
    # 创建处理日志Excel文件
    create_processing_log(processing_log)
    
    # 计算总处理时间
    end_time = datetime.now()
    processing_duration = end_time - start_time
    
    # 打印处理统计信息
    success_count = sum(1 for log in processing_log if log['Status'] == 'Success')
    print("\nProcessing Summary:")
    print(f"Total files: {total_files}")
    print(f"Successfully processed: {success_count}")
    print(f"Failed: {total_files - success_count}")
    print(f"Total processing time: {processing_duration}")
    print(f"Average time per file: {processing_duration/total_files}")
    
    # 显示文件大小信息
    if os.path.exists(output_file):
        file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
        print(f"\nOutput file size: {file_size_mb:.2f} MB")
        print(f"Results have been saved to: {output_file}")
        print("\nTo read this CSV file later, you can use:")
        print("df = pd.read_csv('combined_results.csv')")

if __name__ == "__main__":
    main()

Starting to process 0 files...


Processing files: 0file [00:00, ?file/s]


KeyError: 'Processing_Time'

In [4]:
import pandas as pd
import os
from pathlib import Path
from datetime import datetime
from tqdm import tqdm

# 定义常量
CURRENT_TIME = "2025-02-28 21:48:27"
CURRENT_USER = "AdaJSY"

# 定义文件路径
folder_path = './NamedReports'
log_file = './processing_log.xlsx'
output_file = './combined_results_failed.csv'
failed_log_file = './failed_processing_log.xlsx'

def get_failed_files():
    """从处理日志中获取失败的文件列表"""
    if not os.path.exists(log_file):
        print("处理日志文件不存在！")
        return []
    
    log_df = pd.read_excel(log_file)
    failed_files = log_df[log_df['Status'] == 'Failed']['File_Name'].tolist()
    return failed_files

def process_single_file(file_path):
    try:
        # 读取 HTML 文件中的所有表格
        data_to_extract_html = pd.read_html(file_path)
        
        # 检查是否有足够的表格
        if len(data_to_extract_html) < 6:
            return None, False, f"文件包含的表格数量不足: {len(data_to_extract_html)}"
            
        try:
            site_id = extract_site_id(data_to_extract_html[0])
        except Exception as e:
            return None, False, f"提取Site ID失败: {str(e)}"
            
        try:
            # 获取第5个表格
            original_df = data_to_extract_html[5]
        except Exception as e:
            return None, False, f"获取第5个表格失败: {str(e)}"
        
        # 获取列的多级索引值
        level0_values = original_df.columns.get_level_values(0)
        level1_values = original_df.columns.get_level_values(1)
        level2_values = original_df.columns.get_level_values(2)
        
        # 创建新的DataFrame
        new_data = [level0_values, level1_values, level2_values]
        new_data.extend(original_df.values)
        df = pd.DataFrame(new_data)
        
        # 检查第一列，找出以 "All" 开头且以 "bound" 结尾的行的行索引
        all_bound_rows_indices = df[df.iloc[:, 0].str.startswith('All', na=False) & 
                                  df.iloc[:, 0].str.endswith('bound', na=False)].index
        
        if len(all_bound_rows_indices) == 0:
            return None, False, "没有找到bound行"
            
        # 提取bound_texts
        bound_texts = df.iloc[all_bound_rows_indices, 0].values
        
        # 拆分表格
        dfs = {}
        for i in range(len(all_bound_rows_indices) - 1):
            start_idx = all_bound_rows_indices[i]
            end_idx = all_bound_rows_indices[i + 1]
            df_section = df.iloc[start_idx:end_idx].reset_index(drop=True)
            dfs[bound_texts[i]] = df_section
        
        # 处理最后一个部分
        df_section_last = df.iloc[all_bound_rows_indices[-1]:].reset_index(drop=True)
        dfs[bound_texts[-1]] = df_section_last
        
        # 提取时间行及其上一行
        for name, table in dfs.items():
            table.iloc[:, 0] = table.iloc[:, 0].fillna('')
            time_rows = table[table.iloc[:, 0].str.match(r'^\d{2}:\d{2}:\d{2}$')].index
            
            if len(time_rows) == 0:
                return None, False, f"在 {name} 中没有找到时间行"
                
            if min(time_rows) > 0:
                prev_row = [min(time_rows) - 1]
            else:
                prev_row = []
            
            rows_to_keep = sorted(list(set(time_rows) | set(prev_row)))
            updated_table = table.iloc[rows_to_keep].reset_index(drop=True)
            updated_table.iloc[0, 0] = "Time"
            dfs[name] = updated_table
        
        # 删除特定列
        for name, table in dfs.items():
            columns_to_drop = []
            for col in table.columns:
                if table[col].iloc[0] in ["Workday", "7 Day", "Count"]:
                    columns_to_drop.append(col)
            if columns_to_drop:
                table = table.drop(columns=columns_to_drop)
            dfs[name] = table
        
        # 转换为长格式
        final_dfs = []
        for name, table in dfs.items():
            try:
                table.columns = table.iloc[0]
                table = table.iloc[1:]
                table = table.reset_index(drop=True)
                
                time_column = table.columns[0]
                long_format_table = pd.melt(table,
                                          id_vars=[time_column],
                                          var_name='Date',
                                          value_name='MeltedValue')
                
                # 添加标识列
                long_format_table.insert(0, 'Bound_Category', name)
                long_format_table.insert(0, 'Site_ID', site_id)
                
                final_dfs.append(long_format_table)
            except Exception as e:
                return None, False, f"处理表格 {name} 时发生错误: {str(e)}"
        
        if not final_dfs:
            return None, False, "没有成功处理的表格"
            
        # 合并该文件的所有数据
        return pd.concat(final_dfs, ignore_index=True), True, "Success"
    
    except Exception as e:
        detailed_error = f"处理文件时发生错误: {str(e)}"
        print(f"Error processing {file_path}: {detailed_error}")
        return None, False, detailed_error

def extract_site_id(df):
    """提取Site ID，增加错误处理"""
    try:
        # 尝试多种可能的位置
        possible_positions = [(1, 1), (1, 0), (0, 1)]
        for row, col in possible_positions:
            try:
                value = df.iloc[row, col]
                if isinstance(value, str) and value.isdigit():
                    return value
            except:
                continue
        
        # 如果上述方法都失败，尝试扫描整个DataFrame
        for i in range(min(5, len(df))):
            for j in range(min(5, len(df.columns))):
                value = str(df.iloc[i, j])
                if value.isdigit() and len(value) > 8:  # Site ID通常较长
                    return value
        
        raise ValueError("无法找到有效的Site ID")
    except Exception as e:
        raise ValueError(f"提取Site ID时出错: {str(e)}")

def main():
    start_time = datetime.now()
    
    # 获取失败的文件列表
    failed_files = get_failed_files()
    if not failed_files:
        print("没有找到失败的文件记录")
        return
    
    print(f"Found {len(failed_files)} failed files to reprocess...")
    
    # 创建新的处理日志
    processing_log = []
    
    # 设置CSV文件的追加模式
    append_mode = os.path.exists(output_file)
    
    # 处理失败的文件
    for file_name in tqdm(failed_files, desc="Reprocessing failed files"):
        file_path = Path(folder_path) / file_name
        
        if not file_path.exists():
            print(f"文件不存在: {file_path}")
            continue
            
        processing_time = CURRENT_TIME
        
        result_df, success, message = process_single_file(str(file_path))
        
        # 记录处理信息
        log_entry = {
            'File_Name': file_name,
            'Processing_Time': processing_time,
            'Status': 'Success' if success else 'Failed',
            'Message': message,
            'File_Size_KB': round(file_path.stat().st_size / 1024, 2),
            'Processed_By': CURRENT_USER,
            'Reprocess_Attempt': True
        }
        processing_log.append(log_entry)
        
        if success and result_df is not None:
            # 将处理后的数据追加到现有CSV文件
            result_df.to_csv(output_file, 
                           mode='a',
                           header=not append_mode,
                           index=False)
            append_mode = True
    
    # 保存新的处理日志
    if processing_log:
        pd.DataFrame(processing_log).to_excel(failed_log_file, index=False)
    
    # 计算处理时间
    end_time = datetime.now()
    processing_duration = end_time - start_time
    
    # 打印统计信息
    success_count = sum(1 for log in processing_log if log['Status'] == 'Success')
    print("\nReprocessing Summary:")
    print(f"Total failed files attempted: {len(failed_files)}")
    print(f"Successfully reprocessed: {success_count}")
    print(f"Failed again: {len(failed_files) - success_count}")
    print(f"Total reprocessing time: {processing_duration}")
    
    # 显示文件大小信息
    if os.path.exists(output_file):
        file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
        print(f"\nUpdated output file size: {file_size_mb:.2f} MB")

if __name__ == "__main__":
    main()

Found 849 failed files to reprocess...


Reprocessing failed files:   0%|          | 3/849 [00:00<00:33, 24.93it/s]

Error processing NamedReports/000000899717_2023-06-01_2023-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679922_2024-10-01_2024-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679109_2023-09-01_2023-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215524_2023-09-01_2023-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679921_2023-03-01_2023-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   1%|          | 9/849 [00:00<00:40, 20.96it/s]

Error processing NamedReports/000001390418_2024-12-01_2024-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215468_2023-12-01_2023-12-11.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679006_2023-06-01_2023-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679006_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215498_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   1%|▏         | 12/849 [00:00<00:39, 20.96it/s]

Error processing NamedReports/000000631192_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679922_2023-07-01_2023-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001359200_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899305_2024-10-01_2024-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   2%|▏         | 15/849 [00:00<00:40, 20.71it/s]

Error processing NamedReports/000000893354_2024-07-01_2024-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000830209_2024-05-01_2024-05-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215463_2023-01-01_2023-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215474_2025-01-01_2025-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   2%|▏         | 18/849 [00:00<00:46, 18.04it/s]

Error processing NamedReports/000001359427_2023-04-01_2023-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   3%|▎         | 22/849 [00:01<00:44, 18.70it/s]

Error processing NamedReports/000001215524_2024-02-01_2024-02-27.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2023-08-01_2023-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215463_2023-06-01_2023-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001359528_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   3%|▎         | 26/849 [00:01<00:45, 18.06it/s]

Error processing NamedReports/000000970107_2023-08-01_2023-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000002650163_2025-01-01_2025-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000893332_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001910187_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899012_2023-12-01_2023-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   3%|▎         | 29/849 [00:01<00:42, 19.09it/s]

Error processing NamedReports/000001210795_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2024-07-01_2024-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679006_2023-08-01_2023-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   4%|▍         | 32/849 [00:01<00:38, 21.13it/s]

Error processing NamedReports/000000679921_2024-02-01_2024-02-29.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001270312_2024-08-01_2024-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   4%|▍         | 35/849 [00:01<00:38, 21.00it/s]

Error processing NamedReports/000000519818_2024-01-01_2024-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679921_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000510390_2023-02-23_2023-02-28.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   4%|▍         | 38/849 [00:01<00:36, 22.31it/s]

Error processing NamedReports/000001359200_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219178_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001210458_2024-12-01_2024-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001750254_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000893332_2024-10-01_2024-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   5%|▍         | 41/849 [00:02<00:39, 20.71it/s]

Error processing NamedReports/000000679921_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899012_2023-07-01_2023-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   5%|▌         | 44/849 [00:02<00:36, 22.03it/s]

Error processing NamedReports/000000899305_2023-08-01_2023-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679109_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679006_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001350298_2024-07-02_2024-07-25.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   6%|▌         | 47/849 [00:02<00:36, 22.19it/s]

Error processing NamedReports/000001215969_2024-06-01_2024-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   6%|▌         | 50/849 [00:02<00:40, 19.68it/s]

Error processing NamedReports/000001210516_2023-09-01_2023-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2025-01-01_2025-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215468_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   6%|▌         | 53/849 [00:02<00:39, 20.08it/s]

Error processing NamedReports/000000519818_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519816_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   7%|▋         | 56/849 [00:02<00:39, 20.28it/s]

Error processing NamedReports/000001219420_2024-01-01_2024-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679921_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001270312_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001359200_2024-09-01_2024-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679109_2023-12-01_2023-12-16.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   7%|▋         | 59/849 [00:02<00:38, 20.79it/s]

Error processing NamedReports/000001510412_2024-06-01_2024-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000893354_2024-11-01_2024-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   7%|▋         | 62/849 [00:03<00:41, 18.87it/s]

Error processing NamedReports/000001215546_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000970107_2023-02-01_2023-02-28.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000893058_2024-09-01_2024-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   8%|▊         | 66/849 [00:03<00:44, 17.49it/s]

Error processing NamedReports/000002650163_2023-12-01_2023-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519816_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215524_2024-11-01_2024-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899105_2023-05-01_2023-05-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519818_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   8%|▊         | 69/849 [00:03<00:41, 18.58it/s]

Error processing NamedReports/000001215474_2023-12-01_2023-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001210458_2023-12-01_2023-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   8%|▊         | 71/849 [00:03<00:44, 17.62it/s]

Error processing NamedReports/000001215524_2024-03-02_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215419_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   9%|▊         | 73/849 [00:03<00:45, 16.93it/s]

Error processing NamedReports/000001215969_2024-11-01_2024-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   9%|▉         | 75/849 [00:03<00:48, 16.01it/s]

Error processing NamedReports/000001215969_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001510412_2025-02-01_2025-02-19.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   9%|▉         | 77/849 [00:04<00:48, 15.85it/s]

Error processing NamedReports/000001750254_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001910187_2023-12-01_2023-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:   9%|▉         | 79/849 [00:04<00:46, 16.64it/s]

Error processing NamedReports/000001359427_2024-12-01_2024-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001210795_2023-04-01_2023-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001210458_2024-10-01_2024-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  10%|▉         | 81/849 [00:04<00:47, 16.23it/s]

Error processing NamedReports/000001215474_2023-09-01_2023-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  10%|▉         | 84/849 [00:04<00:42, 18.20it/s]

Error processing NamedReports/000000899012_2024-11-01_2024-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000893383_2023-02-01_2023-02-24.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219804_2024-07-01_2024-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215524_2024-12-01_2024-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  10%|█         | 86/849 [00:04<00:41, 18.30it/s]

Error processing NamedReports/000000899305_2023-07-01_2023-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  10%|█         | 89/849 [00:04<00:35, 21.17it/s]

Error processing NamedReports/000001530332_2023-11-01_2023-11-30.xls: 处理文件时发生错误: no text parsed from document (line 0)
Error processing NamedReports/000001359427_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001390418_2024-03-01_2024-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001870067_2023-09-01_2023-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001359427_2024-01-01_2024-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  11%|█         | 95/849 [00:04<00:33, 22.53it/s]

Error processing NamedReports/000001510412_2024-08-01_2024-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000893332_2024-01-01_2024-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219438_2024-01-01_2024-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001359200_2024-07-01_2024-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899907_2025-02-01_2025-02-19.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519818_2024-08-01_2024-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  12%|█▏        | 101/849 [00:05<00:35, 21.03it/s]

Error processing NamedReports/000001210516_2023-03-01_2023-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000830209_2024-02-01_2024-02-29.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001510412_2024-11-01_2024-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2024-09-01_2024-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679109_2024-04-01_2024-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  13%|█▎        | 107/849 [00:05<00:35, 21.18it/s]

Error processing NamedReports/000001215498_2024-02-01_2024-02-29.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2023-10-01_2023-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899012_2023-04-01_2023-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215474_2024-05-01_2024-05-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001210458_2023-02-02_2023-02-28.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  13%|█▎        | 113/849 [00:05<00:32, 22.42it/s]

Error processing NamedReports/000001219912_2023-01-01_2023-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000510390_2024-06-01_2024-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679109_2023-08-01_2023-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2024-06-01_2024-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219804_2023-11-01_2023-11-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679921_2024-09-01_2024-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  14%|█▎        | 116/849 [00:05<00:33, 21.95it/s]

Error processing NamedReports/000000519818_2024-06-01_2024-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519215_2023-05-01_2023-05-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000631192_2023-12-01_2023-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001750254_2024-07-01_2024-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  14%|█▍        | 122/849 [00:06<00:31, 23.34it/s]

Error processing NamedReports/000000679006_2023-01-01_2023-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679109_2024-08-01_2024-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215508_2023-03-01_2023-03-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219013_2024-12-11_2024-12-31.xls: 处理文件时发生错误: no text parsed from document (line 0)
Error processing NamedReports/000000519215_2023-06-01_2023-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519215_2023-02-23_2023-02-28.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219420_2024-02-01_2024-02-29.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  15%|█▌        | 129/849 [00:06<00:29, 24.39it/s]

Error processing NamedReports/000000519215_2023-09-01_2023-09-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219438_2023-01-01_2023-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001350241_2025-02-01_2025-02-19.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000002650163_2024-10-01_2024-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001219420_2024-08-01_2024-08-13.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  16%|█▌        | 132/849 [00:06<00:32, 22.16it/s]

Error processing NamedReports/000001215419_2023-07-01_2023-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000210334_2024-07-01_2024-07-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899717_2023-01-01_2023-01-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000001215463_2023-04-01_2023-04-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000899171_2023-06-01_2023-06-30.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2


Reprocessing failed files:  16%|█▋        | 138/849 [00:06<00:34, 20.32it/s]

Error processing NamedReports/000001219912_2024-10-01_2024-10-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519215_2024-08-01_2024-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000679921_2024-08-01_2024-08-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2
Error processing NamedReports/000000519816_2024-12-01_2024-12-31.xls: 处理文件时发生错误: Too many levels: Index has only 1 level, not 2





KeyboardInterrupt: 

In [5]:
# 在Jupyter中查看文件内容
import pandas as pd

# 选择一个失败的文件
file_path = './NamedReports/000000899717_2023-06-01_2023-06-30.xls'

# 读取文件中的所有表格
tables = pd.read_html(file_path)

# 看看第5个表格（索引为5）的结构
print("表格数量:", len(tables))
print("\n第5个表格的列结构:")
print(tables[5].columns)
print("\n第5个表格的前几行:")
print(tables[5].head())

表格数量: 8

第5个表格的列结构:
Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')

第5个表格的前几行:
           0   1                              2   \
0  Event key: NaN                     QC Failure   
1         NaN NaN  Weekends and defined holidays   

                              3                              4   \
0                            NaN                     QC Outlier   
1  Weekends and defined holidays  Weekends and defined holidays   

                              5                              6   7   \
0                            NaN                    QC Atypical NaN   
1  Weekends and defined holidays  Weekends and defined holidays NaN   

                      8                      9                      10  11  \
0                 Events                    NaN                Special NaN   
1  Holiday-affected days  Holiday-affected days  Holiday-affected days NaN   

        12  13       14  
0  Holiday NaN  Offline  
1      NaN NaN      NaN  


In [6]:
print(tables[5])

           0   1                              2   \
0  Event key: NaN                     QC Failure   
1         NaN NaN  Weekends and defined holidays   

                              3                              4   \
0                            NaN                     QC Outlier   
1  Weekends and defined holidays  Weekends and defined holidays   

                              5                              6   7   \
0                            NaN                    QC Atypical NaN   
1  Weekends and defined holidays  Weekends and defined holidays NaN   

                      8                      9                      10  11  \
0                 Events                    NaN                Special NaN   
1  Holiday-affected days  Holiday-affected days  Holiday-affected days NaN   

        12  13       14  
0  Holiday NaN  Offline  
1      NaN NaN      NaN  
