In [None]:
import os

def count_files_in_directory(directory_path):
    """
    Count the regular files located directly inside a directory.

    Subdirectories are neither descended into nor counted.

    :param directory_path: path of the directory to inspect
    :return: number of entries for which ``os.path.isfile`` is true; 0 (after
             printing a message) when the directory does not exist or cannot
             be read
    """
    # Fetch the entry names first; this is the only step that can raise the
    # two errors handled here.
    try:
        names = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"目录 {directory_path} 不存在")
        return 0
    except PermissionError:
        print(f"没有权限访问目录 {directory_path}")
        return 0

    # Tally only the entries that are files.
    total = 0
    for name in names:
        if os.path.isfile(os.path.join(directory_path, name)):
            total += 1
    return total

# Example usage: count the files directly under the Top500 dataset folder
directory_path = 'dataset/Top500'
file_count = count_files_in_directory(directory_path)
print(f"目录 {directory_path} 下的文件数量: {file_count}")

In [None]:
# test 获取数据
import os
import re
import csv

# Log files to summarise.  Earlier experiments used three-part names that also
# encode seq_len:
# log_files = [
#     'logs/test_patchtst_28_28_28.log',
#     'logs/test_patchtst_56_28_28.log',
#     'logs/test_patchtst_56_56_56.log',
#     'logs/test_patchtst_84_56_56.log',
#     'logs/test_patchtst_84_84_84.log'
# ]

log_files = [
    'logs/test_patchtst_84_84.log'
]

# Captures the dataset name from a "Data Path:" log line.  The file-name suffix
# after the capture group has to match the data files used for the run.
# Suffixes used by earlier runs: _all.csv, _all_roll.csv, _all_ewma_span20.csv,
# _all_ewma_span28.csv, _all_roll_ewma_span20.csv
# The dot is escaped so that only a literal ".csv" is accepted.
dataset_pattern = re.compile(r'Data Path:\s*(.+?)_all_roll_ewma_span28\.csv')


# Captures the MSE and MAE values from a result line.
metrics_pattern = re.compile(r'mse:\s*(nan|[0-9.e+-]+),\s*mae:\s*(nan|[0-9.e+-]+),\s*dtw:-999')

# Captures label_len and pred_len from a two-part log file name (e.g. 84_84).
# Three-part variant (seq_len, label_len, pred_len):
# config_pattern = re.compile(r'test_patchtst_(\d+)_(\d+)_(\d+)\.log')
config_pattern = re.compile(r'test_patchtst_(\d+)_(\d+)\.log')

# Process the log files one by one.
for log_file_path in log_files:
    config_match = config_pattern.search(log_file_path)
    if not config_match:
        print(f"Warning: Could not extract configuration from {log_file_path}.")
        continue

    label_len, pred_len = config_match.groups()

    # One output file per configuration.  The file is opened in 'w' mode, so a
    # path shared by several log files would keep only the last one.
    # Adjust the directory / prefix here when switching experiments.
    csv_output_path = (
        f'workresult/10dataset_roll_ewma/test_Result_span28_{label_len}_{pred_len}.csv'
    )

    # Name of the dataset whose metrics line has not been seen yet.
    dataset_name = None

    # Open the log first: if it is missing, an existing CSV is not truncated.
    with open(log_file_path, 'r') as log_file:
        with open(csv_output_path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            # The header must list exactly the fields written per row below.
            writer.writerow(['dataset', 'label_len', 'pred_len', 'mse', 'mae'])

            for line in log_file:
                dataset_match = dataset_pattern.search(line)
                if dataset_match:
                    dataset_name = dataset_match.group(1)

                metrics_match = metrics_pattern.search(line)
                if not metrics_match:
                    continue

                mse, mae = metrics_match.groups()
                # Round to three decimals; float('nan') formats as 'nan'.
                mse = f"{float(mse):.3f}"
                mae = f"{float(mae):.3f}"

                if dataset_name:
                    writer.writerow([dataset_name, label_len, pred_len, mse, mae])
                    # Each dataset name is paired with one metrics line only.
                    dataset_name = None
                else:
                    print("Warning: Found metrics without a dataset name.")

    print(f"Summary for {label_len}_{pred_len} saved to {csv_output_path}")

Summary for 84_84 saved to workresult/10dataset_roll_ewma/test_Result_span28_84_84.csv


In [10]:
# 从多个日志文件中提取出mse信息
import csv
import re

# Repositories whose logs ('logs/<name>_patchtst_84_84.log') are summarised.
repoName = ['vue', 'tensorflow', 'autogpt', 'kubernetes', 'terminal',
            'flutter', 'vscode', 'react-naive', 'electron', 'transformers']

# Captures the dataset name from a "Data Path:" log line.
# Quartiles variant of the suffix: _all_roll_ewma_span28_normalize_Quartiles.csv
# The dot is escaped; the previous "csv*" only made the final "v" optional.
dataset_pattern = re.compile(r'Data Path:\s*(.+?)_all_roll_ewma_span28_normalize\.csv')

# Captures the MSE and MAE values from a result line.
metrics_pattern = re.compile(r'mse:\s*(nan|[0-9.e+-]+),\s*mae:\s*(nan|[0-9.e+-]+),\s*dtw:-999')

# All repositories append to the same CSV, so the path and the header check do
# not depend on the loop variable and are resolved once.
# Quartiles variant:
# csv_output_path = f'workresult/10dataset_roll_ewma_normalize/Result_span28_normalize_quartiles.csv'
csv_output_path = f'workresult/10dataset_roll_ewma_normalize/Result_span28_normalize.csv'

# A header is needed only when the file is missing or contains only whitespace.
try:
    with open(csv_output_path, 'r', newline='') as csv_file:
        file_is_empty = csv_file.read().strip() == ''
except FileNotFoundError:
    file_is_empty = True

# Append mode: rows from earlier runs are kept (re-running adds duplicates).
with open(csv_output_path, 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    if file_is_empty:
        writer.writerow(['dataset', 'mse', 'mae'])

    for name in repoName:
        log_files = f'logs/{name}_patchtst_84_84.log'

        # Name of the dataset whose metrics line has not been seen yet.
        dataset_name = None

        with open(log_files, 'r') as log_file:
            for line in log_file:
                dataset_match = dataset_pattern.search(line)
                if dataset_match:
                    dataset_name = dataset_match.group(1)

                metrics_match = metrics_pattern.search(line)
                if not metrics_match:
                    continue

                mse, mae = metrics_match.groups()
                # Round to three decimals; float('nan') formats as 'nan'.
                mse = f"{float(mse):.3f}"
                mae = f"{float(mae):.3f}"

                if dataset_name:
                    writer.writerow([dataset_name, mse, mae])
                    # Each dataset name is paired with one metrics line only.
                    dataset_name = None
                else:
                    print("Warning: Found metrics without a dataset name.")

        print("Summary")

Summary
Summary
Summary
Summary
Summary
Summary
Summary
Summary
Summary
Summary


In [1]:
# 文件夹结果统计
import csv
import re
import os

# Captures the MSE and MAE values from a result line.
metrics_pattern = re.compile(r'mse:\s*(nan|[0-9.e+-]+),\s*mae:\s*(nan|[0-9.e+-]+),\s*dtw:-999')

# Captures the dataset name from a "Data Path:" log line.  The dot is escaped;
# the previous "csv*" only made the final "v" optional.
dataset_pattern = re.compile(r'Data Path:\s*(.+?)_all_roll_ewma_span28_normalize\.csv')

# Log files to summarise, selected by file-name prefix.  os.listdir returns
# names in arbitrary order, so they are sorted to make the row order of the
# CSV reproducible.
# Other prefixes used: 'Top', 'Bottom'
log_files = sorted(f for f in os.listdir('logs') if f.startswith('language'))

# CSV output path.
# Other targets used:
# csv_output_path = 'workresult/Top100_random_roll_ewma_normalize/Result.csv'
# csv_output_path = 'workresult/Bottom100_random_roll_ewma_normalize/Result.csv'
csv_output_path = 'workresult/language/Result.csv'

# A header is needed only when the file is missing or contains only whitespace.
try:
    with open(csv_output_path, 'r', newline='') as csv_file:
        file_is_empty = csv_file.read().strip() == ''
except FileNotFoundError:
    file_is_empty = True

# Append mode: rows from earlier runs are kept (re-running adds duplicates).
with open(csv_output_path, 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    if file_is_empty:
        writer.writerow(['dataset', 'mse', 'mae'])

    for log_file_name in log_files:
        log_file_path = os.path.join('logs', log_file_name)

        # Name of the dataset whose metrics line has not been seen yet.
        dataset_name = None

        with open(log_file_path, 'r') as log_file:
            for line in log_file:
                dataset_match = dataset_pattern.search(line)
                if dataset_match:
                    dataset_name = dataset_match.group(1)

                metrics_match = metrics_pattern.search(line)
                if not metrics_match:
                    continue

                mse, mae = metrics_match.groups()
                # Round to three decimals; a literal 'nan' is passed through.
                mse = f"{float(mse):.3f}" if mse != 'nan' else mse
                mae = f"{float(mae):.3f}" if mae != 'nan' else mae

                if dataset_name:
                    writer.writerow([dataset_name, mse, mae])
                    # Each dataset name is paired with one metrics line only.
                    dataset_name = None
                else:
                    print("Warning: Found metrics without a dataset name.")

print("Summary done")

Summary done


In [1]:
import pandas as pd

# Load the raw result table.
file_path = 'workresult/language/Result.csv'
df = pd.read_csv(file_path)

# Boolean mask of the rows that have a missing value in at least one column.
has_nan = df.isnull().any(axis=1)
nan_rows = df.loc[has_nan]

# Record which datasets are about to be removed, in a CSV of their own.
deleted_datasets = nan_rows['dataset']
deleted_datasets.to_csv(
    'workresult/language/Deleted_Datasets.csv',
    index=False,
    header=['dataset'],
)

# Keep only the complete rows and save them.
df_cleaned = df.loc[~has_nan]
df_cleaned.to_csv('workresult/language/Cleaned_Result.csv', index=False)

In [5]:
# 验证哪个参数配置效果好
import os
import pandas as pd

def calculate_mse_mean(directory, name_filter='span28_84_84'):
    """
    Compute the mean of the 'mse' column for the result CSV files in a directory.

    :param directory: directory that contains the result CSV files
    :param name_filter: substring a file name must contain to be included
                        (e.g. 'adjustlr'); pass None to include every '.csv'
                        file.  Defaults to 'span28_84_84'.
    :return: dict mapping file name -> mean of its 'mse' column, in sorted
             file-name order.  Files without an 'mse' column, and files that
             fail to load, are reported with print() and left out.
    """
    # Sorted so that the result (and anything printed from it) does not depend
    # on the arbitrary order returned by os.listdir.
    csv_files = sorted(
        f for f in os.listdir(directory)
        if f.endswith('.csv') and (name_filter is None or name_filter in f)
    )

    # Mean MSE per file.
    mse_means = {}

    for file in csv_files:
        file_path = os.path.join(directory, file)

        try:
            df = pd.read_csv(file_path)

            if 'mse' in df.columns:
                mse_means[file] = df['mse'].mean()
            else:
                print(f"文件 {file} 中没有 mse 列")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"处理文件 {file} 时出错: {e}")

    return mse_means

# Directory holding the result CSV files to compare.
directory = 'workresult/10dataset_roll_ewma'

# Mean MSE of every matching file.
mse_means = calculate_mse_mean(directory)

# Report the per-file means.
for file in mse_means:
    mean = mse_means[file]
    print(f"文件 {file} 的 mse 均值: {mean}")

# Report the average of the per-file means, if any file matched.
if not mse_means:
    print("没有找到符合条件的文件")
else:
    overall_mse_mean = sum(mse_means.values()) / len(mse_means)
    print(f"所有文件的 mse 均值的均值: {overall_mse_mean}")

文件 test_Result_span28_84_84_84_localZscore.csv 的 mse 均值: 1.6369
文件 test_Result_span28_84_84.csv 的 mse 均值: 0.4318000000000001
所有文件的 mse 均值的均值: 1.03435


In [None]:
import re

# Sample "Data Path:" line used to check the dataset-name pattern by hand.
text = "  Data Path:          996.ICU_all_ewma_span20.csvFeatures:           M    "
dataset_pattern = re.compile(r'Data Path:\s*(.+?)_all_ewma_span\d+\.csv')

match = dataset_pattern.search(text)
if match is None:
    print("未匹配到任何内容")
else:
    print("匹配到的内容:", match.group(1))


In [None]:
import os
import re
import csv

# Path of the result CSV file to check
your_file_path = 'workresult/Top500_all_ewma_Result.csv'
# Path of the log file to compare it against
log_file_path = 'logs/Top500_all_ewma_patchTST_48_48.log'

# Regex that captures the dataset name from a "Data Path:" log line
dataset_pattern = re.compile(r'Data Path:\s*(.+?)_all_ewma_span\d+\.csv')

# Read the 'dataset' column of a result CSV file
def read_datasets_from_file(file_path):
    """
    Return the set of non-empty values found in the 'dataset' column.

    :param file_path: path of a UTF-8 CSV file with a header row
    :return: set of dataset names; rows whose 'dataset' value is missing or
             empty are skipped
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        names = (record.get('dataset') for record in csv.DictReader(csvfile))
        return {name for name in names if name}

# Extract the dataset names mentioned in a log file
def extract_datasets_from_log(file_path, pattern=None):
    """
    Return the set of dataset names captured from the lines of a log file.

    :param file_path: path of a UTF-8 log file
    :param pattern: compiled regex whose first group captures the dataset
                    name.  When omitted, the module-level ``dataset_pattern``
                    is used; pass it explicitly to avoid depending on whichever
                    notebook cell last rebound that global.
    :return: set of the distinct names captured by group 1
    """
    if pattern is None:
        pattern = dataset_pattern

    datasets = set()
    with open(file_path, 'r', encoding='utf-8') as log_file:
        for line in log_file:
            line = line.strip()
            if not line:
                continue
            dataset_match = pattern.search(line)
            if dataset_match:
                datasets.add(dataset_match.group(1))
    return datasets

# Dataset names present in the result CSV and in the log, respectively.
your_datasets = read_datasets_from_file(your_file_path)
log_datasets = extract_datasets_from_log(log_file_path)

# Names that appear in the log but have no row in the result CSV.
missing_datasets = log_datasets.difference(your_datasets)

# Report the outcome.
if not missing_datasets:
    print("您的文件中包含了所有的 dataset。")
else:
    print("以下 dataset 在您的文件中缺失：")
    for dataset in missing_datasets:
        print(dataset)

# 可选：将缺少的 dataset 名称保存到一个新的文件中
# output_file_path = 'missing_datasets.txt'
# with open(output_file_path, 'w', encoding='utf-8') as output_file:
#     for dataset in missing_datasets:
#         output_file.write(f"{dataset}\n")

# print(f"缺少的 dataset 名称已保存到 {output_file_path}")