In [1]:
import re
import csv
import os
import glob

# Directory that holds the training logs; adjust to match your environment.
log_dir = '/home/chenyupeng/yupeng/jaggi-lr/wandb2csv/logs_wsd'

# Diagnostics. os.listdir() raises if the path is missing or is not a
# directory, so the contents are only listed once the directory is confirmed.
log_dir_exists = os.path.isdir(log_dir)
print("Checking directory:", log_dir)
print("Directory exists?", log_dir_exists)
print("Files in directory:", os.listdir(log_dir) if log_dir_exists else [])

# Record of log files that were already converted, one base name per line.
# It may not exist yet; in that case nothing has been processed so far.
processed_record_file = os.path.join(log_dir, 'processed_logs.txt')

# Load the names of the already-processed logs, ignoring blank lines.
processed_files = set()
if os.path.exists(processed_record_file):
    with open(processed_record_file, 'r', encoding='utf-8') as f:
        processed_files = {entry.strip() for entry in f if entry.strip()}

# Collect every *.log file in the directory. glob returns matches in
# arbitrary order, so sort them to make the processing order reproducible.
# A missing directory simply yields an empty list.
log_files = sorted(glob.glob(os.path.join(log_dir, '*.log')))
print("Matched log files:", log_files)

# Regular expressions used to parse the log lines.

# Header line carrying the experiment name (group 1 = name).
experiment_pattern = r'^Starting Experiment:\s*(.+)$'

# One numeric field: optional sign, decimal or scientific notation, or a
# nan/inf token. A diverged run logs such values, and a plain digits-and-dots
# class would silently drop those evaluation lines. Only non-capturing groups
# are used here so the group numbering of eval_pattern stays 1..5.
_float_re = r'[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|(?i:nan|inf(?:inity)?))'

# Evaluation line. Groups: 1 = Iter, 2 = epochs, 3 = val_loss, 4 = val_pp,
# 5 = val_acc.
eval_pattern = ''.join([
    r'^>Eval:\s*Iter=(\d+)',
    r'\s*\((', _float_re, r')\s*epochs\)',
    r'\s*val_loss=(', _float_re, r')',
    r'\s*val_pp=(', _float_re, r')',
    r'\s*val_acc=(', _float_re, r')',
])


for log_file_path in log_files:
    log_file_name = os.path.basename(log_file_path)

    # Logs already listed in the processed record are not converted again.
    if log_file_name in processed_files:
        print(f"文件 {log_file_name} 已处理过，跳过。")
        continue

    # Per-file state: the experiment name and the parsed evaluation rows.
    csv_columns = ['Iter', 'Epoch', 'val_loss', 'val_pp', 'val_acc']
    experiment_name = ""
    eval_data = []

    # Scan the log line by line.
    with open(log_file_path, 'r', encoding='utf-8') as log_handle:
        for raw_line in log_handle:
            text = raw_line.strip()

            # Only the first experiment header is used; once a name has been
            # captured, later header lines are no longer looked for.
            if not experiment_name:
                header = re.match(experiment_pattern, text)
                if header is not None:
                    experiment_name = header.group(1)
                    continue

            # Anything that is not an evaluation line is ignored.
            hit = re.match(eval_pattern, text)
            if hit is None:
                continue

            # Pair the five captured groups with the CSV column names.
            eval_data.append(dict(zip(csv_columns, hit.group(1, 2, 3, 4, 5))))

    # A log without an experiment name is left unrecorded, so it will be
    # examined again on the next run.
    if not experiment_name:
        print(f"未能在日志文件 {log_file_name} 中找到实验名称。")
        continue

    # Likewise, a log without evaluation rows is not marked as processed.
    if not eval_data:
        print(f"未能在日志文件 {log_file_name} 中找到评估数据。")
        continue

    # Replace characters that are unsafe in file names with underscores.
    safe_experiment_name = re.sub(r'[\\/:"*?<>|]+', '_', experiment_name)

    # The CSV is written next to the logs, named after the experiment.
    csv_file_name = f"{safe_experiment_name}_test.csv"
    csv_file_path = os.path.join(log_dir, csv_file_name)

    # Write the header followed by all evaluation rows.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(eval_data)

    print(f"CSV 文件已成功生成：{csv_file_path}")

    # Append the log's base name to the processed record so that later runs
    # skip it.
    with open(processed_record_file, 'a', encoding='utf-8') as record:
        record.write(log_file_name + '\n')


Checking directory: /home/chenyupeng/yupeng/jaggi-lr/wandb2csv/logs_wsd
Directory exists? True
Files in directory: ['slimpajama_llama_nlayers8_nhead6_lr0.002_sched_wsd_warmup0_decay_linear_0.0_iter15000_bs50x4_ws1_seed0_data_seed1337.log', 'slimpajama_llama_nlayers8_nhead6_lr0.002_sched_wsd_warmup0_decay_linear_0.4_iter15000_bs50x4_ws1_seed0_data_seed1337.log', '.ipynb_checkpoints', 'slimpajama_llama_nlayers8_nhead6_lr0.002_sched_cos_warmup1500_decay_linear_0.1_iter15000_bs50x4_ws1_seed0_data_seed1337.log', 'slimpajama_llama_nlayers8_nhead6_lr0.002_sched_cos_warmup1500_decay_linear_0.1_iter15000_bs50x4_ws1_seed0_data_seed1337_test.csv', 'slimpajama_llama_nlayers8_nhead6_lr0.002_sched_wsd_warmup0_decay_linear_0.1_iter15000_bs50x4_ws1_finalrfactor0.1_seed0_data_seed1337.log', 'slimpajama_llama_nlayers8_nhead6_lr0.002_sched_wsd_warmup0_decay_linear_0.1_iter15000_bs50x4_ws1_seed0_data_seed1337.log', 'slimpajama_llama_nlayers8_nhead6_lr0.002_sched_wsd_warmup1500_decay_linear_0.0_iter15000_b