In [1]:
import re
import pandas as pd

# 定义正则表达式模式
train_pattern = re.compile(
    r"Train: Iter=(\d+) \(([\d\.]+) epochs\) train_loss=([\d\.]+) iter_dt=([\d\.e\-]+)s lr=([\d\.e\-]+)"
)
eval_pattern = re.compile(
    r">Eval: Iter=(\d+) \(([\d\.]+) epochs\) val_loss=([\d\.]+) val_pp=([\d\.]+) val_acc=([\d\.]+)"
)

# 初始化空列表存储数据
train_data = []
eval_data = []


In [4]:
# 读取 output.log 文件
with open('/home/chenyupeng/yupeng/jaggi-lr/wandb2csv/output.log', 'r') as f:
    for line in f:
        # 匹配训练数据行
        train_match = train_pattern.match(line)
        if train_match:
            iter_num = int(train_match.group(1))
            epochs = float(train_match.group(2))
            train_loss = float(train_match.group(3))
            iter_dt = float(train_match.group(4))
            lr = float(train_match.group(5))
            train_data.append({
                'iter': iter_num,
                'epochs': epochs,
                'train_loss': train_loss,
                'iter_dt': iter_dt,
                'lr': lr
            })
            continue

        # 匹配验证数据行
        eval_match = eval_pattern.match(line)
        if eval_match:
            iter_num = int(eval_match.group(1))
            epochs = float(eval_match.group(2))
            val_loss = float(eval_match.group(3))
            val_pp = float(eval_match.group(4))
            val_acc = float(eval_match.group(5))
            eval_data.append({
                'iter': iter_num,
                'epochs': epochs,
                'val_loss': val_loss,
                'val_pp': val_pp,
                'val_acc': val_acc
            })

In [5]:


# 将数据转换为 DataFrame
train_df = pd.DataFrame(train_data)
eval_df = pd.DataFrame(eval_data)

# 合并训练和验证数据
merged_df = pd.merge(train_df, eval_df, on=['iter', 'epochs'], how='outer')

# 按迭代次数排序
merged_df.sort_values(by='iter', inplace=True)

# 将缺失值填充为前一个有效值（如果需要）
merged_df.fillna(method='ffill', inplace=True)

# 保存为 CSV 文件
merged_df.to_csv('/home/chenyupeng/yupeng/jaggi-lr/wandb2csv/training_results.csv', index=False)
print("数据已成功提取并保存为 training_results.csv")


数据已成功提取并保存为 training_results.csv


  merged_df.fillna(method='ffill', inplace=True)
