In [27]:
import argparse
import os
import ast
import re
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 360
import seaborn as sns

In [28]:
def parse_log_file(log_path):
    """Parse a log file into a DataFrame with one row per logged dictionary.

    Every line is searched for a ``{...}`` span (from the first ``{`` to the
    last ``}`` on the line), which is evaluated with ``ast.literal_eval``.
    A line is skipped when it has no such span, when the span is not a valid
    Python literal, or when the literal is not a ``dict`` (for example a set
    such as ``{1, 2}``).

    Parameters
    ----------
    log_path : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per parsed dictionary. The ``timestamp`` column holds the
        text before the first ``' - '`` separator that precedes the
        dictionary, or ``None`` when there is no such separator; the other
        columns are the dictionary's keys (a ``timestamp`` key in the
        dictionary overrides the parsed one). An empty DataFrame is returned,
        after printing a notice, when no line yields a dictionary.
    """
    dict_pattern = re.compile(r'(\{.*\})')
    rows = []

    with open(log_path, 'r') as f:
        for line in f:
            line = line.strip()
            match = dict_pattern.search(line)
            if not match:
                continue

            try:
                data_dict = ast.literal_eval(match.group(1))
            except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
                # literal_eval documents all of these for malformed input;
                # such lines are not data rows, so skip them.
                continue

            # A brace-delimited literal may be a set rather than a dict;
            # only dictionaries can become rows.
            if not isinstance(data_dict, dict):
                continue

            # Look for the separator only in the text before the dictionary,
            # so a ' - ' inside a dictionary value is never mistaken for the
            # timestamp delimiter.
            prefix = line[:match.start()]
            timestamp = prefix.split(' - ')[0] if ' - ' in prefix else None

            rows.append({'timestamp': timestamp, **data_dict})

    if not rows:
        print(f"No valid dictionary lines found in {log_path}")
        return pd.DataFrame()

    return pd.DataFrame(rows)

In [29]:
def random_subset_analysis(df, metric='r2', num_samples=5, subset_frac=0.2, random_state=None):
    """Compute the mean of ``metric`` over several random row subsets of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    metric : str, default 'r2'
        Column whose mean is computed for each subset.
    num_samples : int, default 5
        Number of random subsets to draw.
    subset_frac : float, default 0.2
        Fraction of the rows placed in each subset. The resulting size is
        truncated to an integer and clamped to the range ``1..len(df)``.
    random_state : int, optional
        Seed for a private NumPy generator that supplies the per-subset
        seeds, making the result reproducible. When ``None`` the per-subset
        seeds are drawn from NumPy's global random state instead.

    Returns
    -------
    pandas.DataFrame
        One row per subset with columns ``sample_id``, ``subset_size`` and
        ``mean_<metric>``. An empty DataFrame is returned, after printing a
        notice, when ``df`` is empty or lacks the ``metric`` column.
    """
    if df.empty or metric not in df.columns:
        print(f"DataFrame is empty or missing column {metric}. Can't do random subset analysis.")
        return pd.DataFrame()

    # Sampling is without replacement, so the subset can never exceed the frame.
    subset_size = min(len(df), max(1, int(subset_frac * len(df))))

    # Each subset gets its own integer seed; only the source of those seeds
    # depends on whether the caller asked for reproducibility.
    rng = None if random_state is None else np.random.default_rng(random_state)

    results = []
    for i in range(num_samples):
        if rng is None:
            seed = int(np.random.randint(1_000_000))
        else:
            seed = int(rng.integers(1_000_000))
        subset = df.sample(subset_size, replace=False, random_state=seed)
        results.append({
            'sample_id': i,
            'subset_size': subset_size,
            f'mean_{metric}': subset[metric].mean(),
        })
    return pd.DataFrame(results)

In [44]:
# Location of the log to analyse; a relative path, so it is resolved against
# the current working directory.
log_path = '../job_management/logs/random_forest.log'
# Turn every log line that carries a dictionary literal into one DataFrame row.
df = parse_log_file(log_path)
# Bare last expression: display the first rows as a quick sanity check.
df.head()

Unnamed: 0,timestamp,train-rosette,train_size,r2,mse,pc
0,2025-01-27 12:21:17,combined,100%,,,
1,2025-01-27 12:29:14,combined,100%,,,
2,2025-01-27 13:07:35,combined,80.0%,0.901045,0.034044,0.94929
3,2025-01-27 13:31:20,combined,80.0%,0.929557,0.031279,0.964145
4,2025-01-27 13:32:05,combined,80.0%,0.929557,0.031279,0.964145


In [33]:
# random_subset_analysis draws its per-sample seeds from NumPy's global RNG,
# so seed it here to make the table below reproducible across re-runs.
np.random.seed(42)
subset_df = random_subset_analysis(df, metric='r2', num_samples=5)
subset_df

Unnamed: 0,sample_id,subset_size,mean_r2
0,0,3,0.931998
1,1,3,0.921681
2,2,3,0.921681
3,3,3,0.921681
4,4,3,0.931185
