In [1]:
import json
import os
import glob
import re
import sys
from pathlib import Path

First, aggregate the results for each temperature setting. The cell below loops over all temperatures, collects the `*_results.json` files found for each one, and merges their metrics into that temperature's shared results dictionary.

In [11]:
results_summary_dir = "/lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary"
model_name = f"checkpoint_0002250"

temperatures = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]

for temperature in temperatures:
    temp_for_folder = str(temperature).replace('.', '_')
    temp_results_file = f"{results_summary_dir}/results_temp_{temp_for_folder}.json"
    save_folder = f"/lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_{temp_for_folder}"

    # Read the empty results file
    with open(temp_results_file, 'r') as f:
        results = json.load(f)

    print(f"Looking for results in: {save_folder}/{model_name}")

    # Find all result files
    result_files = glob.glob(f"{save_folder}/{model_name}/*_results.json")
    if not result_files:
        # Try alternative patterns
        result_files = glob.glob(f"{save_folder}/{model_name}/*.json")
        result_files = [f for f in result_files if 'results' in f or 'eval' in f]

    print(f"Found {len(result_files)} result files for temperature {temperature}")

    # Now put all benchmark results files into the overall results dictionary for the temperature
    for result_file in result_files:
        try:
            # Extract leaderboard name from filename
            filename = os.path.basename(result_file)
            # Try to extract leaderboard name from various filename patterns
            leaderboard_match = re.search(r'(?:math|codegen|logic|table|simulation|stem|ood)__([^_]+)', filename)
            if leaderboard_match:
                leaderboard = leaderboard_match.group(1)
            else:
                # Fallback: use filename without extension
                leaderboard = os.path.splitext(filename)[0]
            
            print(f"Processing {leaderboard} results from {result_file} for temperature {temperature}")
            
            with open(result_file, 'r') as f:
                eval_results = json.load(f)
            
            # Extract key metrics - adapt based on your actual result structure
            metrics = {}
            if isinstance(eval_results, dict):
                # Common metric names to look for
                metric_keys = ['accuracy', 'score', 'pass_rate', 'success_rate', 'avg_score', 'mean_score']
                for key in eval_results:
                    if any(metric in key.lower() for metric in metric_keys):
                        metrics[key] = eval_results[key]
                    elif key in ['total_samples', 'num_correct', 'num_total']:
                        metrics[key] = eval_results[key]
            
            # If we couldn't find standard metrics, store the whole result
            if not metrics and eval_results:
                metrics = eval_results
            
            results['results'][leaderboard] = metrics
            
        except Exception as e:
            print(f"Error processing {result_file}: {e}")
            results['results'][leaderboard] = {"error": str(e)}

    # Write updated results
    with open(temp_results_file, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Results aggregated for temperature {temperature}")

Looking for results in: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_4/checkpoint_0002250
Found 6 result files for temperature 0.4
Processing aime results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_4/checkpoint_0002250/math__aime_repeated_8x_240_eval_results.json for temperature 0.4
Processing hitab results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_4/checkpoint_0002250/table__hitab_1k_eval_results.json for temperature 0.4
Processing mbpp results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_4/checkpoint_0002250/codegen__mbpp_500_eval_results.json for temperature 0.4
Processing zebra results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_4/checkpoint_0002250/logic__zebra_puzzle_dataset_200_eval_results.json for tem

Next, pull the fields we want from the aggregated per-temperature results files to create a single, comprehensive summary file.

In [13]:
# NOTE(review): nothing in this cell writes `summary_csv`; it is only reported
# below. Presumably another step of the study produces it -- confirm.
summary_csv = f"{results_summary_dir}/temperature_study_summary.csv"

print("Generating final comprehensive summary...")

# Collect all individual temperature results (sorted for a stable read order)
all_results = []
result_files = sorted(glob.glob(f"{results_summary_dir}/results_temp_*.json"))

for result_file in result_files:
    try:
        with open(result_file, 'r') as f:
            results = json.load(f)
        all_results.append(results)
    except Exception as e:
        # Best effort: an unreadable file is reported and left out of the summary.
        print(f"Error reading {result_file}: {e}")

# Sort by temperature
all_results.sort(key=lambda x: x['temperature'])

if not all_results:
    print(f"Warning: no per-temperature results could be loaded from {results_summary_dir}")

# min()/max() raise on an empty sequence, so the range is left empty when no
# results were loaded (matching the empty-string fallbacks used for model_info).
temperature_values = [r['temperature'] for r in all_results]
temperature_range = [min(temperature_values), max(temperature_values)] if temperature_values else []

# Union of every leaderboard seen at any temperature, sorted so the summary
# and the comparison file below come out in the same order on every run.
leaderboards_tested = sorted({name for r in all_results for name in r['results'].keys()})

# Create comprehensive summary
final_summary = {
    "study_type": "temperature_parameter_study",
    "total_temperatures": len(all_results),
    "temperature_range": temperature_range,
    "model_info": {
        "model_path": all_results[0]['model_path'] if all_results else "",
        "model_name": all_results[0]['model_name'] if all_results else ""
    },
    "leaderboards_tested": leaderboards_tested,
    # Keyed by str(temperature); insertion order follows the temperature sort above.
    "results_by_temperature": {str(r['temperature']): r for r in all_results}
}

# Write final summary
final_summary_file = f"{results_summary_dir}/final_temperature_study_summary.json"
with open(final_summary_file, 'w') as f:
    json.dump(final_summary, f, indent=2)

print(f"Final comprehensive summary written to: {final_summary_file}")
print(f"CSV summary available at: {summary_csv}")

# Generate a simple performance comparison: one section per leaderboard,
# one line per temperature that has an entry for it.
comparison_file = f"{results_summary_dir}/temperature_performance_comparison.txt"
with open(comparison_file, 'w') as f:
    f.write("TEMPERATURE PARAMETER STUDY PERFORMANCE COMPARISON\n")
    f.write("="*60 + "\n\n")

    for leaderboard in final_summary['leaderboards_tested']:
        f.write(f"Leaderboard: {leaderboard}\n")
        f.write("-" * 30 + "\n")

        for temp_str, temp_data in final_summary['results_by_temperature'].items():
            if leaderboard in temp_data['results']:
                metrics = temp_data['results'][leaderboard]
                f.write(f"Temperature {temp_str}: {metrics}\n")
        f.write("\n")

print(f"Performance comparison written to: {comparison_file}")
print("Temperature parameter study aggregation complete!")

Generating final comprehensive summary...
Final comprehensive summary written to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/final_temperature_study_summary.json
CSV summary available at: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_study_summary.csv
Performance comparison written to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_performance_comparison.txt
Temperature parameter study aggregation complete!


In [None]:
# NOTE(review): this cell has no execution count (In [None]) and cannot run as written.
# Its statements look like the per-leaderboard plotting loop of the following cell
# pasted in reverse line order: the `for lb in leaderboards:` header is the last
# line and has no body, and every name is used before the line that assigns it.
# It also relies on `plots_dir`, `df`, `leaderboards` and `plt`, which are only
# defined in the cell below. Looks like a leftover -- candidate for deletion; confirm.
print('All leaderboard plots and CSVs saved in:', plots_dir)    
print(f"Saved plot for {lb} to: {png_path}")    
plt.close()    
plt.savefig(png_path)    
png_path = plots_dir / f"{safe_lb}_by_temperature.png"    
plt.tight_layout()    
plt.grid(True, linestyle='--', alpha=0.4)    
plt.title(f'Leaderboard: {lb} — summary_metrics by Temperature')    
plt.ylim(-0.05, 1.05)  # keep within [0,1] with slight padding    
plt.ylabel('summary_metrics (accuracy)')    
plt.xlabel('Temperature')   
plt.plot(series.index, series.values, marker='o', linestyle='-', label=lb)    # plot line with circle markers; missing points will show gaps (NaN)    
plt.figure(figsize=(8, 5))    
series.to_csv(csv_path, header=['summary_metrics'], index_label='temperature')    
csv_path = plots_dir / f"{safe_lb}_by_temperature.csv"    
safe_lb = re.sub(r'[^A-Za-z0-9._-]', '_', lb)    # Prepare filename-safe leaderboard name    
series = df[lb].astype(float) 
for lb in leaderboards:


In [21]:
import math

import matplotlib
matplotlib.use('Agg')  # headless backend for PNG saving; selected before pyplot is imported
import matplotlib.pyplot as plt
import pandas as pd

# --- Per-leaderboard plots (temperature vs summary_metrics) ---

# Substrings that identify the accuracy-like entry inside a 'summary_metrics' dict.
_METRIC_NAME_HINTS = ('test_score', 'accuracy', 'pass_rate')

# Optional fixed axis limits, applied to every leaderboard whose lower-cased
# name contains the key. Leaderboards without an entry keep matplotlib's autoscaling.
AXIS_LIMITS = {
    'hitab': {'ylim': (0.4, 0.9), 'xlim': (0.2, 2.1)},
}


def _extract_summary_metric(res):
    """Return the accuracy-like value of one leaderboard entry as a float.

    ``res`` is the stored entry for one leaderboard at one temperature:
      * a dict with a 'summary_metrics' field that is either a number or a
        dict; for a dict, the first key containing one of
        ``_METRIC_NAME_HINTS`` (case-insensitive) is used;
      * or a bare number.
    Anything else yields NaN. A matched value that cannot be converted to
    float raises (TypeError / ValueError), which the caller reports.
    """
    if isinstance(res, dict):
        if 'summary_metrics' not in res:
            return float('nan')
        summary = res['summary_metrics']
        if isinstance(summary, dict):
            for name, value in summary.items():
                if any(hint in name.lower() for hint in _METRIC_NAME_HINTS):
                    return float(value)
            return float('nan')
        if isinstance(summary, (int, float)):
            return float(summary)
        return float('nan')
    if isinstance(res, (int, float)):
        return float(res)
    return float('nan')


# Build a DataFrame where rows are temperatures and columns are leaderboards, values are summary_metrics
temps = sorted([r['temperature'] for r in all_results])
leaderboards = sorted(list(final_summary['leaderboards_tested']))
df = pd.DataFrame(index=temps, columns=leaderboards, dtype=float)

for t in temps:
    # results_by_temperature is keyed by str(temperature)
    data = final_summary['results_by_temperature'].get(str(t), {})
    per_leaderboard = data['results'] if 'results' in data else {}

    for lb in leaderboards:
        try:
            if lb in per_leaderboard:
                df.at[t, lb] = _extract_summary_metric(per_leaderboard[lb])
            else:
                df.at[t, lb] = float('nan')  # assign NaN when missing
        except Exception as e:
            print(f"Error extracting metric for {lb} at temp {t}: {e}")
            df.at[t, lb] = float('nan')

# Save the combined CSV for convenience
combined_csv = f"{results_summary_dir}/temperature_leaderboard_summary.csv"
df.to_csv(combined_csv)
print(f"Saved combined leaderboard CSV to: {combined_csv}")


# Create per-leaderboard plots and CSVs
plots_dir = Path(results_summary_dir) / 'leaderboard_plots'
plots_dir.mkdir(parents=True, exist_ok=True)

for lb in leaderboards:
    series = df[lb].astype(float)
    safe_lb = re.sub(r'[^A-Za-z0-9._-]', '_', lb)  # filename-safe leaderboard name
    csv_path = plots_dir / f"{safe_lb}_by_temperature.csv"
    png_path = plots_dir / f"{safe_lb}_by_temperature.png"

    fig, ax = plt.subplots(figsize=(8, 5))
    # Line with circle markers; missing points (NaN) show up as gaps.
    ax.plot(series.index, series.values, marker='o', linestyle='-', label=lb)
    ax.set_title(f'Leaderboard: {lb} — summary_metrics by Temperature')
    for name_fragment, limits in AXIS_LIMITS.items():
        if name_fragment in lb.lower():
            ax.set_ylim(*limits['ylim'])
            ax.set_xlim(*limits['xlim'])
    ax.set_ylabel('summary_metrics (accuracy)')
    ax.set_xlabel('Temperature')
    ax.grid(True, linestyle='--', alpha=0.4)
    fig.tight_layout()
    fig.savefig(png_path)
    plt.close(fig)  # release the figure; one is created per leaderboard

    series.to_csv(csv_path, header=['summary_metrics'], index_label='temperature')
    print(f"Saved plot for {lb} to: {png_path}")

Saved combined leaderboard CSV to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_leaderboard_summary.csv
Saved plot for aime to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/aime_by_temperature.png
Saved plot for gpqa to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/gpqa_by_temperature.png
Saved plot for aime to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/aime_by_temperature.png
Saved plot for gpqa to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/gpqa_by_temperature.png
Saved plot for hitab to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/hitab_by_temperature.png
Saved plot for math to: /lustrefs/users/taylor.killian/Reasonin