In [2]:
import json
import os
import glob
import re
import sys
from pathlib import Path

First, aggregate all results by temperature setting. This will loop through all temperatures and pull out the `*_results.json` files to then aggregate into a shared dictionary.

In [15]:
results_summary_dir = "/lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary"
model_name = {"am": "checkpoint_0002250", "ot": "checkpoint_0006300"}
temperatures = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]

# Define the two prefixes to process
prefixes = ["am", "ot"]


for temperature in temperatures:
    temp_for_folder = str(temperature).replace('.', '_')
    temp_results_file = f"{results_summary_dir}/results_temp_{temp_for_folder}.json"
    
    # Read the empty results file
    with open(temp_results_file, 'r') as f:
        results = json.load(f)
    
    # Ensure the results structure has separate sections for AM and OT
    if 'results' not in results:
        results['results'] = {}
    if 'am_results' not in results:
        results['am_results'] = {}
    if 'ot_results' not in results:
        results['ot_results'] = {}
    
    # Process both AM and OT results
    for prefix in prefixes:
        save_folder = f"/lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/{prefix}_offline_output_temp_{temp_for_folder}"
        
        print(f"Looking for {prefix.upper()} results in: {save_folder}/{model_name[prefix]}")
        
        # Find all result files
        result_files = glob.glob(f"{save_folder}/{model_name[prefix]}/*_results.json")
        if not result_files:
            # Try alternative patterns
            result_files = glob.glob(f"{save_folder}/{model_name[prefix]}/*.json")
            result_files = [f for f in result_files if 'results' in f or 'eval' in f]
        
        print(f"Found {len(result_files)} result files for {prefix.upper()} at temperature {temperature}")
        
        # Determine which results field to use
        results_field = f'{prefix}_results'
        
        # Now put all benchmark results files into the overall results dictionary for the temperature
        for result_file in result_files:
            try:
                # Extract leaderboard name from filename
                filename = os.path.basename(result_file)
                
                # Try to extract leaderboard name from various filename patterns
                leaderboard_match = re.search(r'(?:math|codegen|logic|table|simulation|stem|ood)__([^_]+)', filename)
                if leaderboard_match:
                    leaderboard = leaderboard_match.group(1)
                else:
                    # Fallback: use filename without extension
                    leaderboard = os.path.splitext(filename)[0]
                
                print(f"Processing {prefix.upper()} {leaderboard} results from {result_file} for temperature {temperature}")
                
                with open(result_file, 'r') as f:
                    eval_results = json.load(f)
                
                # Extract key metrics - adapt based on your actual result structure
                metrics = {}
                if isinstance(eval_results, dict):
                    # Common metric names to look for
                    metric_keys = ['accuracy', 'score', 'pass_rate', 'success_rate', 'avg_score', 'mean_score']
                    for key in eval_results:
                        if any(metric in key.lower() for metric in metric_keys):
                            metrics[key] = eval_results[key]
                        elif key in ['total_samples', 'num_correct', 'num_total']:
                            metrics[key] = eval_results[key]
                    
                    # If we couldn't find standard metrics, store the whole result
                    if not metrics and eval_results:
                        metrics = eval_results
                
                results[results_field][leaderboard] = metrics
                
            except Exception as e:
                print(f"Error processing {result_file}: {e}")
                results[results_field][leaderboard] = {"error": str(e)}
    
    # Write updated results
    with open(temp_results_file, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"Results aggregated for temperature {temperature} (both AM and OT)")

Looking for AM results in: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_3/checkpoint_0002250
Found 6 result files for AM at temperature 0.3
Processing AM aime results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_3/checkpoint_0002250/math__aime_repeated_8x_240_eval_results.json for temperature 0.3
Processing AM hitab results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_3/checkpoint_0002250/table__hitab_1k_eval_results.json for temperature 0.3
Processing AM mbpp results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_3/checkpoint_0002250/codegen__mbpp_500_eval_results.json for temperature 0.3
Processing AM zebra results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/am_offline_output_temp_0_3/checkpoint_0002250/logic__zebra_puzzle_dataset_200_eval

Processing OT hitab results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/ot_offline_output_temp_0_8/checkpoint_0006300/table__hitab_1k_eval_results.json for temperature 0.8
Processing OT math results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/ot_offline_output_temp_0_8/checkpoint_0006300/math__math_500_eval_results.json for temperature 0.8
Processing OT mbpp results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/ot_offline_output_temp_0_8/checkpoint_0006300/codegen__mbpp_500_eval_results.json for temperature 0.8
Processing OT aime results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/ot_offline_output_temp_0_8/checkpoint_0006300/math__aime_repeated_8x_240_eval_results.json for temperature 0.8
Processing OT zebra results from /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/lng131k/ot_offline_output_temp_0_8/checkpoint_0006300/logic__zebra_puzzle_datas

Now, let's pull the fields we want from the aggregated, per temperature, results files to create a full summary file.

In [16]:
summary_csv = f"{results_summary_dir}/temperature_study_summary.csv"
print("Generating final comprehensive summary...")

# Collect all individual temperature results
all_results = []
result_files = glob.glob(f"{results_summary_dir}/results_temp_*.json")

for result_file in result_files:
    try:
        with open(result_file, 'r') as f:
            results = json.load(f)
        all_results.append(results)
    except Exception as e:
        print(f"Error reading {result_file}: {e}")

# Sort by temperature
all_results.sort(key=lambda x: x['temperature'])

# Collect leaderboards from both AM and OT results
all_am_leaderboards = list(set().union(*[list(r.get('am_results', {}).keys()) for r in all_results]))
all_ot_leaderboards = list(set().union(*[list(r.get('ot_results', {}).keys()) for r in all_results]))

# Create comprehensive summary
final_summary = {
    "study_type": "temperature_parameter_study",
    "total_temperatures": len(all_results),
    "temperature_range": [min(r['temperature'] for r in all_results), 
                          max(r['temperature'] for r in all_results)],
    "model_info": {
        "model_path": all_results[0]['model_path'] if all_results else "",
        "model_name": all_results[0]['model_name'] if all_results else ""
    },
    "leaderboards_tested": {
        "am": all_am_leaderboards,
        "ot": all_ot_leaderboards
    },
    "results_by_temperature": {str(r['temperature']): r for r in all_results}
}

# Write final summary
final_summary_file = f"{results_summary_dir}/final_temperature_study_summary.json"
with open(final_summary_file, 'w') as f:
    json.dump(final_summary, f, indent=2)

print(f"Final comprehensive summary written to: {final_summary_file}")
print(f"CSV summary available at: {summary_csv}")

# Generate a simple performance comparison
comparison_file = f"{results_summary_dir}/temperature_performance_comparison.txt"
with open(comparison_file, 'w') as f:
    f.write("TEMPERATURE PARAMETER STUDY PERFORMANCE COMPARISON\n")
    f.write("="*60 + "\n\n")
    
    # AM Results Section
    f.write("AM RESULTS\n")
    f.write("="*60 + "\n\n")
    for leaderboard in all_am_leaderboards:
        f.write(f"Leaderboard: {leaderboard}\n")
        f.write("-" * 30 + "\n")
        for temp_str, temp_data in final_summary['results_by_temperature'].items():
            if 'am_results' in temp_data and leaderboard in temp_data['am_results']:
                metrics = temp_data['am_results'][leaderboard]
                f.write(f"Temperature {temp_str}: {metrics}\n")
        f.write("\n")
    
    # OT Results Section
    f.write("\n" + "="*60 + "\n")
    f.write("OT RESULTS\n")
    f.write("="*60 + "\n\n")
    for leaderboard in all_ot_leaderboards:
        f.write(f"Leaderboard: {leaderboard}\n")
        f.write("-" * 30 + "\n")
        for temp_str, temp_data in final_summary['results_by_temperature'].items():
            if 'ot_results' in temp_data and leaderboard in temp_data['ot_results']:
                metrics = temp_data['ot_results'][leaderboard]
                f.write(f"Temperature {temp_str}: {metrics}\n")
        f.write("\n")

print(f"Performance comparison written to: {comparison_file}")
print("Temperature parameter study aggregation complete!")

Generating final comprehensive summary...
Final comprehensive summary written to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/final_temperature_study_summary.json
CSV summary available at: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_study_summary.csv
Performance comparison written to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_performance_comparison.txt
Temperature parameter study aggregation complete!


In [19]:
import pandas as pd
import matplotlib
import math
import matplotlib.pyplot as plt
matplotlib.use('Agg')  # headless backend for PNG saving

# --- New: per-leaderboard plots (temperature vs summary_metrics) ---
# Build separate DataFrames for AM and OT results
temps = sorted([r['temperature'] for r in all_results])
am_leaderboards = sorted(list(final_summary['leaderboards_tested']['am']))
ot_leaderboards = sorted(list(final_summary['leaderboards_tested']['ot']))
all_leaderboards = sorted(list(set(am_leaderboards + ot_leaderboards)))

# Create DataFrames for AM and OT
df_am = pd.DataFrame(index=temps, columns=all_leaderboards, dtype=float)
df_ot = pd.DataFrame(index=temps, columns=all_leaderboards, dtype=float)

# Helper function to extract metric value
def extract_metric(res):
    val = None
    if isinstance(res, dict) and 'summary_metrics' in res:
        sm = res['summary_metrics']
        if isinstance(sm, dict):
            for k in sm:
                if 'test_score' in k.lower() or 'accuracy' in k.lower() or 'pass_rate' in k.lower():
                    val = float(sm[k])
                    break
        elif isinstance(sm, (int, float)):
            val = float(sm)
    elif isinstance(res, (int, float)):
        val = float(res)
    return val if val is not None else float('nan')

# Fill AM DataFrame
for t in temps:
    key = str(t)
    data = final_summary['results_by_temperature'].get(key, {})
    for lb in all_leaderboards:
        try:
            if 'am_results' in data and lb in data['am_results']:
                res = data['am_results'][lb]
                df_am.at[t, lb] = extract_metric(res)
            else:
                df_am.at[t, lb] = float('nan')
        except Exception as e:
            print(f"Error extracting AM metric for {lb} at temp {t}: {e}")
            df_am.at[t, lb] = float('nan')

# Fill OT DataFrame
for t in temps:
    key = str(t)
    data = final_summary['results_by_temperature'].get(key, {})
    for lb in all_leaderboards:
        try:
            if 'ot_results' in data and lb in data['ot_results']:
                res = data['ot_results'][lb]
                df_ot.at[t, lb] = extract_metric(res)
            else:
                df_ot.at[t, lb] = float('nan')
        except Exception as e:
            print(f"Error extracting OT metric for {lb} at temp {t}: {e}")
            df_ot.at[t, lb] = float('nan')

# Save the combined CSVs for convenience
am_csv = f"{results_summary_dir}/temperature_leaderboard_summary_AM.csv"
ot_csv = f"{results_summary_dir}/temperature_leaderboard_summary_OT.csv"
df_am.to_csv(am_csv)
df_ot.to_csv(ot_csv)
print(f"Saved AM leaderboard CSV to: {am_csv}")
print(f"Saved OT leaderboard CSV to: {ot_csv}")

# Create per-leaderboard plots and CSVs
plots_dir = Path(results_summary_dir) / 'leaderboard_plots'
plots_dir.mkdir(parents=True, exist_ok=True)

for lb in all_leaderboards:
    am_series = df_am[lb].astype(float)
    ot_series = df_ot[lb].astype(float)
    
    # Check if we have any data for this leaderboard
    has_am_data = not am_series.isna().all()
    has_ot_data = not ot_series.isna().all()
    
    if not has_am_data and not has_ot_data:
        print(f"Skipping {lb} - no data available")
        continue
    
    safe_lb = re.sub(r'[^A-Za-z0-9._-]', '_', lb)  # Prepare filename-safe leaderboard name
    csv_path_am = plots_dir / f"{safe_lb}_AM_by_temperature.csv"
    csv_path_ot = plots_dir / f"{safe_lb}_OT_by_temperature.csv"
    png_path = plots_dir / f"{safe_lb}_by_temperature.png"
    
    # Create the plot
    plt.figure(figsize=(10, 6))
    
    # Plot AM results in blue
    if has_am_data:
        plt.plot(am_series.index, am_series.values, marker='o', linestyle='-', 
                 color='blue', linewidth=2, markersize=6, label='AM-Thinking')
    
    # Plot OT results in orange
    if has_ot_data:
        plt.plot(ot_series.index, ot_series.values, marker='s', linestyle='-', 
                 color='orange', linewidth=2, markersize=6, label='OpenThoughts')
    
    plt.title(f'Leaderboard: {lb} — Summary Metrics by Temperature', fontsize=14, fontweight='bold')
    
    # Adjust y-axis limits based on leaderboard
    if 'hitab' in lb.lower():
        plt.ylim(0.4, 0.9)
    
    plt.xlim(0.2, 2.1)
    plt.ylabel('Summary Metrics (Accuracy)', fontsize=12)
    plt.xlabel('Temperature', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.4)
    plt.legend(loc='best', fontsize=11, framealpha=0.9)
    plt.tight_layout()
    plt.savefig(png_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save individual CSVs
    if has_am_data:
        am_series.to_csv(csv_path_am, header=['summary_metrics'], index_label='temperature')
    if has_ot_data:
        ot_series.to_csv(csv_path_ot, header=['summary_metrics'], index_label='temperature')
    
    print(f"Saved plot for {lb} to: {png_path}")

print(f"All leaderboard plots saved to: {plots_dir}")

Saved AM leaderboard CSV to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_leaderboard_summary_AM.csv
Saved OT leaderboard CSV to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/temperature_leaderboard_summary_OT.csv


Saved plot for aime to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/aime_by_temperature.png
Saved plot for gpqa to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/gpqa_by_temperature.png
Saved plot for hitab to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/hitab_by_temperature.png
Saved plot for math to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/math_by_temperature.png
Saved plot for mbpp to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/mbpp_by_temperature.png
Saved plot for zebra to: /lustrefs/users/taylor.killian/Reasoning360/evaluation_results/temperature_study_summary/leaderboard_plots/zebra_by_temperature.png
All leaderboard plots saved to: /lustrefs/users/taylor.killian/Rea

In [6]:
ot_series

0.3   NaN
0.4   NaN
0.5   NaN
0.6   NaN
0.7   NaN
0.8   NaN
0.9   NaN
1.0   NaN
1.1   NaN
1.2   NaN
1.3   NaN
1.4   NaN
1.5   NaN
1.6   NaN
1.7   NaN
1.8   NaN
1.9   NaN
2.0   NaN
Name: zebra, dtype: float64