## Dependnecies

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import numpy as np

## Data Loading

In [14]:
def load_experiment_data(base_dir):
    base_path = Path(base_dir)
    data = {}
    
    # Iterate through prompt templates
    for prompt_dir in base_path.iterdir():
        if not prompt_dir.is_dir():
            continue
            
        prompt_name = prompt_dir.name
        data[prompt_name] = {}
        
        # Iterate through datasets
        for dataset_dir in prompt_dir.iterdir():
            if not dataset_dir.is_dir():
                continue
                
            dataset_name = dataset_dir.name
            data[prompt_name][dataset_name] = {}
            
            # Iterate through runs
            for run_dir in dataset_dir.iterdir():
                if not run_dir.is_dir():
                    continue
                    
                run_id = run_dir.name
                data[prompt_name][dataset_name][run_id] = {}
                
                # Load detailed_results.json
                detailed_results_path = run_dir / "detailed_results.json"
                if detailed_results_path.exists():
                    with open(detailed_results_path, 'r') as f:
                        data[prompt_name][dataset_name][run_id]['detailed_results'] = json.load(f)
                
                # Load energy.csv
                energy_path = run_dir / "energy.csv"
                if energy_path.exists():
                    data[prompt_name][dataset_name][run_id]['energy_df'] = pd.read_csv(energy_path)
    
    return data

In [15]:
# Example usage
experiment_data = load_experiment_data("..\\outputs\\run_20260225_115145\\bigcode_starcoder2-3b")

In [16]:
def flatten_experiment_data(experiment_data):
    """
    Flatten nested experiment data into a single DataFrame.
    
    Args:
        experiment_data: Nested dictionary from load_experiment_data()
    
    Returns:
        DataFrame where:
        - Each row represents one run (prompt_template/dataset/run_id combination)
        - Columns include metadata (prompt, dataset, run_id) 
        - Energy columns contain arrays (one per CSV column)
        - Detailed results columns contain flattened lists extracted from all tasks
    """
    rows = []
    
    for prompt_name, prompt_data in experiment_data.items():
        for dataset_name, dataset_data in prompt_data.items():
            for run_id, run_data in dataset_data.items():
                row = {
                    'prompt_template': prompt_name,
                    'dataset': dataset_name,
                    'run_id': run_id
                }
                
                # Process energy data - store each column as an array
                if 'energy_df' in run_data:
                    energy_df = run_data['energy_df']
                    for col in energy_df.columns:
                        row[f'energy_{col}'] = energy_df[col].values
                
                # Process detailed results - flatten list of dicts
                if 'detailed_results' in run_data:
                    detailed_results = run_data['detailed_results']
                    
                    if len(detailed_results) > 0:
                        # Get all keys from the first task (assuming uniform structure)
                        sample_task = detailed_results[0]
                        
                        # For each key in the task dict, collect values across all tasks
                        for key in sample_task.keys():
                            # Handle nested metrics dict separately
                            if key == 'metrics' and isinstance(sample_task[key], dict):
                                # Flatten metrics sub-keys
                                for metric_key in sample_task[key].keys():
                                    row[f'results_metrics_{metric_key}'] = [
                                        task['metrics'].get(metric_key) 
                                        for task in detailed_results
                                    ]
                            else:
                                # Regular key - collect all values as list
                                row[f'results_{key}'] = [
                                    task.get(key) 
                                    for task in detailed_results
                                ]
                
                # Add summary if exists
                if 'summary' in run_data:
                    for key, value in run_data['summary'].items():
                        row[f'summary_{key}'] = value
                
                rows.append(row)
    
    return pd.DataFrame(rows)

In [17]:
# Create flattened DataFrame
df = flatten_experiment_data(experiment_data)

# Display basic info
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
df

DataFrame shape: (12, 96)

Columns: ['prompt_template', 'dataset', 'run_id', 'energy_Delta', 'energy_Time', 'energy_CORE0_ENERGY (J)', 'energy_CORE0_FREQ (MHZ)', 'energy_CORE0_PSTATE', 'energy_CORE0_VOLT (V)', 'energy_CPU_ENERGY (J)', 'energy_CPU_FREQUENCY_0', 'energy_CPU_FREQUENCY_1', 'energy_CPU_FREQUENCY_10', 'energy_CPU_FREQUENCY_11', 'energy_CPU_FREQUENCY_12', 'energy_CPU_FREQUENCY_13', 'energy_CPU_FREQUENCY_14', 'energy_CPU_FREQUENCY_15', 'energy_CPU_FREQUENCY_16', 'energy_CPU_FREQUENCY_17', 'energy_CPU_FREQUENCY_18', 'energy_CPU_FREQUENCY_19', 'energy_CPU_FREQUENCY_2', 'energy_CPU_FREQUENCY_20', 'energy_CPU_FREQUENCY_21', 'energy_CPU_FREQUENCY_22', 'energy_CPU_FREQUENCY_23', 'energy_CPU_FREQUENCY_24', 'energy_CPU_FREQUENCY_25', 'energy_CPU_FREQUENCY_26', 'energy_CPU_FREQUENCY_27', 'energy_CPU_FREQUENCY_28', 'energy_CPU_FREQUENCY_29', 'energy_CPU_FREQUENCY_3', 'energy_CPU_FREQUENCY_30', 'energy_CPU_FREQUENCY_31', 'energy_CPU_FREQUENCY_4', 'energy_CPU_FREQUENCY_5', 'energy_CPU_FRE

Unnamed: 0,prompt_template,dataset,run_id,energy_Delta,energy_Time,energy_CORE0_ENERGY (J),energy_CORE0_FREQ (MHZ),energy_CORE0_PSTATE,energy_CORE0_VOLT (V),energy_CPU_ENERGY (J),...,results_extracted_code,results_reference,results_metrics_edit_distance,results_metrics_edit_distance_normalized,results_metrics_levenshtein_ratio,results_metrics_rouge_l_precision,results_metrics_rouge_l_recall,results_metrics_rouge_l_fmeasure,results_metrics_codebleu,results_metrics_codebleu_ngram_match
0,answer_only_no_expl,humaneval_custom,r1,"[0, 200, 199, 200, 200, 199, 200, 200, 200, 20...","[1772027861046, 1772027861046, 1772027861247, ...","[106485.80258178712, 77665.18424987793, 106487...","[5450.0, 5450.0, 5125.0, 2311.111111111111, 52...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3125, 0.1812499999999999, 0.368749999999999...","[752708.6723480225, 752709.2333526611, 752717....",...,"[, , , def hello_world():\n """"""\n >>> he...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 168, 98, 192, 1147, 185, 124, 2...","[1.0, 1.0, 1.0, 0.7887323943661971, 0.84482758...","[0.0, 0.0, 0.0, 0.31976744186046513, 0.3041474...","[0, 0, 0, 0.038461538461538464, 0.117647058823...","[0, 0, 0, 0.06666666666666667, 0.125, 0, 0.116...","[0, 0, 0, 0.04878048780487805, 0.1212121212121...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.047619047619047616, 0.0, 0.0..."
1,answer_only_no_expl,humaneval_custom,r2,"[0, 200, 200, 199, 199, 199, 199, 200, 199, 19...","[1772047370021, 1772047370021, 1772047370221, ...","[238117.0500640869, 238117.12771606445, 155487...","[5450.0, 5450.0, 5375.0, 5150.0, 5225.0, 5250....","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.1937499999999998, 0.1937499999999998, 0.274...","[1421564.264312744, 1421565.0434570312, 142157...",...,"[, , , def hello_world():\n """"""\n >>> he...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 168, 98, 192, 1147, 185, 124, 2...","[1.0, 1.0, 1.0, 0.7887323943661971, 0.84482758...","[0.0, 0.0, 0.0, 0.31976744186046513, 0.3041474...","[0, 0, 0, 0.038461538461538464, 0.117647058823...","[0, 0, 0, 0.06666666666666667, 0.125, 0, 0.116...","[0, 0, 0, 0.04878048780487805, 0.1212121212121...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.047619047619047616, 0.0, 0.0..."
2,answer_only_no_expl,humaneval_custom,r3,"[0, 200, 200, 200, 199, 199, 199, 199, 199, 20...","[1772033426056, 1772033426056, 1772033426257, ...","[145054.76817321777, 98108.92973327637, 98109....","[5450.0, 5400.0, 5100.0, 5250.0, 5225.0, 5225....","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.1875, 0.2375, 0.3562499999999999, 0.33125, ...","[943715.4232940674, 943716.6347045898, 943724....",...,"[, , , def hello_world():\n """"""\n >>> he...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 168, 98, 192, 1147, 185, 124, 2...","[1.0, 1.0, 1.0, 0.7887323943661971, 0.84482758...","[0.0, 0.0, 0.0, 0.31976744186046513, 0.3041474...","[0, 0, 0, 0.038461538461538464, 0.117647058823...","[0, 0, 0, 0.06666666666666667, 0.125, 0, 0.116...","[0, 0, 0, 0.04878048780487805, 0.1212121212121...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.047619047619047616, 0.0, 0.0..."
3,baseline_single_shot,humaneval_custom,r1,"[0, 199, 199, 200, 200, 200, 199, 200, 200, 20...","[1772041774021, 1772041774021, 1772041774221, ...","[203535.9206085205, 129099.17764282228, 203537...","[5450.0, 5450.0, 5375.0, 5100.0, 5300.0, 5225....","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0.1812499999999999, 0.1875, 0.243749999999999...","[1230367.723373413, 1230368.5049743652, 123037...",...,"[def has_close_elements(numbers: List[float], ...","[ for idx, elem in enumerate(numbers):\n ...","[345, 396, 299, 768, 350, 273, 649, 308, 265, ...","[0.6872509960159362, 0.7557251908396947, 0.934...","[0.46153846153846156, 0.37751855779427357, 0.1...","[0.1566265060240964, 0.057971014492753624, 0.0...","[0.5, 0.09302325581395349, 0.5, 0.4, 0.25, 0.3...","[0.23853211009174313, 0.07142857142857142, 0.0...","[0.08594494135673132, 0.0, 0.0, 0.0, 0.0, 0.07...","[0.20634920634920634, 0.02631578947368421, 0.0..."
4,baseline_single_shot,humaneval_custom,r2,"[0, 199, 199, 199, 199, 199, 199, 199, 200, 20...","[1772044568861, 1772044568861, 1772044569061, ...","[220602.5231781006, 142705.16804504397, 142706...","[5450.0, 5450.0, 5275.0, 5175.0, 5400.0, 5100....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0.1875, 0.1999999999999999, 0.287499999999999...","[1325904.9857177734, 1325905.8044586182, 13259...",...,"[def has_close_elements(numbers: List[float], ...","[ for idx, elem in enumerate(numbers):\n ...","[345, 396, 299, 768, 350, 273, 649, 308, 265, ...","[0.6872509960159362, 0.7557251908396947, 0.934...","[0.46153846153846156, 0.37751855779427357, 0.1...","[0.1566265060240964, 0.057971014492753624, 0.0...","[0.5, 0.09302325581395349, 0.5, 0.4, 0.25, 0.3...","[0.23853211009174313, 0.07142857142857142, 0.0...","[0.08594494135673132, 0.0, 0.0, 0.0, 0.0, 0.07...","[0.20634920634920634, 0.02631578947368421, 0.0..."
5,baseline_single_shot,humaneval_custom,r3,"[0, 200, 199, 199, 199, 199, 200, 199, 200, 19...","[1772022307384, 1772022307384, 1772022307584, ...","[67694.29295349121, 57039.04733276367, 30479.8...","[5450.0, 5325.0, 5125.0, 2300.0, 5225.0, 5250....","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.20625, 0.3062499999999999, 0.38125000000000...","[563109.3822784424, 563110.0404815674, 563118....",...,"[def has_close_elements(numbers: List[float], ...","[ for idx, elem in enumerate(numbers):\n ...","[345, 396, 299, 768, 350, 273, 649, 308, 265, ...","[0.6872509960159362, 0.7557251908396947, 0.934...","[0.46153846153846156, 0.37751855779427357, 0.1...","[0.1566265060240964, 0.057971014492753624, 0.0...","[0.5, 0.09302325581395349, 0.5, 0.4, 0.25, 0.3...","[0.23853211009174313, 0.07142857142857142, 0.0...","[0.08594494135673132, 0.0, 0.0, 0.0, 0.0, 0.07...","[0.20634920634920634, 0.02631578947368421, 0.0..."
6,polite_single_shot,humaneval_custom,r1,"[0, 200, 199, 199, 200, 199, 199, 200, 199, 19...","[1772036208783, 1772036208783, 1772036208984, ...","[164472.97360229492, 164473.11978149414, 16447...","[5450.0, 5375.0, 5375.0, 5250.0, 5300.0, 5250....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.1999999999999999, 0.25, 0.2562499999999998,...","[1039424.2096405028, 1039425.7497558594, 10394...",...,"[, , , def below_zero(operations: List[int]) -...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 1118, 101, 192, 853, 50, 140, 2...","[1.0, 1.0, 1.0, 0.8972712680577849, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.18736383442265792, 0.0, 0.0,...","[0, 0, 0, 0.06310679611650485, 0, 0, 0.0873786...","[0, 0, 0, 0.8666666666666667, 0, 0, 0.20930232...","[0, 0, 0, 0.11764705882352941, 0, 0, 0.1232876...","[0.0, 0.0, 0.0, 0.051211670999810194, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.07407407407407407, 0.0, 0.0,..."
7,polite_single_shot,humaneval_custom,r2,"[0, 200, 199, 199, 199, 200, 200, 200, 200, 19...","[1772038990750, 1772038990750, 1772038990950, ...","[184011.3512878418, 184011.527633667, 184012.3...","[5450.0, 5425.0, 5175.0, 5225.0, 5150.0, 5225....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.15625, 0.2249999999999998, 0.33749999999999...","[1135230.1230773926, 1135231.378158569, 113523...",...,"[, , , def below_zero(operations: List[int]) -...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 1118, 101, 192, 853, 50, 140, 2...","[1.0, 1.0, 1.0, 0.8972712680577849, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.18736383442265792, 0.0, 0.0,...","[0, 0, 0, 0.06310679611650485, 0, 0, 0.0873786...","[0, 0, 0, 0.8666666666666667, 0, 0, 0.20930232...","[0, 0, 0, 0.11764705882352941, 0, 0, 0.1232876...","[0.0, 0.0, 0.0, 0.051211670999810194, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.07407407407407407, 0.0, 0.0,..."
8,polite_single_shot,humaneval_custom,r3,"[0, 200, 200, 200, 200, 199, 200, 199, 199, 19...","[1772019512917, 1772019512917, 1772019513117, ...","[51634.569915771484, 48148.05140686035, 51635....","[5400.0, 5275.0, 5325.0, 5000.0, 5375.0, 5375....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.25, 0.2874999999999998, 0.28125, 0.41250000...","[471062.4437713623, 471063.86393737793, 471071...",...,"[, , , def below_zero(operations: List[int]) -...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 1118, 101, 192, 853, 50, 140, 2...","[1.0, 1.0, 1.0, 0.8972712680577849, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.18736383442265792, 0.0, 0.0,...","[0, 0, 0, 0.06310679611650485, 0, 0, 0.0873786...","[0, 0, 0, 0.8666666666666667, 0, 0, 0.20930232...","[0, 0, 0, 0.11764705882352941, 0, 0, 0.1232876...","[0.0, 0.0, 0.0, 0.051211670999810194, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.07407407407407407, 0.0, 0.0,..."
9,think_step_by_step,humaneval_custom,r1,"[0, 200, 200, 199, 199, 200, 200, 200, 200, 19...","[1772030641944, 1772030641944, 1772030642144, ...","[125805.66015625, 87635.07162475586, 125806.69...","[5450.0, 5350.0, 5100.0, 5200.0, 5225.0, 5175....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.1875, 0.2249999999999998, 0.39375, 0.349999...","[848597.1455535889, 848597.7242889404, 848605....",...,"[, , , def below_zero(operations: List[int]) -...","[ for idx, elem in enumerate(numbers):\n ...","[252, 419, 24, 439, 384, 192, 300, 1511, 162, ...","[1.0, 1.0, 1.0, 0.7783687943262412, 0.83842794...","[0.0, 0.0, 0.0, 0.36834532374100715, 0.2683363...","[0, 0, 0, 0.1566265060240964, 0.08955223880597...","[0, 0, 0, 0.8666666666666667, 0.375, 0, 0.0, 0...","[0, 0, 0, 0.2653061224489796, 0.14457831325301...","[0.0, 0.0, 0.0, 0.13479494507318684, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.1927710843373494, 0.1, 0.0, ..."


In [18]:
def find_sustained_gpu_usage_start(gpu_usage_array, threshold=75, min_consecutive=20):
    """
    Find the first index where GPU usage is sustained above threshold for min_consecutive measurements.
    
    Args:
        gpu_usage_array: Array of GPU usage values
        threshold: GPU usage threshold percentage (default: 75)
        min_consecutive: Minimum number of consecutive measurements above threshold (default: 20)
    
    Returns:
        Index where sustained usage begins, or None if not found
    """
    if len(gpu_usage_array) < min_consecutive:
        return None
    
    consecutive_count = 0
    start_idx = None
    
    for i, usage in enumerate(gpu_usage_array):
        if usage > threshold:
            if consecutive_count == 0:
                start_idx = i
            consecutive_count += 1
            
            if consecutive_count >= min_consecutive:
                return start_idx
        else:
            consecutive_count = 0
            start_idx = None
    
    return None


def truncate_energy_measurements(df, threshold=75, min_consecutive=20, truncate_end=True):
    """
    Truncate energy measurements for all runs at start and optionally at end based on GPU usage.
    
    Args:
        df: DataFrame from flatten_experiment_data()
        threshold: GPU usage threshold percentage (default: 75)
        min_consecutive: Minimum number of consecutive measurements above/below threshold (default: 20)
        truncate_end: Whether to also truncate at the end when GPU usage drops (default: True)
    
    Returns:
        Tuple of (truncated_df, truncation_log)
        - truncated_df: DataFrame with truncated energy arrays
        - truncation_log: DataFrame with truncation statistics per run
    """
    df_truncated = df.copy()
    truncation_log = []
    
    # Get all energy column names
    energy_cols = [col for col in df.columns if col.startswith('energy_')]
    
    for idx, row in df_truncated.iterrows():
        if 'energy_GPU0_USAGE' not in row or row['energy_GPU0_USAGE'] is None:
            truncation_log.append({
                'prompt_template': row['prompt_template'],
                'dataset': row['dataset'],
                'run_id': row['run_id'],
                'truncated_start_count': 0,
                'truncated_end_count': 0,
                'truncated_total_count': 0,
                'original_length': 0,
                'remaining_length': 0,
                'start_truncation_index': None,
                'end_truncation_index': None,
                'status': 'No GPU usage data'
            })
            continue
        
        gpu_usage = row['energy_GPU0_USAGE']
        original_length = len(gpu_usage)
        
        # Find start truncation point
        start_idx = find_sustained_gpu_usage_start(gpu_usage, threshold, min_consecutive)
        
        # Find end truncation point by reversing the array
        end_idx = None
        if truncate_end:
            reversed_start = find_sustained_gpu_usage_start(gpu_usage[::-1], threshold, min_consecutive)
            if reversed_start is not None:
                # Convert reversed index back to original array index
                end_idx = original_length - reversed_start
        
        if start_idx is None and end_idx is None:
            truncation_log.append({
                'prompt_template': row['prompt_template'],
                'dataset': row['dataset'],
                'run_id': row['run_id'],
                'truncated_start_count': 0,
                'truncated_end_count': 0,
                'truncated_total_count': 0,
                'original_length': original_length,
                'remaining_length': original_length,
                'start_truncation_index': None,
                'end_truncation_index': None,
                'status': 'No sustained GPU usage found'
            })
            continue
        
        # Apply truncation
        truncated_start = start_idx if start_idx is not None else 0
        truncated_end = end_idx if end_idx is not None else original_length
        
        # Truncate all energy columns
        for col in energy_cols:
            if col in row and row[col] is not None:
                df_truncated.at[idx, col] = row[col][truncated_start:truncated_end]
        
        truncated_start_count = truncated_start
        truncated_end_count = original_length - truncated_end
        remaining_length = truncated_end - truncated_start
        
        truncation_log.append({
            'prompt_template': row['prompt_template'],
            'dataset': row['dataset'],
            'run_id': row['run_id'],
            'truncated_start_count': truncated_start_count,
            'truncated_end_count': truncated_end_count,
            'truncated_total_count': truncated_start_count + truncated_end_count,
            'original_length': original_length,
            'remaining_length': remaining_length,
            'start_truncation_index': start_idx,
            'end_truncation_index': end_idx,
            'status': 'Truncated'
        })
    
    truncation_log_df = pd.DataFrame(truncation_log)
    
    return df_truncated, truncation_log_df

In [19]:
# Apply truncation at both start and end
df_truncated, truncation_log = truncate_energy_measurements(df, threshold=75, min_consecutive=20, truncate_end=True)

# Display truncation statistics
print("Truncation Summary:")
print(truncation_log[['prompt_template', 'run_id', 'truncated_start_count', 'truncated_end_count', 
                       'truncated_total_count', 'original_length', 'remaining_length', 'status']])

Truncation Summary:
         prompt_template run_id  truncated_start_count  truncated_end_count  \
0    answer_only_no_expl     r1                     46                    2   
1    answer_only_no_expl     r2                     48                    2   
2    answer_only_no_expl     r3                     46                    2   
3   baseline_single_shot     r1                     48                    2   
4   baseline_single_shot     r2                     81                    3   
5   baseline_single_shot     r3                     46                    4   
6     polite_single_shot     r1                     48                    3   
7     polite_single_shot     r2                     46                    2   
8     polite_single_shot     r3                     81                    3   
9     think_step_by_step     r1                     46                    3   
10    think_step_by_step     r2                     48                    0   
11    think_step_by_step     r3 

In [20]:
# Verify truncation worked at both ends
row_idx = 4
print(f"Run: {df.iloc[row_idx]['prompt_template']} - {df.iloc[row_idx]['run_id']}")
print(f"Before truncation - energy_GPU0_USAGE length: {len(df.iloc[row_idx]['energy_GPU0_USAGE'])}")
print(f"After truncation - energy_GPU0_USAGE length: {len(df_truncated.iloc[row_idx]['energy_GPU0_USAGE'])}")

print("\n\nGPU usage before truncation (first 100 values):")
print(df.iloc[row_idx]['energy_GPU0_USAGE'][:100])

print("\n\nGPU usage after truncation (first 100 values):")
print(df_truncated.iloc[row_idx]['energy_GPU0_USAGE'][:100])

print("\n\nGPU usage before truncation (last 100 values):")
print(df.iloc[row_idx]['energy_GPU0_USAGE'][-100:])

print("\n\nGPU usage after truncation (last 100 values):")
print(df_truncated.iloc[row_idx]['energy_GPU0_USAGE'][-100:])

Run: baseline_single_shot - r2
Before truncation - energy_GPU0_USAGE length: 13372
After truncation - energy_GPU0_USAGE length: 13288


GPU usage before truncation (first 100 values):
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  9  9  9 29 29 35 35 35  0  0  0  0  0  0
  0  0  0  0  0  0 24 24 24 90 90 88 88 88 91 91 90 90 90 91 91 89 89 89
 88 88 89 89]


GPU usage after truncation (first 100 values):
[90 90 88 88 88 91 91 90 90 90 91 91 89 89 89 88 88 89 89 89 88 88 90 90
 90 92 92 91 91 91 89 89 89 89 89 89 89 90 90 90 89 89 89 89 89 90 90 91
 91 91 90 90 90 90 90 90 90 92 92 92 91 91 91 91 91 92 92 92 92 92 92 92
 92 92 92 93 93 92 92 92 87 87 89 89 89 90 90 90 90 90 91 91 90 90 90 91
 91 91 91 91]


GPU usage before truncation (last 100 values):
[92 92 91 91 91 92 92 92 92 92 92 92 92 92 92 92 92 90 90 90 90 90 90 90
 90 91 91 90 90 90 91 91 90 90 90

## Prompt Processing

In [21]:
from transformers import AutoTokenizer

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-3b")
print("Tokenizer loaded successfully!")

Loading tokenizer...
Tokenizer loaded successfully!


In [22]:
def allocate_measurements_by_tokens(df_truncated, tokenizer):
    """
    Allocate energy measurements to generations proportionally based on token counts.
    
    Args:
        df_truncated: DataFrame with truncated energy measurements
        tokenizer: Tokenizer to count tokens
    
    Returns:
        DataFrame with additional columns containing segmented energy data
    """
    df_segmented = df_truncated.copy()
    
    # Get all energy column names
    energy_cols = [col for col in df_truncated.columns if col.startswith('energy_')]
    
    # Initialize new columns
    df_segmented['cycle_count'] = 0
    df_segmented['cycle_boundaries'] = None
    df_segmented['token_counts'] = None
    
    for idx, row in df_segmented.iterrows():
        # Check if required columns exist
        if 'results_prompt' not in row or 'results_raw_response' not in row:
            df_segmented.at[idx, 'cycle_count'] = 0
            df_segmented.at[idx, 'cycle_boundaries'] = []
            df_segmented.at[idx, 'token_counts'] = []
            continue
        
        prompts = row['results_prompt']
        raw_responses = row['results_raw_response']
        
        if prompts is None or raw_responses is None:
            df_segmented.at[idx, 'cycle_count'] = 0
            df_segmented.at[idx, 'cycle_boundaries'] = []
            df_segmented.at[idx, 'token_counts'] = []
            continue
        
        # Calculate token counts for each generation
        token_counts = []
        for prompt, response in zip(prompts, raw_responses):
            # Remove prompt prefix from response
            if response.startswith(prompt):
                generated_text = response[len(prompt):]
            else:
                generated_text = response
            
            # Count tokens in generated text
            tokens = tokenizer.encode(generated_text, add_special_tokens=False)
            token_counts.append(len(tokens))
        
        # Calculate total tokens and measurements
        total_tokens = sum(token_counts)
        
        # Get total number of measurements from first energy column
        total_measurements = len(row[energy_cols[0]]) if energy_cols else 0
        
        if total_tokens == 0 or total_measurements == 0:
            df_segmented.at[idx, 'cycle_count'] = 0
            df_segmented.at[idx, 'cycle_boundaries'] = []
            df_segmented.at[idx, 'token_counts'] = []
            continue
        
        # Calculate measurements per token
        measurements_per_token = total_measurements / total_tokens
        
        # Allocate measurements proportionally to each generation
        cycle_boundaries = []
        current_idx = 0
        
        for i, token_count in enumerate(token_counts):
            # Calculate number of measurements for this generation
            if i < len(token_counts) - 1:
                # For all but last, round to nearest integer
                n_measurements = round(token_count * measurements_per_token)
            else:
                # For last generation, use remaining measurements to ensure exact allocation
                n_measurements = total_measurements - current_idx
            
            end_idx = current_idx + n_measurements
            cycle_boundaries.append((current_idx, end_idx))
            current_idx = end_idx
        
        # Store results - use loc instead of at to avoid the array issue
        df_segmented.loc[idx, 'cycle_count'] = len(cycle_boundaries)
        df_segmented.at[idx, 'cycle_boundaries'] = cycle_boundaries
        df_segmented.at[idx, 'token_counts'] = token_counts
        
        # Segment each energy column by cycles
        for col in energy_cols:
            if col in row and row[col] is not None:
                energy_data = row[col]
                segmented_data = [energy_data[start:end] for start, end in cycle_boundaries]
                # Create new column name for cycles
                cycle_col = f'{col}_cycles'
                if cycle_col not in df_segmented.columns:
                    df_segmented[cycle_col] = None
                df_segmented.at[idx, cycle_col] = segmented_data
    
    return df_segmented

In [23]:
# Apply token-based segmentation
print("Allocating measurements based on token counts...")
df_segmented = allocate_measurements_by_tokens(df_truncated, tokenizer)

# Display segmentation summary
print("\nSegmentation Summary (Token-based):")
print(df_segmented[['prompt_template', 'run_id', 'cycle_count']])

# Show detailed info for one run
row_idx = 4
print(f"\n\nDetailed info for run: {df_segmented.iloc[row_idx]['prompt_template']} - {df_segmented.iloc[row_idx]['run_id']}")
print(f"Number of tasks: {len(df_segmented.iloc[row_idx]['results_task_id'])}")
print(f"Number of cycles: {df_segmented.iloc[row_idx]['cycle_count']}")
print(f"Total measurements: {len(df_truncated.iloc[row_idx]['energy_Delta'])}")
print(f"Total tokens generated: {sum(df_segmented.iloc[row_idx]['token_counts'])}")
print(f"Measurements per token: {len(df_truncated.iloc[row_idx]['energy_Delta']) / sum(df_segmented.iloc[row_idx]['token_counts']):.2f}")

print(f"\n\nFirst 10 tasks - Token counts and measurement allocation:")
for i in range(min(10, len(df_segmented.iloc[row_idx]['token_counts']))):
    start, end = df_segmented.iloc[row_idx]['cycle_boundaries'][i]
    n_measurements = end - start
    n_tokens = df_segmented.iloc[row_idx]['token_counts'][i]
    print(f"  Task {i}: {n_tokens:4d} tokens -> {n_measurements:4d} measurements (indices [{start:5d}, {end:5d}))")

Allocating measurements based on token counts...

Segmentation Summary (Token-based):
         prompt_template run_id  cycle_count
0    answer_only_no_expl     r1          164
1    answer_only_no_expl     r2          164
2    answer_only_no_expl     r3          164
3   baseline_single_shot     r1          164
4   baseline_single_shot     r2          164
5   baseline_single_shot     r3          164
6     polite_single_shot     r1          164
7     polite_single_shot     r2          164
8     polite_single_shot     r3          164
9     think_step_by_step     r1          164
10    think_step_by_step     r2          164
11    think_step_by_step     r3          164


Detailed info for run: baseline_single_shot - r2
Number of tasks: 164
Number of cycles: 164
Total measurements: 13288
Total tokens generated: 86764
Measurements per token: 0.15


First 10 tasks - Token counts and measurement allocation:
  Task 0:  512 tokens ->   78 measurements (indices [    0,    78))
  Task 1:  512 tokens 

# finding relevant sections

In [24]:
import re

def extract_valuable_response(response: str, prompt: str) -> str:
    """
    Extract the valuable portion of the response, truncating after the last meaningful content.
    
    Args:
        response: Raw model response
        prompt: The prompt that was sent to the model
    
    Returns:
        Truncated response containing only valuable content
    """
    # Remove the original prompt if it appears at the start
    if response.startswith(prompt):
        response = response[len(prompt):]
    
    # Strategy 1: Extract markdown code blocks
    code_block_pattern = r"```(?:python)?\s*\n?(.*?)```"
    matches = re.findall(code_block_pattern, response, re.DOTALL)
    if matches:
        # Find the position of the last code block
        last_match_end = response.rfind("```")
        if last_match_end != -1:
            # Find the closing ``` 
            return response[:last_match_end + 3]
    
    # Strategy 2: Find function/class definitions
    # Look for the last function or class definition
    func_class_pattern = r"(def\s+\w+|class\s+\w+)"
    all_matches = list(re.finditer(func_class_pattern, response))
    
    if all_matches:
        # Find the last function/class definition
        last_def_start = all_matches[-1].start()
        
        # Try to find the end of this definition
        # Look for: empty lines, next def/class, or common stop markers
        truncate_pos = len(response)
        
        # Check for patterns that indicate end of code
        remaining = response[last_def_start:]
        
        # Find next def/class after some content
        next_def = re.search(r'\n(def\s+\w+|class\s+\w+)', remaining[50:])
        if next_def:
            truncate_pos = last_def_start + 50 + next_def.start()
        
        # Look for common stop sequences
        stop_patterns = [
            r'\n\n\n+',  # Multiple blank lines
            r'\nif __name__',  # Main block
            r'\n#\s*test',  # Test comments (case insensitive)
            r'\n#\s*example',  # Example comments
            r'\nprint\(',  # Print statements (often tests)
        ]
        
        for pattern in stop_patterns:
            match = re.search(pattern, remaining, re.IGNORECASE)
            if match and match.start() > 20:  # Only if we have some content
                truncate_pos = min(truncate_pos, last_def_start + match.start())
        
        return response[:truncate_pos].rstrip()
    
    # Strategy 3: If no clear code structure, look for natural end points
    # Remove trailing test/example code
    stop_sequences = [
        (r'\n\n\n+', 0),  # Multiple blank lines
        (r'\nif __name__', 0),
        (r'\n#.*test', 0),
        (r'\n#.*example', 0),
        (r'\nprint\(', 0),
        (r'\nassert\s', 0),  # Assertions
    ]
    
    result = response
    for pattern, offset in stop_sequences:
        match = re.search(pattern, result, re.IGNORECASE)
        if match and match.start() > 50:  # Ensure we have some content
            result = result[:match.start() + offset]
            break
    
    return result.rstrip()