<a href="https://colab.research.google.com/github/Xue-Zhiming-SMU/Post-Training-Quantization-for-Qwen2.5-3B-Models/blob/main/Copy_of_GenAI_project_BnB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers torch accelerate
!pip install -U datasets
!pip install -U bitsandbytes>=0.39.0
!pip install pynvml torch-summary

# PTQ Evaluation Using BitsAndBytes

In [2]:
# Import libraries
import torch
import torch.nn.functional as F
import time
import random
import gc
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import json
import os
import pynvml
import threading
import statistics
from datasets import load_dataset, DownloadMode

## MMLU Evaluation


In [None]:
# Load MMLU test dataset
from datasets import load_dataset
mmlu_test = load_dataset("cais/mmlu", "all", split="test")

print('----------------------')
print(f"Test set size: {len(mmlu_test)}")
print('----------------------')
print(mmlu_test)

In [5]:
# Set a seed for reproducibility
import random
random.seed(42)

# Take a random sample of 200 from the test dataset
samples_200_mmlu_test = random.sample(list(mmlu_test), 200)

In [6]:
def evaluate_mmlu(model, tokenizer, dataset, max_new_tokens=1):
    # --- Existing setup ---
    start_time = time.time()
    total_tokens_processed = 0
    results = {}
    correct = 0
    total_perplexity = 0
    perplexity_count = 0
    choice_letters = ["A", "B", "C", "D"]
    # --- New tracking lists ---
    memory_readings_gb = [] # To store memory allocated per step
    previous_memory_allocated = 0 # For leak check

    # --- Existing loop ---
    for i, sample in enumerate(tqdm(dataset)):
        # --- For Memory Leak Check ---
        memory_before_sample_gb = torch.cuda.memory_allocated() / (1024 ** 3)
        if i > 0 and memory_before_sample_gb > previous_memory_allocated:
             pass # You can add more sophisticated logging here if needed

        subject = sample["subject"]
        if subject not in results:
             # ... (rest of results initialization) ...
             results[subject] = {"correct": 0, "total": 0, "perplexity": 0, "exact_matches": 0}

        results[subject]["total"] += 1
        question = sample["question"] + "\n"
        # ... (rest of question formatting) ...
        for j, choice_text in enumerate(sample["choices"]):
            choice_letter = choice_letters[j]
            question += f"{choice_letter}. {choice_text}\n"
        question += "Answer:"
        correct_idx = sample["answer"]
        correct_letter = choice_letters[correct_idx]
        inputs = tokenizer(question, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]

        # --- Record Memory BEFORE major operations for this sample ---
        # Taking it here captures memory state just before inference/generation
        current_memory_allocated_gb = torch.cuda.memory_allocated() / (1024 ** 3)
        memory_readings_gb.append(current_memory_allocated_gb)

        with torch.no_grad():
            # --- Existing Perplexity Calculation ---
            try:
                outputs = model(inputs.input_ids, labels=inputs.input_ids)
                loss = outputs.loss
                if not torch.isnan(loss) and not torch.isinf(loss):
                    perplexity = torch.exp(loss).item()
                    if 0 < perplexity < float('inf'):
                        total_perplexity += perplexity
                        results[subject]["perplexity"] += perplexity
                        perplexity_count += 1
                    # else: print(...) # Optional: keep filtering message
                # else: print(...) # Optional: keep NaN/Inf message
            except Exception as e:
                print(f"Perplexity calculation error: {e}")
                perplexity = None

            # --- Existing Generation ---
            gen_outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id
            )

        # --- Existing Prediction Parsing ---
        generated_text = tokenizer.decode(gen_outputs[0], skip_special_tokens=True)
        predicted_text = generated_text[len(question):].strip()
        predicted_letter = None
        # ... (rest of prediction logic) ...
        if predicted_text in choice_letters: predicted_letter = predicted_text
        elif predicted_text and predicted_text[0] in choice_letters: predicted_letter = predicted_text[0]
        else:
             for letter in choice_letters:
                 if letter in predicted_text: predicted_letter = letter; break

        is_correct = predicted_letter == correct_letter
        if is_correct:
            correct += 1
            results[subject]["correct"] += 1

        generated_length = gen_outputs.shape[1] - input_length
        total_tokens_processed += generated_length

        # --- Update memory for leak check AFTER sample processing ---
        previous_memory_allocated = torch.cuda.memory_allocated() / (1024 ** 3)

    # --- Final Calculations ---
    total_time = time.time() - start_time
    accuracy = correct / len(dataset) if len(dataset) > 0 else 0
    avg_perplexity = (total_perplexity / perplexity_count) if perplexity_count > 0 else None
    total_throughput = total_tokens_processed / total_time if total_time > 0 else 0
    avg_inference_speed = total_throughput # Keep consistent if measuring same thing
    avg_inference_latency = (total_time * 1000) / total_tokens_processed if total_tokens_processed > 0 else 0

    # --- NEW: Calculate average memory from readings ---
    avg_memory_allocated_gb = statistics.mean(memory_readings_gb) if memory_readings_gb else 0

    # --- Peak memory still uses torch function (reset before calling evaluate_mmlu) ---
    max_memory_allocated_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)

    for subject in results:
        results[subject]["accuracy"] = results[subject]["correct"] / results[subject]["total"] if results[subject]["total"] > 0 else 0

    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": len(dataset),
        "subject_results": results,
        "inference_speed": avg_inference_speed,
        "throughput": total_throughput,
        "inference_latency": avg_inference_latency,
        "perplexity": avg_perplexity,
        "peak_gpu_memory_gb": max_memory_allocated_gb,
        "avg_gpu_memory_gb": avg_memory_allocated_gb, # Use the new calculated average
    }

In [7]:
# Function to get GPU utilization
gpu_utilization_readings = []
# Event to signal the polling thread to stop
stop_polling_event = threading.Event()

# Function for the polling thread
def poll_gpu_utilization(handle, interval=1.0):
    """Polls GPU utilization at a set interval."""
    global gpu_utilization_readings
    while not stop_polling_event.is_set():
        try:
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            gpu_utilization_readings.append(utilization.gpu)
        except pynvml.NVMLError as e:
            print(f"NVML Error polling utilization: {e}")
            # Decide if you want to break or continue
            time.sleep(interval) # Wait before retrying or next poll
            continue # Skip appending if error occurred
        time.sleep(interval)

In [None]:
# --- FP32 Evaluation Cell ---

if 'handle' not in locals() or not handle:
    print("NVML handle not found. Attempting to initialize NVML...")
    try:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0) # Assuming GPU 0
        print("NVML initialized successfully.")
        # Define the stop event here if this is the first initialization
        if 'stop_polling_event' not in locals():
             stop_polling_event = threading.Event()
    except pynvml.NVMLError as error:
        print(f"Failed to initialize NVML: {error}. Skipping GPU utilization polling.")
        handle = None # Ensure handle is None if init failed
elif 'stop_polling_event' not in locals():
     # If handle exists but event doesn't, create event
     print("NVML handle found, initializing stop event.")
     stop_polling_event = threading.Event()


# --- Polling Setup ---
# Clear previous readings and reset event for this specific evaluation
gpu_utilization_readings = []
if 'stop_polling_event' in locals(): # Ensure event exists before clearing
    stop_polling_event.clear()
else:
    # This case should ideally not happen if initialized correctly above/before
    print("Warning: stop_polling_event not defined. Polling might not stop correctly.")
    stop_polling_event = threading.Event() # Define as fallback

# Start GPU polling thread (only if NVML handle exists)
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()
else:
    # Message already printed during init check if NVML failed
    pass

# --- Evaluation ---
# Clean memory before starting model loading/evaluation
torch.cuda.empty_cache()
gc.collect()

fp32_metrics = {}
fp32_results_data = {} # To store results

# Define model variable outside try for finally block access
model = None

try:
    # Load full precision model (Use your actual model name)
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (FP32)...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32, # Explicitly FP32
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer (can often reuse if already loaded)
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- reset_peak_memory_stats REMOVED ---

    # Run evaluation
    print("Starting MMLU evaluation...")
    fp32_results_data = evaluate_mmlu(model, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop the polling thread AFTER evaluation is done (if it was started)
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        print("Stopping GPU polling thread...")
        stop_polling_event.set()
        polling_thread.join(timeout=5) # Add timeout for safety
        if polling_thread.is_alive():
             print("Warning: Polling thread did not stop in time.")
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
            print(f"Average GPU Utilization: {avg_gpu_utilization:.2f}%")
        else:
             # If polling ran but got no readings (e.g., error in poll function)
             print("Polling ran but collected no readings.")
             avg_gpu_utilization = 0
        print("Polling stopped.")
    elif handle: # Polling should have started but didn't/failed early
        print("Polling thread was not running or failed to start correctly.")
        avg_gpu_utilization = 0
    else: # NVML wasn't initialized
        print("GPU polling was skipped (NVML not initialized).")
        avg_gpu_utilization = 'N/A'


    # Collect the desired metrics
    print("Collecting metrics...")
    fp32_metrics = {
        "PPL (Perplexity)": fp32_results_data.get('perplexity', 'N/A'),
        "Accuracy": fp32_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": fp32_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # Use calculated average
        "Avg GPU Memory Allocated (GB)": fp32_results_data.get('avg_gpu_memory_gb', 'N/A'), # Use calculated average
    }

    # Print metrics in a formatted way
    print("\n===== FP32 DETAILED MODEL METRICS =====")
    print("-" * 50)
    # Ensure metrics dictionary is not empty before finding max length
    if fp32_metrics:
        max_key_length = max(len(key) for key in fp32_metrics.keys())
        for key, value in fp32_metrics.items():
             # Basic type checking for formatting
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP32 evaluation steps: {nvml_error}")
except Exception as e:
    print(f"FP32 Evaluation error: {e}")
    # Ensure polling stops even if there's an error mid-evaluation
    if polling_thread and polling_thread.is_alive():
        print("Stopping polling thread due to evaluation error...")
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # --- Correct Cleanup (No NVML Shutdown Here) ---
    print("Starting cleanup for FP32 cell...")
    # Clean up GPU memory specific to this cell's model
    if 'model' in locals() and model is not None: # Check if model exists
        del model
        print("FP32 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP32 run.")

In [None]:
# --- FP16 Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()


# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

fp16_metrics = {}
fp16_results_data = {}
model_fp16 = None # Define outside try

try:
    # Load FP16 model
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (FP16)...")
    model_fp16 = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16, # Explicitly FP16
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    fp16_results_data = evaluate_mmlu(model_fp16, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    fp16_metrics = {
        "PPL (Perplexity)": fp16_results_data.get('perplexity', 'N/A'),
        "Accuracy": fp16_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_fp16.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": fp16_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": fp16_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== FP16 DETAILED MODEL METRICS =====")
    print("-" * 50)
    if fp16_metrics:
        max_key_length = max(len(key) for key in fp16_metrics.keys())
        for key, value in fp16_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP16 evaluation steps: {nvml_error}")
except Exception as e:
    print(f"FP16 Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP16 cell...")
    if 'model_fp16' in locals() and model_fp16 is not None:
        del model_fp16
        print("FP16 model deleted.")
    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP16 run.")

Loading model: Qwen/Qwen2.5-3B (FP16)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [00:21<00:00,  9.18it/s]


MMLU evaluation finished.

===== FP16 DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 7.0464
Accuracy                           : 61.0000
Memory Footprint (Model Size) (GB) : 5.7480
Inference Latency (ms/token)       : 108.9004
Avg GPU Utilization (%)            : 57.9231
Avg GPU Memory Allocated (GB)      : 5.7913
--------------------------------------------------
Starting cleanup for FP16 cell...
FP16 model deleted.
GPU cache cleared and garbage collected after FP16 run.


In [None]:
# --- FP32 to NF4 Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()


# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

# Configure quantization to NF4
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True
)

nf4_metrics = {}
nf4_results_data = {}
model_quantized = None # Define outside try

try:
    # Load quantized model
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (NF4 Quantized)...")
    model_quantized = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    nf4_results_data = evaluate_mmlu(model_quantized, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    nf4_metrics = {
        "PPL (Perplexity)": nf4_results_data.get('perplexity', 'N/A'),
        "Accuracy": nf4_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_quantized.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": nf4_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": nf4_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== NF4 DETAILED MODEL METRICS =====")
    print("-" * 50)
    if nf4_metrics:
        max_key_length = max(len(key) for key in nf4_metrics.keys())
        for key, value in nf4_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during NF4 evaluation steps: {nvml_error}")
except Exception as e:
    print(f"NF4 Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for NF4 cell...")
    if 'model_quantized' in locals() and model_quantized is not None:
        del model_quantized
        print("NF4 model deleted.")
    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after NF4 run.")

Loading model: Qwen/Qwen2.5-3B (NF4 Quantized)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [01:22<00:00,  2.42it/s]


MMLU evaluation finished.

===== NF4 DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 7.9842
Accuracy                           : 58.5000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 412.4234
Avg GPU Utilization (%)            : 79.9706
Avg GPU Memory Allocated (GB)      : 1.9578
--------------------------------------------------
Starting cleanup for NF4 cell...
NF4 model deleted.
GPU cache cleared and garbage collected after NF4 run.


In [None]:
# --- NF4 (FP16 Compute) Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

# Configure quantization to NF4 with FP16 compute dtype
bnb_config_fp16_compute = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Use FP16 for computation
    bnb_4bit_use_double_quant=True
)

nf4_fp16_compute_metrics = {}
nf4_fp16_results_data = {}
model_quantized_fp16_compute = None # Define outside try

try:
    # Load the quantized model with the FP16 compute config
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (NF4 Quantized, FP16 Compute)...")
    model_quantized_fp16_compute = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config_fp16_compute, # Use the specific config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    nf4_fp16_results_data = evaluate_mmlu(model_quantized_fp16_compute, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    nf4_fp16_compute_metrics = {
        "PPL (Perplexity)": nf4_fp16_results_data.get('perplexity', 'N/A'),
        "Accuracy": nf4_fp16_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_quantized_fp16_compute.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": nf4_fp16_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": nf4_fp16_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== NF4 (FP16 COMPUTE) DETAILED MODEL METRICS =====")
    print("-" * 50)
    if nf4_fp16_compute_metrics:
        max_key_length = max(len(key) for key in nf4_fp16_compute_metrics.keys())
        for key, value in nf4_fp16_compute_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during NF4 (FP16 Compute) evaluation steps: {nvml_error}")
except Exception as e:
    print(f"NF4 (FP16 Compute) Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for NF4 (FP16 Compute) cell...")
    if 'model_quantized_fp16_compute' in locals() and model_quantized_fp16_compute is not None:
        del model_quantized_fp16_compute
        print("NF4 (FP16 compute) model deleted.")
    # Removed cleanup for 'model_fp16_temp' as it's no longer loaded

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after NF4 (FP16 Compute) run.")

Loading model: Qwen/Qwen2.5-3B (NF4 Quantized, FP16 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [00:39<00:00,  5.07it/s]


MMLU evaluation finished.

===== NF4 (FP16 COMPUTE) DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 7.9823
Accuracy                           : 58.5000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 197.2757
Avg GPU Utilization (%)            : 47.0217
Avg GPU Memory Allocated (GB)      : 1.9578
--------------------------------------------------
Starting cleanup for NF4 (FP16 Compute) cell...
NF4 (FP16 compute) model deleted.
GPU cache cleared and garbage collected after NF4 (FP16 Compute) run.


In [None]:
# --- FP4 (FP32 Compute) Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

# Configure quantization to FP4 with FP32 compute dtype
bnb_config_fp4 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",           # Quantization type is FP4
    bnb_4bit_compute_dtype=torch.float32,# Compute type is FP32
    bnb_4bit_use_double_quant=True
)

fp4_metrics = {}
fp4_results_data = {}
model_quantized_fp4 = None # Define outside try

try:
    # Load the quantized model with the FP4 config
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (FP4 Quantized, FP32 Compute)...")
    model_quantized_fp4 = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config_fp4, # Use the FP4 config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    fp4_results_data = evaluate_mmlu(model_quantized_fp4, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    fp4_metrics = {
        "PPL (Perplexity)": fp4_results_data.get('perplexity', 'N/A'),
        "Accuracy": fp4_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_quantized_fp4.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": fp4_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": fp4_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== FP4 (FP32 COMPUTE) DETAILED MODEL METRICS =====")
    print("-" * 50)
    if fp4_metrics:
        max_key_length = max(len(key) for key in fp4_metrics.keys())
        for key, value in fp4_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP4 (FP32 Compute) evaluation steps: {nvml_error}")
except Exception as e:
    print(f"FP4 (FP32 Compute) Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP4 (FP32 Compute) cell...")
    if 'model_quantized_fp4' in locals() and model_quantized_fp4 is not None:
        del model_quantized_fp4
        print("FP4 (FP32 Compute) model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP4 (FP32 Compute) run.")

Loading model: Qwen/Qwen2.5-3B (FP4 Quantized, FP32 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [01:20<00:00,  2.47it/s]


MMLU evaluation finished.

===== FP4 (FP32 COMPUTE) DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 9.2376
Accuracy                           : 50.5000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 404.7842
Avg GPU Utilization (%)            : 88.9111
Avg GPU Memory Allocated (GB)      : 1.9578
--------------------------------------------------
Starting cleanup for FP4 (FP32 Compute) cell...
FP4 (FP32 Compute) model deleted.
GPU cache cleared and garbage collected after FP4 (FP32 Compute) run.


In [None]:
# --- FP4 (FP16 Compute) Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

# Configure quantization to FP4 with FP16 compute dtype
bnb_config_fp4_fp16 = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",           # Quantization type is FP4
    bnb_4bit_compute_dtype=torch.float16,# Compute type is FP16
    bnb_4bit_use_double_quant=True
)

fp4_fp16_metrics = {}
fp4_fp16_results_data = {}
model_quantized_fp4_fp16 = None # Define outside try

try:
    # Load the quantized model with the FP4 (FP16 compute) config
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (FP4 Quantized, FP16 Compute)...")
    model_quantized_fp4_fp16 = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config_fp4_fp16, # Use the FP4/FP16 config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    fp4_fp16_results_data = evaluate_mmlu(model_quantized_fp4_fp16, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    fp4_fp16_metrics = {
        "PPL (Perplexity)": fp4_fp16_results_data.get('perplexity', 'N/A'),
        "Accuracy": fp4_fp16_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_quantized_fp4_fp16.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": fp4_fp16_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": fp4_fp16_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== FP4 (FP16 COMPUTE) DETAILED MODEL METRICS =====")
    print("-" * 50)
    if fp4_fp16_metrics:
        max_key_length = max(len(key) for key in fp4_fp16_metrics.keys())
        for key, value in fp4_fp16_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP4 (FP16 Compute) evaluation steps: {nvml_error}")
except Exception as e:
    print(f"FP4 (FP16 Compute) Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP4 (FP16 Compute) cell...")
    if 'model_quantized_fp4_fp16' in locals() and model_quantized_fp4_fp16 is not None:
        del model_quantized_fp4_fp16
        print("FP4 (FP16 Compute) model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP4 (FP16 Compute) run.")

Loading model: Qwen/Qwen2.5-3B (FP4 Quantized, FP16 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [00:39<00:00,  5.06it/s]


MMLU evaluation finished.

===== FP4 (FP16 COMPUTE) DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 9.2318
Accuracy                           : 51.0000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 197.4802
Avg GPU Utilization (%)            : 45.5652
Avg GPU Memory Allocated (GB)      : 1.9578
--------------------------------------------------
Starting cleanup for FP4 (FP16 Compute) cell...
FP4 (FP16 Compute) model deleted.
GPU cache cleared and garbage collected after FP4 (FP16 Compute) run.


In [None]:
# --- INT8 (FP32 Compute) Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

# Configure quantization to INT8 using load_in_8bit
bnb_config_int8 = BitsAndBytesConfig(
    load_in_8bit=True
)

int8_metrics = {}
int8_results_data = {}
model_quantized_int8 = None # Define outside try

try:
    # Load the INT8 quantized model
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (INT8 Quantized)...")
    model_quantized_int8 = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config_int8, # Use the INT8 config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    int8_results_data = evaluate_mmlu(model_quantized_int8, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    int8_metrics = {
        "PPL (Perplexity)": int8_results_data.get('perplexity', 'N/A'),
        "Accuracy": int8_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_quantized_int8.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": int8_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": int8_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== INT8 DETAILED MODEL METRICS =====")
    print("-" * 50)
    if int8_metrics:
        max_key_length = max(len(key) for key in int8_metrics.keys())
        for key, value in int8_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during INT8 evaluation steps: {nvml_error}")
except Exception as e:
    print(f"INT8 Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for INT8 cell...")
    if 'model_quantized_int8' in locals() and model_quantized_int8 is not None:
        del model_quantized_int8
        print("INT8 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after INT8 run.")

Loading model: Qwen/Qwen2.5-3B (INT8 Quantized)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [01:21<00:00,  2.44it/s]


MMLU evaluation finished.

===== INT8 DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 7.0461
Accuracy                           : 60.0000
Memory Footprint (Model Size) (GB) : 3.1640
Inference Latency (ms/token)       : 409.0462
Avg GPU Utilization (%)            : 20.2667
Avg GPU Memory Allocated (GB)      : 3.2600
--------------------------------------------------
Starting cleanup for INT8 cell...
INT8 model deleted.
GPU cache cleared and garbage collected after INT8 run.


In [None]:
# --- INT8 (FP16 Compute) Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

# Configure quantization to INT8, explicitly requesting FP16 compute dtype
bnb_config_int8_fp16 = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16 # Explicitly requested compute dtype
)

int8_fp16_metrics = {}
int8_fp16_results_data = {}
model_quantized_int8_fp16 = None # Define outside try

try:
    # Load the INT8 quantized model with specified compute config
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (INT8 Quantized, FP16 Compute)...")
    model_quantized_int8_fp16 = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config_int8_fp16, # Use the INT8/FP16 config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # Run evaluation
    print("Starting MMLU evaluation...")
    int8_fp16_results_data = evaluate_mmlu(model_quantized_int8_fp16, tokenizer, samples_200_mmlu_test, max_new_tokens=1)
    print("MMLU evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics
    int8_fp16_metrics = {
        "PPL (Perplexity)": int8_fp16_results_data.get('perplexity', 'N/A'),
        "Accuracy": int8_fp16_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_quantized_int8_fp16.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": int8_fp16_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization,
        "Avg GPU Memory Allocated (GB)": int8_fp16_results_data.get('avg_gpu_memory_gb', 'N/A'),
    }

    # Print metrics
    print("\n===== INT8 (FP16 COMPUTE) DETAILED MODEL METRICS =====")
    print("-" * 50)
    if int8_fp16_metrics:
        max_key_length = max(len(key) for key in int8_fp16_metrics.keys())
        for key, value in int8_fp16_metrics.items():
             if isinstance(value, (float, int)) and value != 'N/A':
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 50)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during INT8 (FP16 Compute) evaluation steps: {nvml_error}")
except Exception as e:
    print(f"INT8 (FP16 Compute) Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for INT8 (FP16 Compute) cell...")
    if 'model_quantized_int8_fp16' in locals() and model_quantized_int8_fp16 is not None:
        del model_quantized_int8_fp16
        print("INT8 (FP16 compute) model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after INT8 (FP16 Compute) run.")

Loading model: Qwen/Qwen2.5-3B (INT8 Quantized, FP16 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting MMLU evaluation...


100%|██████████| 200/200 [01:21<00:00,  2.45it/s]


MMLU evaluation finished.

===== INT8 (FP16 COMPUTE) DETAILED MODEL METRICS =====
--------------------------------------------------
PPL (Perplexity)                   : 7.0461
Accuracy                           : 60.0000
Memory Footprint (Model Size) (GB) : 3.1640
Inference Latency (ms/token)       : 409.0026
Avg GPU Utilization (%)            : 20.3636
Avg GPU Memory Allocated (GB)      : 3.2600
--------------------------------------------------
Starting cleanup for INT8 (FP16 Compute) cell...
INT8 (FP16 compute) model deleted.
GPU cache cleared and garbage collected after INT8 (FP16 Compute) run.


In [None]:
# Load ARC test dataset
arc_test = load_dataset("allenai/ai2_arc", "ARC-Easy", split="test")

print('----------------------')
print(f"Test set size: {len(arc_test)}")
print('----------------------')
print(arc_test)

README.md:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

----------------------
Test set size: 2376
----------------------
Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 2376
})


In [None]:
# Set a seed for reproducibility
random.seed(42)

# Take a random sample of 200 from the test dataset
samples_200_arc_test = random.sample(list(arc_test), 200)

In [None]:
import time
import torch
import statistics # Needed for mean calculation
from tqdm.auto import tqdm # Use auto version for better notebook compatibility

def evaluate_arc(model, tokenizer, dataset, max_new_tokens=1):
    """
    Comprehensive evaluation function for ARC dataset
    Aligned with MMLU evaluation approach, including average memory tracking.
    """
    # Performance tracking
    start_time = time.time()
    total_tokens_processed = 0
    results = {}
    gpu_memory_readings_gb = [] # <<< Initialize list to store memory readings

    # Accuracy tracking
    correct = 0
    total_perplexity = 0
    perplexity_count = 0
    choice_letters = ["A", "B", "C", "D"] # Keep for reference if needed later

    # Ensure model is on GPU if available for memory tracking
    device = model.device
    if not str(device).startswith("cuda"):
        print("Warning: Model is not on CUDA device. GPU memory metrics will be 0.")

    # Reset peak memory stats *before* the loop if you want peak for this specific eval
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats(device=device)

    for sample in tqdm(dataset, desc="Evaluating ARC"):
        # Extract subject/category (using 'id' as category source for ARC)
        # ARC doesn't have formal subjects like MMLU, using the first part of ID or 'default'
        subject = sample['id'].split('_')[0] if '_' in sample['id'] else 'default'

        # Initialize subject results (renamed key for clarity)
        if subject not in results:
            results[subject] = {
                "correct": 0,
                "total": 0,
                "perplexity_sum": 0, # Store sum for later averaging per category
                "perplexity_count": 0
                # "exact_matches" was not used, removed for now
            }

        results[subject]["total"] += 1

        # Format question and choices
        question = sample["question"] + "\n"
        choices = sample["choices"]["text"]
        choice_labels = sample["choices"]["label"] # e.g., ['A', 'B', 'C', 'D'] or [1, 2, 3, 4]

        # Handle numeric or letter choice labels consistently
        choice_map = {}
        formatted_choices = []
        for i, choice_text in enumerate(choices):
            choice_label = str(choice_labels[i]) # Ensure string
            formatted_choices.append(f"{choice_label}. {choice_text}")
            choice_map[choice_label] = choice_text # Store for checking prediction
        question += "\n".join(formatted_choices) + "\nAnswer:"

        # Get the correct answer key
        correct_label = str(sample["answerKey"]) # Ensure string

        # Tokenize and generate
        try:
            inputs = tokenizer(question, return_tensors="pt").to(device)
            input_length = inputs.input_ids.shape[1]
        except Exception as e:
            print(f"Tokenization error for sample ID {sample.get('id', 'N/A')}: {e}")
            continue # Skip this sample

        # Performance and accuracy tracking within no_grad context
        with torch.no_grad():
            # Robust Perplexity Calculation
            try:
                outputs = model(inputs.input_ids, labels=inputs.input_ids)
                loss = outputs.loss

                if loss is not None and not torch.isnan(loss) and not torch.isinf(loss):
                    perplexity_val = torch.exp(loss).item()
                    if 0 < perplexity_val < float('inf'): # Filter extreme values
                        total_perplexity += perplexity_val
                        results[subject]["perplexity_sum"] += perplexity_val
                        results[subject]["perplexity_count"] += 1
                        perplexity_count += 1
                    # else: print(f"Filtered extreme perplexity: {perplexity_val}") # Optional: Debugging
                # else: print("Skipping NaN/Inf/None loss") # Optional: Debugging
            except Exception as e:
                print(f"Perplexity calculation error for sample ID {sample.get('id', 'N/A')}: {e}")
                # perplexity_val = None # Not strictly needed here

            # Generate outputs
            try:
                gen_outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=tokenizer.eos_token_id,
                    do_sample=False # Use greedy decoding for consistency
                )
                generated_length = gen_outputs.shape[1] - input_length
                total_tokens_processed += generated_length # Count only generated tokens

                # Decode the generated part
                generated_token_ids = gen_outputs[0, input_length:]
                predicted_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True).strip()

            except Exception as e:
                 print(f"Generation error for sample ID {sample.get('id', 'N/A')}: {e}")
                 predicted_text = "" # Assume failure
                 generated_length = 0 # Don't count tokens if generation failed

            # --- Track GPU Memory After Inference Step ---
            if torch.cuda.is_available():
                gpu_memory_readings_gb.append(torch.cuda.memory_allocated(device=device) / (1024 ** 3))
            # ---------------------------------------------

        # Simplified Prediction Logic (Handles cases like "A", "A.", "A)")
        predicted_label = None
        cleaned_prediction = predicted_text.strip().upper() # Normalize prediction slightly
        if cleaned_prediction:
            # Check if the first character is a valid choice label
            first_char = cleaned_prediction[0]
            if first_char in choice_map:
                 predicted_label = first_char

        # Accuracy tracking
        is_correct = (predicted_label == correct_label)
        if is_correct:
            correct += 1
            results[subject]["correct"] += 1

    # <<< Final calculations >>>

    total_time = time.time() - start_time
    # Avoid division by zero if dataset is empty or no tokens processed
    num_samples = len(dataset)
    accuracy = (correct / num_samples) if num_samples > 0 else 0

    # Robust average perplexity calculation
    avg_perplexity = (total_perplexity / perplexity_count) if perplexity_count > 0 else None

    # Performance metrics
    avg_inference_latency_ms = ((total_time * 1000) / total_tokens_processed) if total_tokens_processed > 0 else 0
    # Note: Speed/Throughput definitions can vary. Here: Avg tokens/sec over total time.
    avg_tokens_per_sec = total_tokens_processed / total_time if total_time > 0 else 0

    # --- Calculate Average and Peak Memory ---
    avg_gpu_memory_allocated_gb = 0
    peak_gpu_memory_gb = 0
    if torch.cuda.is_available():
        if gpu_memory_readings_gb:
            # Use statistics.mean for safety
            try:
                avg_gpu_memory_allocated_gb = statistics.mean(gpu_memory_readings_gb)
            except statistics.StatisticsError: # Handle empty list case just in case
                 avg_gpu_memory_allocated_gb = 0
        # Get peak memory recorded *during* the loop
        peak_gpu_memory_gb = torch.cuda.max_memory_allocated(device=device) / (1024 ** 3)
    # -----------------------------------------

    # Calculate subject-level accuracy and average perplexity
    category_results_final = {}
    for subject, data in results.items():
        cat_accuracy = (data["correct"] / data["total"]) if data["total"] > 0 else 0
        cat_avg_perplexity = (data["perplexity_sum"] / data["perplexity_count"]) if data["perplexity_count"] > 0 else None
        category_results_final[subject] = {
            "accuracy": cat_accuracy,
            "correct": data["correct"],
            "total": data["total"],
            "avg_perplexity": cat_avg_perplexity
        }

    # <<< Return Dictionary with Correct Keys >>>
    return {
        # Overall Metrics
        "accuracy": accuracy,
        "perplexity": avg_perplexity, # Overall average perplexity
        "inference_latency": avg_inference_latency_ms, # Renamed for clarity (ms/token)

        # Performance Metrics (Optional but potentially useful)
        # "inference_speed_tps": avg_tokens_per_sec, # Tokens per second (overall)
        # "total_time_sec": total_time,
        # "total_tokens_processed": total_tokens_processed,

        # Memory Metrics (Corrected)
        "peak_gpu_memory_gb": peak_gpu_memory_gb, # Peak during evaluation
        "avg_gpu_memory_gb": avg_gpu_memory_allocated_gb, # Average of readings during eval

        # Aggregate Counts
        # "total_correct": correct,
        # "total_samples": num_samples,

        # Category/Subject level results
        "category_results": category_results_final,
    }


In [None]:
# --- FP32 ARC-easy Benchmark Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

fp32_arc_metrics = {}
fp32_arc_results_data = {}
model_fp32 = None # Define outside try

try:
    # Load full precision model (FP32)
    model_name = "Qwen/Qwen2.5-3B"
    print(f"Loading model: {model_name} (FP32)...")
    model_fp32 = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation
    print("Starting ARC evaluation...")
    # *** Assuming evaluate_arc returns keys similar to evaluate_mmlu ***
    # Specifically: 'perplexity', 'accuracy', 'inference_latency', 'avg_gpu_memory_gb'
    fp32_arc_results_data = evaluate_arc(model_fp32, tokenizer, samples_200_arc_test, max_new_tokens=1)
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format
    fp32_arc_metrics = {
        "PPL (Perplexity)": fp32_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": fp32_arc_results_data.get('accuracy', 'N/A') * 100,
        "Memory Footprint (Model Size) (GB)": model_fp32.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": fp32_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": fp32_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== FP32 ARC DETAILED MODEL METRICS =====")
    print("-" * 60)
    if fp32_arc_metrics:
        # Ensure consistent key width for alignment
        try:
            max_key_length = max(len(key) for key in fp32_arc_metrics.keys())
        except ValueError: # Handle case where dict might be empty if error occurred before population
             max_key_length = 35 # Default width

        for key, value in fp32_arc_metrics.items():
             # Format floating point numbers, handle 'N/A'
             if isinstance(value, (float, int)):
                 # Use fixed precision for consistency
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 # Print N/A or other strings directly
                 print(f"{key.ljust(max_key_length)} : {value}")
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP32 ARC evaluation steps: {nvml_error}")
except Exception as e:
    print(f"FP32 ARC Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP32 ARC cell...")
    if 'model_fp32' in locals() and model_fp32 is not None:
        del model_fp32
        print("FP32 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP32 ARC run.")

Loading model: Qwen/Qwen2.5-3B (FP32)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== FP32 ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 6.4848
Accuracy                           : 90.5000
Memory Footprint (Model Size) (GB) : 11.4960
Inference Latency (ms/token)       : 174.9104
Avg GPU Utilization (%)            : 87.0769
Avg GPU Memory Allocated (GB)      : 11.5528
------------------------------------------------------------
Starting cleanup for FP32 ARC cell...
FP32 model deleted.
GPU cache cleared and garbage collected after FP32 ARC run.


In [None]:
# --- FP16 ARC-easy Benchmark Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

fp16_arc_metrics = {}
fp16_arc_results_data = {}
model_fp16 = None # Define outside try

try:
    # Load half precision model (FP16)
    model_name = "Qwen/Qwen2.5-3B" # Same base model
    print(f"Loading model: {model_name} (FP16)...")
    model_fp16 = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16, # <--- Use FP16
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer (usually the same for different precisions)
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the updated evaluate_arc function
    print("Starting ARC evaluation (FP16)...")
    # Assuming evaluate_arc can handle fp16 and returns the required keys
    fp16_arc_results_data = evaluate_arc(model_fp16, tokenizer, samples_200_arc_test, max_new_tokens=1)
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = fp16_arc_results_data.get('accuracy', 'N/A')
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    fp16_arc_metrics = {
        "PPL (Perplexity)": fp16_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        "Memory Footprint (Model Size) (GB)": model_fp16.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": fp16_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": fp16_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== FP16 ARC DETAILED MODEL METRICS =====")
    print("-" * 60)
    if fp16_arc_metrics:
        try:
            max_key_length = max(len(key) for key in fp16_arc_metrics.keys())
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in fp16_arc_metrics.items():
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP16 ARC evaluation steps: {nvml_error}")
except Exception as e:
    print(f"FP16 ARC Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP16 ARC cell...")
    if 'model_fp16' in locals() and model_fp16 is not None: # Check for model_fp16
        del model_fp16 # Delete model_fp16
        print("FP16 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP16 ARC run.")

Loading model: Qwen/Qwen2.5-3B (FP16)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (FP16)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== FP16 ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 6.4846
Accuracy                           : 90.5000
Memory Footprint (Model Size) (GB) : 5.7480
Inference Latency (ms/token)       : 105.6825
Avg GPU Utilization (%)            : 56.4800
Avg GPU Memory Allocated (GB)      : 5.7745
------------------------------------------------------------
Starting cleanup for FP16 ARC cell...
FP16 model deleted.
GPU cache cleared and garbage collected after FP16 ARC run.


In [None]:
# --- NF4 (FP32 Compute) ARC-easy Benchmark Evaluation Cell ---

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

nf4_arc_metrics = {}
nf4_arc_results_data = {}
model_nf4 = None # Define outside try

try:
    # Configure quantization to NF4
    print("Configuring NF4 quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # Normalized Float 4 format
        bnb_4bit_compute_dtype=torch.float32, # Or torch.bfloat16 if supported and desired
        bnb_4bit_use_double_quant=True
    )
    print("Quantization config created.")

    # Load quantized model (NF4)
    model_name = "Qwen/Qwen2.5-3B" # Base model name
    print(f"Loading model: {model_name} (NF4 Quantized)...")
    model_nf4 = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config, # <--- Use NF4 quantization config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the evaluate_arc function
    print("Starting ARC evaluation (NF4)...")
    # Assuming evaluate_arc can handle the quantized model and returns required keys
    # Using max_new_tokens=1 for consistency with FP16/FP32 cells
    nf4_arc_results_data = evaluate_arc(model_nf4, tokenizer, samples_200_arc_test, max_new_tokens=1)
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = nf4_arc_results_data.get('accuracy', 'N/A')
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    nf4_arc_metrics = {
        "PPL (Perplexity)": nf4_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        "Memory Footprint (Model Size) (GB)": model_nf4.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": nf4_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": nf4_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== NF4 ARC DETAILED MODEL METRICS =====")
    print("-" * 60)
    if nf4_arc_metrics:
        try:
            max_key_length = max(len(key) for key in nf4_arc_metrics.keys())
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in nf4_arc_metrics.items():
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during NF4 ARC evaluation steps: {nvml_error}")
except Exception as e:
    print(f"NF4 ARC Evaluation error: {e}")
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for NF4 ARC cell...")
    if 'model_nf4' in locals() and model_nf4 is not None: # Check for model_nf4
        del model_nf4 # Delete model_nf4
        print("NF4 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after NF4 ARC run.")

Configuring NF4 quantization...
Quantization config created.
Loading model: Qwen/Qwen2.5-3B (NF4 Quantized)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (NF4)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== NF4 ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 7.3644
Accuracy                           : 90.0000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 343.5057
Avg GPU Utilization (%)            : 86.4872
Avg GPU Memory Allocated (GB)      : 1.9410
------------------------------------------------------------
Starting cleanup for NF4 ARC cell...
NF4 model deleted.
GPU cache cleared and garbage collected after NF4 ARC run.


In [None]:
# --- NF4 (FP16 Compute) ARC-easy Benchmark Evaluation Cell --- # <<< Title updated for FP16 compute

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

nf4_arc_metrics = {}
nf4_arc_results_data = {}
model_nf4 = None # Define outside try

try:
    # Configure quantization to NF4 with FP16 compute
    print("Configuring NF4 quantization (FP16 Compute)...") # <<< Log updated
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",          # Normalized Float 4 format
        bnb_4bit_compute_dtype=torch.float16, # <--- Compute dtype set to FP16
        bnb_4bit_use_double_quant=True
    )
    print("Quantization config created.")

    # Load quantized model (NF4)
    model_name = "Qwen/Qwen2.5-3B" # Base model name
    print(f"Loading model: {model_name} (NF4 Quantized, FP16 Compute)...") # <<< Log updated
    model_nf4 = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config, # Use NF4 quantization config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")
    # --- Removed loading of separate FP16 model ---

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the evaluate_arc function
    print("Starting ARC evaluation (NF4, FP16 Compute)...") # <<< Log updated
    # Assuming evaluate_arc can handle the quantized model and returns required keys
    # Using max_new_tokens=1 for consistency with the format
    nf4_arc_results_data = evaluate_arc(model_nf4, tokenizer, samples_200_arc_test, max_new_tokens=1) # <<< max_new_tokens changed to 1
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = nf4_arc_results_data.get('accuracy', 'N/A')
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    nf4_arc_metrics = {
        "PPL (Perplexity)": nf4_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        "Memory Footprint (Model Size) (GB)": model_nf4.get_memory_footprint() / (1024 ** 3),
        "Inference Latency (ms/token)": nf4_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": nf4_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    # <<< Title updated for FP16 compute
    print("\n===== NF4 (FP16 Compute) ARC DETAILED MODEL METRICS =====")
    print("-" * 60)
    if nf4_arc_metrics:
        try:
            max_key_length = max(len(key) for key in nf4_arc_metrics.keys())
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in nf4_arc_metrics.items():
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during NF4 (FP16 Compute) ARC evaluation steps: {nvml_error}") # <<< Log updated
except Exception as e:
    print(f"NF4 (FP16 Compute) ARC Evaluation error: {e}") # <<< Log updated
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for NF4 (FP16 Compute) ARC cell...")
    if 'model_nf4' in locals() and model_nf4 is not None: # Check for model_nf4
        del model_nf4 # Delete model_nf4
        print("NF4 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after NF4 (FP16 Compute) ARC run.")

Configuring NF4 quantization (FP16 Compute)...
Quantization config created.
Loading model: Qwen/Qwen2.5-3B (NF4 Quantized, FP16 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (NF4, FP16 Compute)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== NF4 (FP16 Compute) ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 7.3646
Accuracy                           : 90.0000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 194.0452
Avg GPU Utilization (%)            : 40.7292
Avg GPU Memory Allocated (GB)      : 1.9410
------------------------------------------------------------
Starting cleanup for NF4 (FP16 Compute) ARC cell...
NF4 model deleted.
GPU cache cleared and garbage collected after NF4 (FP16 Compute) ARC run.


In [None]:
# --- FP4 (FP32 Compute) ARC-easy Benchmark Evaluation Cell --- # <<< Title updated

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

fp4_arc_metrics = {}          # <<< Variable renamed
fp4_arc_results_data = {}     # <<< Variable renamed
model_fp4 = None # Define outside try <<< Variable renamed

try:
    # Configure quantization to FP4
    print("Configuring FP4 quantization (FP32 Compute)...") # <<< Log updated
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="fp4",             # <--- Set to FP4
        bnb_4bit_compute_dtype=torch.float32, # <--- Set to FP32 compute
        bnb_4bit_use_double_quant=True         # Or False depending on desired config
    )
    print("Quantization config created.")

    # Load quantized model (FP4)
    model_name = "Qwen/Qwen2.5-3B" # Base model name
    print(f"Loading model: {model_name} (FP4 Quantized, FP32 Compute)...") # <<< Log updated
    model_fp4 = AutoModelForCausalLM.from_pretrained( # <<< Variable renamed
        model_name,
        quantization_config=bnb_config, # Use FP4 quantization config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the evaluate_arc function
    print("Starting ARC evaluation (FP4, FP32 Compute)...") # <<< Log updated
    # Assuming evaluate_arc can handle the quantized model and returns required keys
    # Using max_new_tokens=1 for consistency with the format
    fp4_arc_results_data = evaluate_arc(model_fp4, tokenizer, samples_200_arc_test, max_new_tokens=1) # <<< Variable renamed, max_new_tokens=1
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = fp4_arc_results_data.get('accuracy', 'N/A') # <<< Use renamed variable
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    fp4_arc_metrics = { # <<< Variable renamed
        "PPL (Perplexity)": fp4_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        "Memory Footprint (Model Size) (GB)": model_fp4.get_memory_footprint() / (1024 ** 3), # <<< Use renamed variable
        "Inference Latency (ms/token)": fp4_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": fp4_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== FP4 (FP32 Compute) ARC DETAILED MODEL METRICS =====") # <<< Title updated
    print("-" * 60)
    if fp4_arc_metrics: # <<< Use renamed variable
        try:
            max_key_length = max(len(key) for key in fp4_arc_metrics.keys()) # <<< Use renamed variable
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in fp4_arc_metrics.items(): # <<< Use renamed variable
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP4 (FP32 Compute) ARC evaluation steps: {nvml_error}") # <<< Log updated
except Exception as e:
    print(f"FP4 (FP32 Compute) ARC Evaluation error: {e}") # <<< Log updated
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP4 (FP32 Compute) ARC cell...") # <<< Log updated
    if 'model_fp4' in locals() and model_fp4 is not None: # Check for model_fp4 <<< Use renamed variable
        del model_fp4 # Delete model_fp4 <<< Use renamed variable
        print("FP4 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP4 (FP32 Compute) ARC run.")

Configuring FP4 quantization (FP32 Compute)...
Quantization config created.
Loading model: Qwen/Qwen2.5-3B (FP4 Quantized, FP32 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (FP4, FP32 Compute)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== FP4 (FP32 Compute) ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 8.3459
Accuracy                           : 84.0000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 333.8665
Avg GPU Utilization (%)            : 84.8052
Avg GPU Memory Allocated (GB)      : 1.9410
------------------------------------------------------------
Starting cleanup for FP4 (FP32 Compute) ARC cell...
FP4 model deleted.
GPU cache cleared and garbage collected after FP4 (FP32 Compute) ARC run.


In [None]:
# --- FP4 (FP16 Compute) ARC-easy Benchmark Evaluation Cell --- # <<< Title updated

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

fp4_arc_metrics = {}          # <<< Variable renamed
fp4_arc_results_data = {}     # <<< Variable renamed
model_fp4 = None # Define outside try <<< Variable renamed

try:
    # Configure quantization to FP4 with FP16 compute
    print("Configuring FP4 quantization (FP16 Compute)...") # <<< Log updated
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="fp4",             # <--- Set to FP4
        bnb_4bit_compute_dtype=torch.float16, # <--- Set to FP16 compute
        bnb_4bit_use_double_quant=True         # Or False depending on desired config
    )
    print("Quantization config created.")

    # Load quantized model (FP4)
    model_name = "Qwen/Qwen2.5-3B" # Base model name
    print(f"Loading model: {model_name} (FP4 Quantized, FP16 Compute)...") # <<< Log updated
    model_fp4 = AutoModelForCausalLM.from_pretrained( # <<< Variable renamed
        model_name,
        quantization_config=bnb_config, # Use FP4 quantization config
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")
    # --- Removed loading of separate FP16 model ---

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the evaluate_arc function
    print("Starting ARC evaluation (FP4, FP16 Compute)...") # <<< Log updated
    # Assuming evaluate_arc can handle the quantized model and returns required keys
    # Using max_new_tokens=1 for consistency with the format
    fp4_arc_results_data = evaluate_arc(model_fp4, tokenizer, samples_200_arc_test, max_new_tokens=1) # <<< Variable renamed, max_new_tokens=1
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = fp4_arc_results_data.get('accuracy', 'N/A') # <<< Use renamed variable
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    fp4_arc_metrics = { # <<< Variable renamed
        "PPL (Perplexity)": fp4_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        "Memory Footprint (Model Size) (GB)": model_fp4.get_memory_footprint() / (1024 ** 3), # <<< Use renamed variable
        "Inference Latency (ms/token)": fp4_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": fp4_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== FP4 (FP16 Compute) ARC DETAILED MODEL METRICS =====") # <<< Title updated
    print("-" * 60)
    if fp4_arc_metrics: # <<< Use renamed variable
        try:
            max_key_length = max(len(key) for key in fp4_arc_metrics.keys()) # <<< Use renamed variable
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in fp4_arc_metrics.items(): # <<< Use renamed variable
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during FP4 (FP16 Compute) ARC evaluation steps: {nvml_error}") # <<< Log updated
except Exception as e:
    print(f"FP4 (FP16 Compute) ARC Evaluation error: {e}") # <<< Log updated
    # Ensure polling stops on error
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for FP4 (FP16 Compute) ARC cell...") # <<< Log updated
    if 'model_fp4' in locals() and model_fp4 is not None: # Check for model_fp4 <<< Use renamed variable
        del model_fp4 # Delete model_fp4 <<< Use renamed variable
        print("FP4 model deleted.")
    # --- Removed deletion of separate model_fp16 ---

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after FP4 (FP16 Compute) ARC run.")

Configuring FP4 quantization (FP16 Compute)...
Quantization config created.
Loading model: Qwen/Qwen2.5-3B (FP4 Quantized, FP16 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (FP4, FP16 Compute)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== FP4 (FP16 Compute) ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 8.3389
Accuracy                           : 84.0000
Memory Footprint (Model Size) (GB) : 1.8720
Inference Latency (ms/token)       : 193.9434
Avg GPU Utilization (%)            : 38.7292
Avg GPU Memory Allocated (GB)      : 1.9410
------------------------------------------------------------
Starting cleanup for FP4 (FP16 Compute) ARC cell...
FP4 model deleted.
GPU cache cleared and garbage collected after FP4 (FP16 Compute) ARC run.


In [None]:
# --- INT8 (FP32 Compute) ARC-easy Benchmark Evaluation Cell --- # <<< Title updated

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

int8_arc_metrics = {}          # <<< Variable renamed
int8_arc_results_data = {}     # <<< Variable renamed
model_int8 = None # Define outside try <<< Variable renamed

try:
    # --- INT8 Configuration Note ---
    # For BitsAndBytes INT8, we pass parameters directly to from_pretrained
    # No separate BitsAndBytesConfig object is typically used like for 4-bit.
    print("Configuring INT8 quantization (via load_in_8bit=True)...")

    # Load quantized model (INT8)
    model_name = "Qwen/Qwen2.5-3B" # Base model name
    print(f"Loading model: {model_name} (INT8 Quantized)...") # <<< Log updated
    model_int8 = AutoModelForCausalLM.from_pretrained( # <<< Variable renamed
        model_name,
        load_in_8bit=True,       # <--- Key parameter for INT8
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the evaluate_arc function
    print("Starting ARC evaluation (INT8, FP32 Compute)...") # <<< Log updated (Assuming FP32 compute default)
    # Assuming evaluate_arc can handle the quantized model and returns required keys
    # Using max_new_tokens=1 for consistency with the format
    int8_arc_results_data = evaluate_arc(model_int8, tokenizer, samples_200_arc_test, max_new_tokens=1) # <<< Variable renamed, max_new_tokens=1
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = int8_arc_results_data.get('accuracy', 'N/A') # <<< Use renamed variable
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    int8_arc_metrics = { # <<< Variable renamed
        "PPL (Perplexity)": int8_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        # get_memory_footprint might be less accurate for 8-bit via load_in_8bit
        # Consider alternative measurement if needed, but using standard call for consistency
        "Memory Footprint (Model Size) (GB)": model_int8.get_memory_footprint() / (1024 ** 3), # <<< Use renamed variable
        "Inference Latency (ms/token)": int8_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": int8_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== INT8 (FP32 Compute) ARC DETAILED MODEL METRICS =====") # <<< Title updated
    print("-" * 60)
    if int8_arc_metrics: # <<< Use renamed variable
        try:
            max_key_length = max(len(key) for key in int8_arc_metrics.keys()) # <<< Use renamed variable
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in int8_arc_metrics.items(): # <<< Use renamed variable
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during INT8 (FP32 Compute) ARC evaluation steps: {nvml_error}") # <<< Log updated
except Exception as e:
    print(f"INT8 (FP32 Compute) ARC Evaluation error: {e}") # <<< Log updated
    # Ensure polling stops on error
    # --- Removed traceback import and print ---
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for INT8 (FP32 Compute) ARC cell...") # <<< Log updated
    if 'model_int8' in locals() and model_int8 is not None: # Check for model_int8 <<< Use renamed variable
        del model_int8 # Delete model_int8 <<< Use renamed variable
        print("INT8 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after INT8 (FP32 Compute) ARC run.")

Configuring INT8 quantization (via load_in_8bit=True)...
Loading model: Qwen/Qwen2.5-3B (INT8 Quantized)...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (INT8, FP32 Compute)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== INT8 (FP32 Compute) ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 6.4720
Accuracy                           : 89.5000
Memory Footprint (Model Size) (GB) : 3.1640
Inference Latency (ms/token)       : 397.7153
Avg GPU Utilization (%)            : 18.9773
Avg GPU Memory Allocated (GB)      : 3.2701
------------------------------------------------------------
Starting cleanup for INT8 (FP32 Compute) ARC cell...
INT8 model deleted.
GPU cache cleared and garbage collected after INT8 (FP32 Compute) ARC run.


In [None]:
# --- INT8 (FP16 Compute) ARC-easy Benchmark Evaluation Cell --- # <<< Title updated

# Assume 'handle' and 'stop_polling_event' exist from a previous cell.
if 'handle' not in locals() or not handle:
    print("Error: NVML handle not found. Please initialize NVML in a prior cell.")
    handle = None # Prevent polling
elif 'stop_polling_event' not in locals():
    print("Error: stop_polling_event not found. Please initialize in a prior cell.")
    # If needed, define fallback: stop_polling_event = threading.Event()

# Clear previous readings and reset event
gpu_utilization_readings = []
if 'stop_polling_event' in locals():
    stop_polling_event.clear()

# Start GPU polling thread
polling_thread = None
if handle:
    polling_thread = threading.Thread(target=poll_gpu_utilization, args=(handle, 1.0), daemon=True)
    polling_thread.start()

# Clean memory before starting
torch.cuda.empty_cache()
gc.collect()

int8_arc_metrics = {}          # <<< Variable renamed
int8_arc_results_data = {}     # <<< Variable renamed
model_int8 = None # Define outside try <<< Variable renamed

try:
    # Configure quantization to INT8 with FP16 compute
    print("Configuring INT8 quantization (FP16 Compute)...") # <<< Log updated
    # Note: For INT8 with a specific compute dtype (like FP16),
    # we *do* use BitsAndBytesConfig unlike the simple load_in_8bit case.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # bnb_8bit_quant_type="int8", # Often implicit or not needed when load_in_8bit=True
        bnb_8bit_compute_dtype=torch.float16, # <--- Specify FP16 compute
        # bnb_8bit_use_double_quant might not be applicable/needed for int8
    )
    print("Quantization config created.")


    # Load quantized model (INT8)
    model_name = "Qwen/Qwen2.5-3B" # Base model name
    print(f"Loading model: {model_name} (INT8 Quantized, FP16 Compute)...") # <<< Log updated
    model_int8 = AutoModelForCausalLM.from_pretrained( # <<< Variable renamed
        model_name,
        quantization_config=bnb_config, # <--- Pass the config for FP16 compute
        device_map="cuda:0",
        trust_remote_code=True
    )
    print("Model loaded.")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    # --- No reset_peak_memory_stats needed for this metric set ---

    # Run ARC evaluation using the evaluate_arc function
    print("Starting ARC evaluation (INT8, FP16 Compute)...") # <<< Log updated
    # Assuming evaluate_arc can handle the quantized model and returns required keys
    # Using max_new_tokens=1 for consistency with the format
    int8_arc_results_data = evaluate_arc(model_int8, tokenizer, samples_200_arc_test, max_new_tokens=1) # <<< Variable renamed, max_new_tokens=1
    print("ARC evaluation finished.")

    # Stop polling thread and calculate average utilization
    avg_gpu_utilization = 'N/A'
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        polling_thread.join(timeout=5)
        if gpu_utilization_readings:
            avg_gpu_utilization = statistics.mean(gpu_utilization_readings)
        else:
             avg_gpu_utilization = 0 # Polling ran but got no readings
    elif handle:
        avg_gpu_utilization = 0 # Polling failed to start/run correctly

    # Collect metrics in the MMLU format using results from evaluate_arc
    accuracy_val = int8_arc_results_data.get('accuracy', 'N/A') # <<< Use renamed variable
    if isinstance(accuracy_val, (float, int)):
        accuracy_val *= 100 # Multiply only if it's a number

    int8_arc_metrics = { # <<< Variable renamed
        "PPL (Perplexity)": int8_arc_results_data.get('perplexity', 'N/A'),
        "Accuracy": accuracy_val,
        "Memory Footprint (Model Size) (GB)": model_int8.get_memory_footprint() / (1024 ** 3), # <<< Use renamed variable
        "Inference Latency (ms/token)": int8_arc_results_data.get('inference_latency', 'N/A'),
        "Avg GPU Utilization (%)": avg_gpu_utilization, # From polling
        "Avg GPU Memory Allocated (GB)": int8_arc_results_data.get('avg_gpu_memory_gb', 'N/A'), # From evaluate_arc result
    }

    # Print metrics
    print("\n===== INT8 (FP16 Compute) ARC DETAILED MODEL METRICS =====") # <<< Title updated
    print("-" * 60)
    if int8_arc_metrics: # <<< Use renamed variable
        try:
            max_key_length = max(len(key) for key in int8_arc_metrics.keys()) # <<< Use renamed variable
        except ValueError:
             max_key_length = 35 # Default width

        for key, value in int8_arc_metrics.items(): # <<< Use renamed variable
             if isinstance(value, (float, int)):
                 print(f"{key.ljust(max_key_length)} : {value:.4f}")
             else:
                 print(f"{key.ljust(max_key_length)} : {value}") # Handles 'N/A'
    else:
        print("No metrics collected.")
    print("-" * 60)

except pynvml.NVMLError as nvml_error:
     print(f"NVML Error during INT8 (FP16 Compute) ARC evaluation steps: {nvml_error}") # <<< Log updated
except Exception as e:
    print(f"INT8 (FP16 Compute) ARC Evaluation error: {e}") # <<< Log updated
    # Ensure polling stops on error
    # --- Removed traceback import and print ---
    if polling_thread and polling_thread.is_alive():
        stop_polling_event.set()
        try:
            polling_thread.join(timeout=5)
        except Exception as join_e:
            print(f"Error stopping polling thread after exception: {join_e}")

finally:
    # Clean up GPU memory (No NVML Shutdown)
    print("Starting cleanup for INT8 (FP16 Compute) ARC cell...") # <<< Log updated
    if 'model_int8' in locals() and model_int8 is not None: # Check for model_int8 <<< Use renamed variable
        del model_int8 # Delete model_int8 <<< Use renamed variable
        print("INT8 model deleted.")

    torch.cuda.empty_cache()
    gc.collect()
    print("GPU cache cleared and garbage collected after INT8 (FP16 Compute) ARC run.")

Configuring INT8 quantization (FP16 Compute)...
Quantization config created.
Loading model: Qwen/Qwen2.5-3B (INT8 Quantized, FP16 Compute)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded.
Loading tokenizer...
Tokenizer loaded.
Starting ARC evaluation (INT8, FP16 Compute)...


Evaluating ARC:   0%|          | 0/200 [00:00<?, ?it/s]

ARC evaluation finished.

===== INT8 (FP16 Compute) ARC DETAILED MODEL METRICS =====
------------------------------------------------------------
PPL (Perplexity)                   : 6.4720
Accuracy                           : 89.5000
Memory Footprint (Model Size) (GB) : 3.1640
Inference Latency (ms/token)       : 398.0718
Avg GPU Utilization (%)            : 19.0000
Avg GPU Memory Allocated (GB)      : 3.2321
------------------------------------------------------------
Starting cleanup for INT8 (FP16 Compute) ARC cell...
INT8 model deleted.
GPU cache cleared and garbage collected after INT8 (FP16 Compute) ARC run.
