# Lab 3.2.7: Quality Benchmark Suite - Solutions

This notebook contains solutions for all exercises in Lab 3.2.7.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

## Exercise 1 Solution: Custom Perplexity Dataset

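Create a perplexity evaluation using domain-specific data. Perplexity is *simulated* here from a simple lexical-diversity heuristic; replace the simulation with real model inference to obtain actual measurements.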

In [None]:
class DomainPerplexityEvaluator:
    """
    Evaluate (simulated) perplexity on domain-specific text.

    Samples are collected with ``add_text`` / ``add_texts_from_file`` and
    scored by ``calculate_perplexity_simulated``, which derives a synthetic
    perplexity from each sample's lexical diversity. No model is executed.
    """

    # Samples must be strictly longer than this (after stripping) to be kept.
    MIN_TEXT_LENGTH = 50

    def __init__(self, domain_name: str):
        self.domain_name = domain_name
        self.texts = []

    def add_text(self, text: str):
        """
        Add a domain-specific text sample.

        Surrounding whitespace is stripped first; samples that are not longer
        than ``MIN_TEXT_LENGTH`` characters are silently ignored.
        """
        cleaned = text.strip()
        if len(cleaned) > self.MIN_TEXT_LENGTH:
            self.texts.append(cleaned)

    def add_texts_from_file(self, filepath: str):
        """
        Load texts from a file, treating every line as one sample.

        The file is read as UTF-8 so the result does not depend on the
        platform's default encoding. A missing file is reported on stdout
        and otherwise ignored (best-effort loading).
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    self.add_text(line)
        except FileNotFoundError:
            print(f"File not found: {filepath}")

    def calculate_perplexity_simulated(self, model_quality: float = 1.0) -> dict:
        """
        Simulate perplexity calculation.

        In real implementation, this would use actual model inference.
        Noise is drawn from NumPy's global RNG, so call ``np.random.seed``
        beforehand for reproducible numbers.

        Args:
            model_quality: Quality factor (1.0 = baseline, lower = worse).
                Must be strictly positive.

        Returns:
            Dict with keys 'mean', 'std', 'min', 'max' and 'samples'. When no
            texts have been added, 'mean'/'min'/'max' are ``inf``, 'std' is 0
            and 'samples' is 0.

        Raises:
            ValueError: If ``model_quality`` is not strictly positive.
        """
        if model_quality <= 0:
            raise ValueError("model_quality must be positive")

        if not self.texts:
            # Same key set as the populated result so callers can index freely.
            return {
                'mean': float('inf'),
                'std': 0,
                'min': float('inf'),
                'max': float('inf'),
                'samples': 0
            }

        perplexities = []
        for text in self.texts:
            # Lexical diversity (unique / total words) stands in for complexity;
            # the +1 keeps the ratio strictly below 1.
            words = text.split()
            unique_words = len(set(text.lower().split()))
            complexity = unique_words / (len(words) + 1)

            # Lower quality => higher perplexity, plus Gaussian measurement noise.
            base_ppl = 5 + 10 * complexity
            ppl = base_ppl / model_quality + np.random.normal(0, 0.5)

            # Perplexity cannot drop below 1.
            perplexities.append(max(1.0, ppl))

        return {
            'mean': np.mean(perplexities),
            'std': np.std(perplexities),
            'min': np.min(perplexities),
            'max': np.max(perplexities),
            'samples': len(perplexities)
        }


# Create domain-specific evaluator
print("Domain-Specific Perplexity Evaluation")
print("="*50)

# The simulation draws Gaussian noise from NumPy's global RNG; seed it so the
# table below is identical on every Restart & Run All.
np.random.seed(42)

# Example: Technical documentation domain
tech_evaluator = DomainPerplexityEvaluator("technical_docs")

# Add sample technical texts
tech_texts = [
    "The CUDA kernel is launched with a grid of thread blocks, where each block contains multiple threads that execute in parallel.",
    "Tensor parallelism distributes model layers across multiple GPUs, enabling training of models that exceed single GPU memory.",
    "Quantization reduces model precision from FP32 to INT8 or INT4, trading accuracy for inference speed and memory efficiency.",
    "The attention mechanism computes weighted sums of value vectors, with weights determined by query-key similarity scores.",
    "Gradient checkpointing trades compute for memory by recomputing activations during backpropagation instead of storing them.",
    "Flash attention uses tiling and recomputation to achieve O(N) memory complexity while maintaining exact attention computation.",
    "The KV-cache stores computed key and value tensors for autoregressive generation, avoiding redundant computation.",
    "Mixed precision training uses FP16 for forward and backward passes while maintaining FP32 master weights for stability.",
    "LoRA fine-tuning adds low-rank adaptation matrices to frozen pretrained weights, reducing trainable parameter count.",
    "The tokenizer converts text to token IDs using a learned vocabulary, typically based on BPE or SentencePiece.",
]

for text in tech_texts:
    tech_evaluator.add_text(text)

print(f"Domain: {tech_evaluator.domain_name}")
print(f"Samples: {len(tech_evaluator.texts)}")

# Evaluate with different model qualities (simulating quantization).
# The first entry is the baseline every other row is compared against.
quality_levels = {
    'FP16 (baseline)': 1.0,
    'INT8': 0.98,
    'INT4 (NF4)': 0.92,
    'GPTQ-4bit': 0.95,
    'AWQ-4bit': 0.96,
    'FP8': 0.99,
    'FP4': 0.94,
}

print(f"\n{'Method':<20} {'Perplexity':<12} {'vs Baseline':<12}")
print("-"*44)

baseline_ppl = None
for method, quality in quality_levels.items():
    result = tech_evaluator.calculate_perplexity_simulated(quality)
    ppl = result['mean']

    # Dicts preserve insertion order, so the first row seen is the baseline.
    if baseline_ppl is None:
        baseline_ppl = ppl

    ppl_increase = (ppl - baseline_ppl) / baseline_ppl * 100
    # Let the format spec choose the sign: simulated noise can make a row
    # slightly better than baseline, which a hard-coded "+" would misprint.
    change = f"{ppl_increase:+.1f}%"
    print(f"{method:<20} {ppl:<12.2f} {change:<12}")

## Exercise 2 Solution: Custom MMLU Subject

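Create a custom MMLU-style knowledge evaluation (four-choice multiple-choice questions) for a specific subject. Model answers are simulated from a target accuracy rather than produced by a real model.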

In [None]:
class CustomKnowledgeEvaluator:
    """
    Custom knowledge evaluation with multiple-choice questions.

    Questions are stored as dicts with 'question', 'choices' and 'answer'
    keys; ``evaluate_simulated`` fakes model answers from a target accuracy.
    """

    # Maximum number of question characters shown in per-question results.
    PREVIEW_LENGTH = 50

    def __init__(self, subject_name: str):
        self.subject_name = subject_name
        self.questions = []

    def add_question(self, question: str, choices: list, correct_idx: int):
        """
        Add a multiple-choice question.

        Args:
            question: The question text
            choices: List of 4 answer choices
            correct_idx: Index of correct answer (0-3)

        Raises:
            ValueError: If there are not exactly 4 choices or the index is
                outside 0-3.
        """
        if len(choices) != 4:
            raise ValueError("Must have exactly 4 choices")
        if not 0 <= correct_idx <= 3:
            raise ValueError("correct_idx must be 0-3")

        self.questions.append({
            'question': question,
            # Copy so later mutation of the caller's list cannot alter the
            # stored question.
            'choices': list(choices),
            'answer': correct_idx
        })

    def format_question(self, q: dict) -> str:
        """Format question for model input (choices lettered A, B, C, ...)."""
        lettered = "".join(
            f"{chr(65 + i)}. {choice}\n" for i, choice in enumerate(q['choices'])
        )
        return f"Question: {q['question']}\n\n{lettered}\nAnswer:"

    def evaluate_simulated(self, model_accuracy: float = 0.7) -> dict:
        """
        Simulate model evaluation.

        Each question is independently answered correctly with probability
        ``model_accuracy``; otherwise a uniformly random wrong choice is
        recorded. Randomness comes from NumPy's global RNG.

        Args:
            model_accuracy: Expected accuracy (0-1)

        Returns:
            Dict with 'accuracy', 'correct', 'total' and 'results' (one entry
            per question). With no questions, the counts are 0 and 'results'
            is an empty list.
        """
        if not self.questions:
            # Same key set as the populated result.
            return {'accuracy': 0, 'correct': 0, 'total': 0, 'results': []}

        correct = 0
        results = []

        for q in self.questions:
            answer = q['answer']
            is_correct = bool(np.random.random() < model_accuracy)
            if is_correct:
                predicted = answer
                correct += 1
            else:
                # Pick uniformly among the remaining (wrong) choices.
                wrong_choices = [i for i in range(len(q['choices'])) if i != answer]
                predicted = int(np.random.choice(wrong_choices))

            # Only add an ellipsis when the text was actually truncated.
            text = q['question']
            if len(text) > self.PREVIEW_LENGTH:
                preview = text[:self.PREVIEW_LENGTH] + '...'
            else:
                preview = text

            results.append({
                'question': preview,
                'predicted': chr(65 + predicted),
                'correct': chr(65 + answer),
                'is_correct': is_correct
            })

        return {
            'accuracy': correct / len(self.questions),
            'correct': correct,
            'total': len(self.questions),
            'results': results
        }


# Create custom ML knowledge evaluator
print("Custom Knowledge Evaluation: Machine Learning")
print("="*50)

# Simulated answers come from NumPy's global RNG; seed it (as the statistical
# significance cell does) so this table is reproducible across runs.
np.random.seed(42)

ml_eval = CustomKnowledgeEvaluator("machine_learning")

# Add ML-focused questions
ml_eval.add_question(
    "What does the softmax function output?",
    ["Unbounded real numbers", "Probability distribution", "Binary values", "Negative numbers only"],
    1
)

ml_eval.add_question(
    "Which optimizer uses momentum and adaptive learning rates?",
    ["SGD", "Adam", "Gradient Descent", "L-BFGS"],
    1
)

ml_eval.add_question(
    "What is the purpose of dropout in neural networks?",
    ["Speed up training", "Reduce memory usage", "Prevent overfitting", "Increase model size"],
    2
)

ml_eval.add_question(
    "What does GPTQ stand for in model quantization?",
    ["General Purpose Tensor Quantization", "GPT Quantization", "Gradient-based Post-Training Quantization", "GPU Tensor Query"],
    2
)

ml_eval.add_question(
    "Which data type uses 4 bits for exponent and 3 bits for mantissa?",
    ["FP8 E5M2", "FP8 E4M3", "INT8", "BF16"],
    1
)

print(f"Subject: {ml_eval.subject_name}")
print(f"Questions: {len(ml_eval.questions)}")

# Target accuracies used to simulate each quantization level
quant_accuracies = {
    'FP16': 0.75,
    'INT8': 0.73,
    'INT4': 0.68,
    'GPTQ': 0.71,
    'AWQ': 0.72,
}

print(f"\n{'Method':<12} {'Accuracy':<12} {'Correct':<10}")
print("-"*34)

for method, acc in quant_accuracies.items():
    result = ml_eval.evaluate_simulated(acc)
    # Attach the % to the number *before* padding so it stays next to the
    # value and the "Correct" column lines up with its header.
    accuracy_text = f"{result['accuracy']*100:.1f}%"
    print(f"{method:<12} {accuracy_text:<12} {result['correct']}/{result['total']}")

## Exercise 3 Solution: Statistical Significance Testing

Determine if differences between quantization methods are statistically significant.

In [None]:
def statistical_significance_test(
    baseline_samples: np.ndarray,
    quantized_samples: np.ndarray,
    alpha: float = 0.05
) -> dict:
    """
    Test statistical significance between baseline and quantized perplexity.

    Uses Welch's t-test (doesn't assume equal variances). All variance-based
    quantities (reported std, Cohen's d, standard error) use the unbiased
    sample variance (ddof=1), matching what the t-test itself uses. The
    confidence interval is a two-sided (1 - alpha) interval built from the
    t distribution with Welch-Satterthwaite degrees of freedom, so it agrees
    with the p-value: the interval excludes 0 exactly when p < alpha.

    Args:
        baseline_samples: Perplexity samples from baseline model
        quantized_samples: Perplexity samples from quantized model
        alpha: Significance level (0 < alpha < 1)

    Returns:
        Dict with descriptive statistics, 'difference' (quantized - baseline),
        'difference_percent' (NaN if the baseline mean is 0),
        'confidence_interval' for the difference, 't_statistic' and 'p_value'
        from ``scipy.stats.ttest_ind(baseline, quantized)`` (so the statistic
        is negative when the quantized mean is higher), 'cohens_d',
        'is_significant' and a verbal 'effect_size'.

    Raises:
        ValueError: If either sample set has fewer than 2 observations or
            ``alpha`` is outside (0, 1).
    """
    baseline = np.asarray(baseline_samples, dtype=float).ravel()
    quantized = np.asarray(quantized_samples, dtype=float).ravel()

    if baseline.size < 2 or quantized.size < 2:
        raise ValueError("Each sample set must contain at least 2 observations")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be between 0 and 1 (exclusive)")

    # Descriptive statistics (sample variance, ddof=1)
    baseline_mean = float(baseline.mean())
    quant_mean = float(quantized.mean())
    baseline_var = float(baseline.var(ddof=1))
    quant_var = float(quantized.var(ddof=1))
    baseline_std = float(np.sqrt(baseline_var))
    quant_std = float(np.sqrt(quant_var))

    diff = quant_mean - baseline_mean

    # Effect size (Cohen's d) using the root-mean of the two variances
    pooled_std = float(np.sqrt((baseline_var + quant_var) / 2))
    cohens_d = diff / pooled_std if pooled_std > 0 else 0

    # Welch's t-test
    t_stat, p_value = stats.ttest_ind(baseline, quantized, equal_var=False)

    # Confidence interval for the difference of means
    se_base_sq = baseline_var / baseline.size
    se_quant_sq = quant_var / quantized.size
    se_diff = float(np.sqrt(se_base_sq + se_quant_sq))
    if se_diff > 0:
        # Welch-Satterthwaite approximation of the degrees of freedom
        dof = (se_base_sq + se_quant_sq) ** 2 / (
            se_base_sq ** 2 / (baseline.size - 1)
            + se_quant_sq ** 2 / (quantized.size - 1)
        )
        margin = float(stats.t.ppf(1 - alpha / 2, dof)) * se_diff
    else:
        # Both samples are constant: the interval collapses to a point.
        margin = 0.0
    ci_low = diff - margin
    ci_high = diff + margin

    # Interpretation (a NaN p-value compares False, i.e. "not significant")
    is_significant = bool(p_value < alpha)

    magnitude = abs(cohens_d)
    if magnitude < 0.2:
        effect_interpretation = "negligible"
    elif magnitude < 0.5:
        effect_interpretation = "small"
    elif magnitude < 0.8:
        effect_interpretation = "medium"
    else:
        effect_interpretation = "large"

    # A relative change is undefined when the baseline mean is zero.
    if baseline_mean != 0:
        difference_percent = (diff / baseline_mean) * 100
    else:
        difference_percent = float('nan')

    return {
        'baseline_mean': baseline_mean,
        'baseline_std': baseline_std,
        'quantized_mean': quant_mean,
        'quantized_std': quant_std,
        'difference': diff,
        'difference_percent': difference_percent,
        'confidence_interval': (ci_low, ci_high),
        't_statistic': float(t_stat),
        'p_value': float(p_value),
        'cohens_d': cohens_d,
        'is_significant': is_significant,
        'effect_size': effect_interpretation
    }


# Simulate perplexity measurements (seeded so the report is reproducible)
np.random.seed(42)

print("Statistical Significance Analysis")
print("="*60)

# Baseline (FP16) perplexity samples
n_samples = 100
baseline_ppl = np.random.normal(10.0, 0.5, n_samples)

# Different quantization methods: each entry is drawn from a normal
# distribution with a hand-picked mean and spread.
quant_methods = {
    'INT8': np.random.normal(10.2, 0.6, n_samples),
    'INT4 (NF4)': np.random.normal(10.8, 0.8, n_samples),
    'GPTQ-4bit': np.random.normal(10.5, 0.6, n_samples),
    'AWQ-4bit': np.random.normal(10.4, 0.5, n_samples),
    'FP8': np.random.normal(10.1, 0.5, n_samples),
}

print(f"Baseline (FP16): mean={np.mean(baseline_ppl):.2f}, std={np.std(baseline_ppl):.2f}")
print("\n" + "-"*60)

for method, samples in quant_methods.items():
    result = statistical_significance_test(baseline_ppl, samples)

    sig_marker = "*" if result['is_significant'] else ""

    print(f"\n{method}:")
    # Use the "+" format flag instead of a literal "+" so a negative
    # difference prints as "-0.05" rather than "+-0.05".
    print(f"  Mean: {result['quantized_mean']:.2f} (Δ = {result['difference']:+.2f}, {result['difference_percent']:+.1f}%)")
    print(f"  95% CI: [{result['confidence_interval'][0]:.2f}, {result['confidence_interval'][1]:.2f}]")
    print(f"  p-value: {result['p_value']:.4f} {sig_marker}")
    print(f"  Cohen's d: {result['cohens_d']:.3f} ({result['effect_size']})")
    print(f"  Significant: {'Yes' if result['is_significant'] else 'No'} (α=0.05)")

print("\n" + "="*60)
print("* = statistically significant at α=0.05")
# The interpretation below is static commentary, not derived from the results.
print("\nInterpretation:")
print("  - INT8 and FP8 show minimal quality degradation")
print("  - INT4 shows larger but potentially acceptable degradation")
print("  - AWQ and GPTQ help reduce INT4 quality loss")

## Summary

Key findings:

1. **Domain-specific evaluation** reveals different sensitivities to quantization
2. **Custom knowledge tests** can target specific model capabilities
3. **Statistical testing** helps distinguish meaningful differences from noise
4. **Effect size** (Cohen's d) provides practical interpretation of differences