# Instruction Following Benchmark

Measures how precisely the model follows complex constraints and instructions.

**Tests:**
1. Output format constraints (JSON-only, markdown-only, code-only)
2. Length constraints (word count, list items)
3. Inclusion/exclusion rules ("must include X", "must NOT mention Y")
4. Multi-constraint satisfaction
5. Negative instruction following


In [1]:
# Setup
import sys, os, time, json, re
from dataclasses import dataclass, field, asdict
from typing import Optional, Callable
from datetime import datetime
from IPython.display import display, Markdown

sys.path.insert(0, '..')
from dotenv import load_dotenv
load_dotenv('../.env')
from src.minimax_client import MiniMaxClient

@dataclass
class ConstraintResult:
    """Outcome of evaluating a single constraint against one model response."""

    name: str      # short label of the constraint, e.g. "JSON-only"
    passed: bool   # True when the response satisfied the constraint
    expected: str  # human-readable description of what was required
    actual: str    # human-readable description of what was observed

@dataclass
class TestResult:
    """Scored outcome of one benchmark test (one prompt, one or more constraints)."""

    name: str                 # test name as declared in the test suite
    passed: bool              # True only when every constraint passed
    score: float              # percentage of constraints passed (0-100)
    constraints_checked: int  # number of validators attached to the test
    constraints_passed: int   # number of validators that passed
    # Per-constraint details; each instance gets its own fresh list.
    constraint_results: list = field(default_factory=list)
    completion_time: float = 0.0  # wall-clock seconds of the model call
    tokens_used: int = 0          # completion tokens reported for the call

@dataclass
class BenchmarkResults:
    """Collection of per-test results for one benchmark run, with summary metrics."""

    notebook: str   # identifier of the notebook that produced the run
    timestamp: str  # when the run started, stored as a string
    tests: list = field(default_factory=list)  # accumulated test results

    @property
    def pass_rate(self):
        """Percentage of tests whose ``passed`` flag is truthy; 0 when no tests exist."""
        if not self.tests:
            return 0
        passing = sum(1 for entry in self.tests if entry.passed)
        return passing / len(self.tests) * 100

    @property
    def avg_score(self):
        """Mean of the per-test ``score`` values; 0 when no tests exist."""
        if not self.tests:
            return 0
        score_sum = sum(entry.score for entry in self.tests)
        return score_sum / len(self.tests)

    @property
    def constraint_adherence(self):
        """Percentage of all checked constraints that passed; 0 when none were checked."""
        checked = sum(entry.constraints_checked for entry in self.tests)
        satisfied = sum(entry.constraints_passed for entry in self.tests)
        if not checked:
            return 0
        return satisfied / checked * 100

# Shared API client used by every test in this notebook.
client = MiniMaxClient()
# Status marker restored to the intended check mark (it was mis-encoded as "‚úì").
print(f"✓ Setup complete | Model: {client.model}")


✓ Setup complete | Model: MiniMax-M2.1


In [2]:
# Constraint validators
def clean(r):
    """Return *r* with every ``<think>...</think>`` span removed and outer whitespace trimmed.

    The match is non-greedy and spans newlines, so several separate reasoning
    blocks are each removed. An unclosed ``<think>`` tag is left untouched.
    """
    without_reasoning = re.sub(r'<think>.*?</think>', '', r, flags=re.DOTALL)
    return without_reasoning.strip()

def check_json_only(r):
    """Check that the response is valid JSON with no surrounding prose.

    The JSON may be bare or wrapped in one fenced code block (with or without
    a ``json`` tag). Anything left after removing the JSON payload counts as
    extra text if it contains a run of four or more ASCII letters.

    Args:
        r: Raw model response text.

    Returns:
        ConstraintResult named "JSON-only"; failed when the payload does not
        parse or when extra text is present.
    """
    c = clean(r)
    fence = re.search(r'```(?:json)?\s*\n?(.*?)```', c, re.DOTALL)
    payload = fence.group(1).strip() if fence else c
    try:
        json.loads(payload)
    except (ValueError, RecursionError):
        # json.JSONDecodeError subclasses ValueError; RecursionError covers
        # pathologically nested input. Anything else is a real bug and should surface.
        return ConstraintResult("JSON-only", False, "Valid JSON", "Invalid JSON")

    # Remove the WHOLE fenced block, backticks and language tag included.
    # Removing only the inner payload left "```json" behind, whose "json" tag
    # matched the letters regex and flagged every tagged fence as extra text.
    remainder = c[:fence.start()] + c[fence.end():] if fence else ''
    extra = bool(re.search(r'[a-zA-Z]{4,}', remainder))
    return ConstraintResult("JSON-only", not extra, "Valid JSON, no extra text", f"JSON valid, extra text: {extra}")

def check_word_count(r, target, tol=0.2):
    """Check that the cleaned response has roughly *target* whitespace-separated words.

    The accepted range is ``target * (1 - tol)`` to ``target * (1 + tol)``,
    each bound truncated to an int, and both bounds inclusive.
    """
    n_words = len(clean(r).split())
    lower = int(target * (1 - tol))
    upper = int(target * (1 + tol))
    in_range = lower <= n_words <= upper
    return ConstraintResult(
        f"~{target} words",
        in_range,
        f"{lower}-{upper} words",
        f"{n_words} words",
    )

def check_must_include(r, terms):
    """Check that every term in *terms* occurs in the cleaned response.

    Matching is a case-insensitive substring test.
    """
    haystack = clean(r).lower()
    absent = [term for term in terms if term.lower() not in haystack]
    if absent:
        observed = f"Missing: {absent}"
    else:
        observed = "All present"
    return ConstraintResult("Must include", not absent, f"Include: {terms}", observed)

def check_must_exclude(r, terms):
    """Check that none of *terms* occurs in the cleaned response.

    Matching is a case-insensitive substring test, so a term also matches
    when it appears inside a longer word.
    """
    haystack = clean(r).lower()
    present = [term for term in terms if term.lower() in haystack]
    if present:
        observed = f"Found: {present}"
    else:
        observed = "None found"
    return ConstraintResult("Must NOT include", not present, f"Exclude: {terms}", observed)

def check_markdown(r):
    """Check that the cleaned response uses at least two distinct markdown features.

    The four features probed are headers, bold spans, bullet items and
    numbered items.
    """
    text = clean(r)
    feature_patterns = (
        r'^#{1,6}\s',    # header line
        r'\*\*.*?\*\*',  # bold span
        r'^\s*[-*]\s',   # bullet item
        r'^\s*\d+\.\s',  # numbered item
    )
    features = sum(1 for pattern in feature_patterns if re.search(pattern, text, re.MULTILINE))
    return ConstraintResult("Markdown format", features >= 2, "Use markdown", f"{features} features found")

def check_code_only(r):
    """Check that the cleaned response is code without an accompanying explanation.

    With a fenced block present, more than 10 words outside the fences count
    as an explanation. Without one, the response counts as an explanation
    unless it contains a definition keyword ('def ', 'function ', 'class ', 'const ').
    """
    text = clean(r)
    has_fenced_block = re.search(r'```(?:\w+)?\s*\n(.*?)```', text, re.DOTALL)
    if has_fenced_block:
        outside_fences = re.sub(r'```[\s\S]*?```', '', text).strip()
        explanation = len(outside_fences.split()) > 10
    else:
        code_markers = ('def ', 'function ', 'class ', 'const ')
        explanation = all(marker not in text for marker in code_markers)
    observed = "Has explanation" if explanation else "Code only"
    return ConstraintResult("Code-only", not explanation, "No explanation", observed)

def check_list_count(r, n):
    """Check that the cleaned response contains exactly *n* list items.

    A list item is a line that starts (after optional whitespace) with a
    number followed by ``.`` or ``)``, or with a ``-``, ``*`` or ``•`` bullet,
    followed by whitespace.

    Args:
        r: Raw model response text.
        n: Required number of list items.

    Returns:
        ConstraintResult named "Exactly {n} items".
    """
    c = clean(r)
    # \u2022 is the bullet character "•". The class previously held the
    # mis-encoded bytes "‚Ä¢", so real "•" bullets were never counted. The
    # escape keeps the pattern immune to source re-encoding.
    items = len(re.findall(r'^\s*(\d+[.)]\s|[-*\u2022]\s)', c, re.MULTILINE))
    return ConstraintResult(f"Exactly {n} items", items == n, f"{n} items", f"{items} items")

def check_no_code(r):
    """Check that the cleaned response contains no triple-backtick code fence."""
    if '```' in clean(r):
        return ConstraintResult("No code blocks", False, "No ```", "Has code")
    return ConstraintResult("No code blocks", True, "No ```", "No code")

# Status marker restored to the intended check mark (it was mis-encoded as "‚úì").
print("✓ Validators defined")


✓ Validators defined


In [3]:
# Test suite
def _check_max_words(r, limit):
    """Check that the cleaned response has strictly fewer than *limit* words."""
    words = len(clean(r).split())
    return ConstraintResult(f"Under {limit} words", words < limit, f"< {limit} words", f"{words} words")


# Each test is a dict with:
#   name       - label used in progress output and the saved results
#   prompt     - user message sent to the model
#   validators - callables taking the raw response and returning a ConstraintResult
TESTS = [
    {"name": "JSON Output Only",
     "prompt": "List 5 programming languages with their year. Output ONLY valid JSON, no explanation.",
     "validators": [check_json_only]},
    {"name": "Word Count ~50",
     "prompt": "Describe cloud computing in EXACTLY 50 words.",
     "validators": [lambda r: check_word_count(r, 50, 0.15)]},
    {"name": "Must Include Terms",
     "prompt": "Explain machine learning. MUST include: 'neural network', 'training data', 'prediction', 'algorithm'.",
     "validators": [lambda r: check_must_include(r, ['neural network', 'training data', 'prediction', 'algorithm'])]},
    {"name": "Must NOT Include",
     "prompt": "Explain making coffee without using: 'water', 'hot', 'cup', 'drink'.",
     "validators": [lambda r: check_must_exclude(r, ['water', 'hot', 'cup', 'drink'])]},
    {"name": "Markdown Required",
     "prompt": "Write a Git guide with markdown headers, bold text, bullet points.",
     "validators": [check_markdown]},
    {"name": "Code Only",
     "prompt": "Write a Python string reverse function. Output ONLY code, no explanation.",
     "validators": [check_code_only]},
    {"name": "Exactly 5 List Items",
     "prompt": "List exactly 5 benefits of exercise. Use numbered list (1. 2. etc).",
     "validators": [lambda r: check_list_count(r, 5)]},
    {"name": "No Code Blocks",
     "prompt": "Explain recursion to a beginner. Do NOT use code examples or code blocks.",
     "validators": [check_no_code]},
    {"name": "Multi: JSON + Include",
     "prompt": "Create JSON for user with 'name', 'email', 'age' fields. Output only valid JSON.",
     "validators": [check_json_only, lambda r: check_must_include(r, ['name', 'email', 'age'])]},
    # The prompt asks for "under 75 words", so validate an upper bound only.
    # The previous check_word_count(r, 60, 0.3) accepted 42-78 words: it failed
    # compliant short answers and passed answers of 75-78 words.
    {"name": "Multi: List + Word Limit",
     "prompt": "List 3 facts about the moon with bullet points. Keep under 75 words.",
     "validators": [lambda r: check_list_count(r, 3), lambda r: _check_max_words(r, 75)]},
    # Same fix for "Under 100 words" (previously 56-104 words).
    # NOTE(review): the prompt's item 2 ("Include 3 bullet points") has no
    # validator; only five of the six listed constraints are checked.
    {"name": "Complex: 5 Constraints",
     "prompt": """Write about Python: 1) Use markdown with header, 2) Include 3 bullet points, 
3) Must mention 'readability' and 'versatile', 4) Must NOT mention 'Java', 5) Under 100 words, 6) No code blocks.""",
     "validators": [check_markdown, lambda r: check_must_include(r, ['readability', 'versatile']),
                   lambda r: check_must_exclude(r, ['Java']), lambda r: _check_max_words(r, 100), check_no_code]},
    {"name": "Negative: No Loops",
     "prompt": "Write Python to sum 1-10. Do NOT use for/while loops. Use sum() instead.",
     "validators": [lambda r: check_must_exclude(r, ['for ', 'while ']), lambda r: check_must_include(r, ['sum'])]},
]
# Status marker restored to the intended check mark (it was mis-encoded as "‚úì").
print(f"✓ {len(TESTS)} tests defined")


✓ 12 tests defined


In [4]:
# Test runner
def run_test(test):
    """Send one test prompt to the model and score the reply against its validators.

    Args:
        test: Dict with 'name', 'prompt' and 'validators' (callables that take
            the response text and return a ConstraintResult).

    Returns:
        TestResult with per-constraint details, timing and completion tokens.
        If the API call or a validator raises, the test is returned as failed
        with score 0 and a single "Execution" ConstraintResult carrying the
        error, so the cause is visible in the summary and the saved results.
    """
    print(f"  Running: {test['name']}...")
    try:
        start = time.perf_counter()
        response = client.chat([
            {"role": "system", "content": "Follow instructions precisely. Pay attention to format constraints."},
            {"role": "user", "content": test['prompt']}
        ], max_tokens=1024, temperature=0.3)
        elapsed = time.perf_counter() - start

        content = response.choices[0].message.content
        constraint_results = [v(content) for v in test['validators']]
        passed_count = sum(1 for cr in constraint_results if cr.passed)
        score = passed_count / len(constraint_results) * 100

        return TestResult(name=test['name'], passed=passed_count == len(constraint_results), score=score,
                         constraints_checked=len(constraint_results), constraints_passed=passed_count,
                         constraint_results=constraint_results, completion_time=elapsed,
                         tokens_used=response.usage.completion_tokens)
    except Exception as e:
        # Best-effort: one failing test must not abort the whole run, but the
        # cause was previously discarded. Report it and keep it in the result.
        error = f"{type(e).__name__}: {e}"
        print(f"    Error: {error}")
        return TestResult(name=test['name'], passed=False, score=0,
                         constraints_checked=len(test['validators']), constraints_passed=0,
                         constraint_results=[ConstraintResult("Execution", False,
                                                              "Test runs without error", error)])

# Run tests
# Emoji in the status strings below are restored from mis-encoded text
# ("üöÄ", "‚úÖ", "‚ùå") to the intended characters.
print("🚀 Running Instruction Following Benchmark")
print("=" * 60)
results = BenchmarkResults(notebook="07_instruction_following", timestamp=datetime.now().isoformat())

# Run every test in declaration order and report one status line per test.
for test in TESTS:
    result = run_test(test)
    results.tests.append(result)
    status = "✅" if result.passed else "❌"
    print(f"    {status} {result.constraints_passed}/{result.constraints_checked} constraints | {result.completion_time:.1f}s")

print(f"\n{'='*60}\n✅ Completed {len(results.tests)} tests")


🚀 Running Instruction Following Benchmark
  Running: JSON Output Only...


    ✅ 1/1 constraints | 5.1s
  Running: Word Count ~50...


    ❌ 0/1 constraints | 10.3s
  Running: Must Include Terms...


    ✅ 1/1 constraints | 14.0s
  Running: Must NOT Include...


    ✅ 1/1 constraints | 10.5s
  Running: Markdown Required...


    ✅ 1/1 constraints | 12.0s
  Running: Code Only...


    ✅ 1/1 constraints | 4.0s
  Running: Exactly 5 List Items...


    ✅ 1/1 constraints | 4.2s
  Running: No Code Blocks...


    ✅ 1/1 constraints | 21.1s
  Running: Multi: JSON + Include...


    ❌ 1/2 constraints | 1.7s
  Running: Multi: List + Word Limit...


    ❌ 1/2 constraints | 10.5s
  Running: Complex: 5 Constraints...


    ❌ 2/5 constraints | 10.0s
  Running: Negative: No Loops...


    ✅ 2/2 constraints | 2.0s

✅ Completed 12 tests


In [5]:
# Results summary
# Emoji in the displayed strings below are restored from mis-encoded text
# ("üìä", "üìà", "‚úÖ", "‚ùå") to the intended characters.
display(Markdown("## 📊 Results Summary"))

passed = sum(1 for t in results.tests if t.passed)
total = len(results.tests)

print(f"\n📈 Overall Statistics:")
print(f"   Test Pass Rate: {passed}/{total} ({results.pass_rate:.1f}%)")
print(f"   Average Score: {results.avg_score:.1f}%")
print(f"   Constraint Adherence: {results.constraint_adherence:.1f}%")
print(f"   Total Time: {sum(t.completion_time for t in results.tests):.1f}s")

# Detailed table: one row per test
print(f"\n{'Test':<28} {'Pass':^6} {'Score':^8} {'Constraints':^12}")
print("-" * 58)
for t in results.tests:
    print(f"{t.name:<28} {'✅' if t.passed else '❌':^6} {t.score:>5.0f}%   {t.constraints_passed}/{t.constraints_checked}")

# Show failures: list every failed constraint of every failed test
failures = [t for t in results.tests if not t.passed]
if failures:
    print("\n❌ Failed Constraints:")
    for t in failures:
        for cr in t.constraint_results:
            if not cr.passed:
                print(f"   {t.name}: {cr.name} - Expected: {cr.expected}, Got: {cr.actual}")


## 📊 Results Summary


📈 Overall Statistics:
   Test Pass Rate: 8/12 (66.7%)
   Average Score: 78.3%
   Constraint Adherence: 68.4%
   Total Time: 105.4s

Test                          Pass   Score   Constraints 
----------------------------------------------------------
JSON Output Only               ✅      100%   1/1
Word Count ~50                 ❌        0%   0/1
Must Include Terms             ✅      100%   1/1
Must NOT Include               ✅      100%   1/1
Markdown Required              ✅      100%   1/1
Code Only                      ✅      100%   1/1
Exactly 5 List Items           ✅      100%   1/1
No Code Blocks                 ✅      100%   1/1
Multi: JSON + Include          ❌       50%   1/2
Multi: List + Word Limit       ❌       50%   1/2
Complex: 5 Constraints         ❌       40%   2/5
Negative: No Loops             ✅      100%   2/2

❌ Failed Constraints:
   Word Count ~50: ~50 words - Expected: 42-57 words, Got: 0 words
   Multi: JSON + Include: JSON-only - Expec

In [6]:
# Save results
# Persist the run as JSON next to the notebook.
os.makedirs("benchmark_results", exist_ok=True)

output = {
    'notebook': results.notebook, 'timestamp': results.timestamp,
    'summary': {'pass_rate': results.pass_rate, 'avg_score': results.avg_score,
                'constraint_adherence': results.constraint_adherence, 'total': total, 'passed': passed},
    'tests': [{'name': t.name, 'passed': t.passed, 'score': t.score,
               'constraints': t.constraints_passed, 'total_constraints': t.constraints_checked,
               'constraint_results': [asdict(cr) for cr in t.constraint_results],
               'time': t.completion_time, 'tokens': t.tokens_used} for t in results.tests]
}

# Explicit encoding so the file does not depend on the platform default.
with open("benchmark_results/07_instruction_following.json", 'w', encoding="utf-8") as f:
    json.dump(output, f, indent=2, default=str)
# Emoji below are restored from mis-encoded text ("‚úÖ", "üìã").
print("✅ Results saved to benchmark_results/07_instruction_following.json")

# Summary for feedback: adherence above 90 is 'excellent', above 75 is 'good', otherwise 'moderate'
quality = 'excellent' if results.constraint_adherence > 90 else 'good' if results.constraint_adherence > 75 else 'moderate'
display(Markdown(f"""
## 📋 Feedback Summary

**Model**: {client.model} | **Date**: {results.timestamp[:10]}

| Metric | Value |
|--------|-------|
| Test Pass Rate | {passed}/{total} ({results.pass_rate:.0f}%) |
| Constraint Adherence | {results.constraint_adherence:.0f}% |
| Average Score | {results.avg_score:.0f}% |

**Observations**: Model demonstrates **{quality}** instruction following capability.
"""))


✅ Results saved to benchmark_results/07_instruction_following.json



## 📋 Feedback Summary

**Model**: MiniMax-M2.1 | **Date**: 2025-12-30

| Metric | Value |
|--------|-------|
| Test Pass Rate | 8/12 (67%) |
| Constraint Adherence | 68% |
| Average Score | 78% |

**Observations**: Model demonstrates **moderate** instruction following capability.
