# Code Correctness Validation

This notebook validates that generated code is syntactically correct and follows best practices.

**Tests:**
1. Python syntax validation via `ast.parse()`
2. TypeScript syntax validation via `tsc` or heuristics
3. JSON schema compliance
4. Code complexity metrics (if radon installed)
5. Multi-language generation test (Python, TypeScript, JSON, SQL)


In [1]:
# Setup and imports
import sys
import os
import time
import json
import ast
import re
import tempfile
import subprocess
from dataclasses import dataclass, field, asdict
from typing import Optional, Any
from datetime import datetime
from IPython.display import display, Markdown

sys.path.insert(0, '..')
from dotenv import load_dotenv
load_dotenv('../.env')

from src.minimax_client import MiniMaxClient

# Optional: radon for complexity metrics
try:
    from radon.complexity import cc_visit
    from radon.metrics import mi_visit
    HAS_RADON = True
except ImportError:
    HAS_RADON = False
    print("‚ö†Ô∏è radon not installed - pip install radon for complexity metrics")

# Optional: pyflakes for lint checking
try:
    from pyflakes import api as pyflakes_api
    from pyflakes import reporter as pyflakes_reporter
    HAS_PYFLAKES = True
except ImportError:
    HAS_PYFLAKES = False
    print("‚ö†Ô∏è pyflakes not installed - pip install pyflakes for lint checking")

print(f"‚úì Setup complete | radon: {HAS_RADON} | pyflakes: {HAS_PYFLAKES}")


‚úì Setup complete | radon: True | pyflakes: True


In [2]:
# Data structures
@dataclass
class ValidationResult:
    valid: bool
    language: str
    errors: list = field(default_factory=list)
    warnings: list = field(default_factory=list)
    metrics: dict = field(default_factory=dict)

@dataclass 
class TestResult:
    name: str
    passed: bool
    score: float
    details: dict = field(default_factory=dict)
    validation: Optional[ValidationResult] = None
    completion_time: float = 0.0
    tokens_used: int = 0

@dataclass
class BenchmarkResults:
    notebook: str
    timestamp: str
    tests: list = field(default_factory=list)
    
    @property
    def pass_rate(self) -> float:
        return sum(1 for t in self.tests if t.passed) / len(self.tests) * 100 if self.tests else 0
    
    @property
    def avg_score(self) -> float:
        return sum(t.score for t in self.tests) / len(self.tests) if self.tests else 0

# Initialize client
client = MiniMaxClient()
print(f"‚úì Client initialized | Model: {client.model}")


‚úì Client initialized | Model: MiniMax-M2.1


In [3]:
# Validation functions
def extract_code(response: str, lang: str = None) -> str:
    """Extract code from model response."""
    response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    pattern = rf'```{lang}\s*\n(.*?)```' if lang else r'```(?:\w+)?\s*\n(.*?)```'
    matches = re.findall(pattern, response, re.DOTALL | re.IGNORECASE)
    return matches[0].strip() if matches else response.strip()

def validate_python(code: str) -> ValidationResult:
    """Validate Python syntax."""
    errors, warnings, metrics = [], [], {'lines': len(code.split('\n')), 'chars': len(code)}
    try:
        ast.parse(code)
        valid = True
    except SyntaxError as e:
        valid = False
        errors.append(f"Line {e.lineno}: {e.msg}")
    
    if HAS_PYFLAKES and valid:
        import io
        w, e = io.StringIO(), io.StringIO()
        pyflakes_api.check(code, '<gen>', pyflakes_reporter.Reporter(w, e))
        if w.getvalue(): warnings.extend(w.getvalue().strip().split('\n'))
        if e.getvalue(): errors.extend(e.getvalue().strip().split('\n'))
    
    if HAS_RADON and valid:
        try:
            cc = cc_visit(code)
            if cc:
                metrics['avg_complexity'] = round(sum(c.complexity for c in cc) / len(cc), 2)
                metrics['max_complexity'] = max(c.complexity for c in cc)
            metrics['maintainability'] = round(mi_visit(code, True), 2)
        except: pass
    
    return ValidationResult(valid=valid, language='python', errors=errors, warnings=warnings, metrics=metrics)

def validate_typescript(code: str) -> ValidationResult:
    """Validate TypeScript syntax."""
    errors, warnings, metrics = [], [], {'lines': len(code.split('\n')), 'chars': len(code)}
    
    # Basic bracket balance check
    valid = True
    for open_b, close_b, name in [('{', '}', 'braces'), ('(', ')', 'parens'), ('[', ']', 'brackets')]:
        diff = code.count(open_b) - code.count(close_b)
        if diff != 0:
            errors.append(f"Unbalanced {name}: {diff:+d}")
            valid = False
    
    metrics['has_types'] = bool(re.search(r':\s*(string|number|boolean|any|\w+\[\])', code))
    metrics['has_interfaces'] = 'interface ' in code
    metrics['uses_any'] = bool(re.search(r':\s*any\b', code))
    
    return ValidationResult(valid=valid, language='typescript', errors=errors, warnings=warnings, metrics=metrics)

def validate_json(code: str) -> ValidationResult:
    """Validate JSON syntax."""
    errors, metrics = [], {'chars': len(code)}
    try:
        parsed = json.loads(code)
        valid = True
        metrics['type'] = 'object' if isinstance(parsed, dict) else 'array' if isinstance(parsed, list) else type(parsed).__name__
        metrics['size'] = len(parsed) if isinstance(parsed, (dict, list)) else 1
    except json.JSONDecodeError as e:
        valid = False
        errors.append(f"Line {e.lineno}: {e.msg}")
    return ValidationResult(valid=valid, language='json', errors=errors, metrics=metrics)

def validate_sql(code: str) -> ValidationResult:
    """Validate SQL syntax."""
    errors, warnings, metrics = [], [], {'lines': len(code.split('\n'))}
    sql_kw = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'FROM', 'WHERE']
    upper = code.upper()
    
    valid = any(kw in upper for kw in sql_kw)
    if not valid: errors.append("No SQL keywords found")
    if code.count('(') != code.count(')'): 
        errors.append("Unbalanced parentheses")
        valid = False
    if 'SELECT *' in upper: warnings.append("Uses SELECT *")
    
    metrics['has_where'] = 'WHERE' in upper
    metrics['has_join'] = 'JOIN' in upper
    return ValidationResult(valid=valid, language='sql', errors=errors, warnings=warnings, metrics=metrics)

print("‚úì Validation functions defined")


‚úì Validation functions defined


In [4]:
# Test suite
TESTS = [
    {"name": "Python - Binary Search", "lang": "python", "validator": validate_python,
     "prompt": "Write a Python binary search function with type hints and docstring. Output ONLY code.",
     "requirements": ["def ", "return"]},
    {"name": "Python - Async HTTP", "lang": "python", "validator": validate_python,
     "prompt": "Write a Python async function using aiohttp to fetch a URL with error handling. Output ONLY code.",
     "requirements": ["async def", "await"]},
    {"name": "Python - Dataclass", "lang": "python", "validator": validate_python,
     "prompt": "Write a Python dataclass User with id, name, email fields and an email validation method. Output ONLY code.",
     "requirements": ["@dataclass", "class User"]},
    {"name": "TypeScript - React Component", "lang": "typescript", "validator": validate_typescript,
     "prompt": "Write a TypeScript React Button component with label, onClick, disabled props. Use proper types. Output ONLY code.",
     "requirements": ["interface", "React"]},
    {"name": "TypeScript - API Service", "lang": "typescript", "validator": validate_typescript,
     "prompt": "Write a TypeScript ApiService class with get<T>, post<T>, delete methods using generics. Output ONLY code.",
     "requirements": ["class", "async", "<T>"]},
    {"name": "TypeScript - Custom Hook", "lang": "typescript", "validator": validate_typescript,
     "prompt": "Write a TypeScript useLocalStorage hook that syncs state with localStorage. Output ONLY code.",
     "requirements": ["useLocalStorage", "useState"]},
    {"name": "JSON - User Profile", "lang": "json", "validator": validate_json,
     "prompt": "Generate a JSON user profile with id, username, email, profile object (bio, avatar), created_at. Output ONLY JSON.",
     "requirements": ["id", "username", "profile"]},
    {"name": "JSON - Config File", "lang": "json", "validator": validate_json,
     "prompt": "Generate a JSON config with database (host, port), cache (ttl), feature_flags array. Output ONLY JSON.",
     "requirements": ["database", "cache"]},
    {"name": "SQL - Top Customers", "lang": "sql", "validator": validate_sql,
     "prompt": "Write SQL to find top 10 customers by order value with JOINs. Output ONLY SQL.",
     "requirements": ["SELECT", "JOIN", "ORDER BY"]},
    {"name": "SQL - Create Table", "lang": "sql", "validator": validate_sql,
     "prompt": "Write SQL to create products table with id, name, price, category_id FK. Output ONLY SQL.",
     "requirements": ["CREATE TABLE", "PRIMARY KEY"]},
]
print(f"‚úì {len(TESTS)} tests defined")


‚úì 10 tests defined


In [5]:
# Test runner
def run_test(test: dict) -> TestResult:
    print(f"  Running: {test['name']}...")
    try:
        start = time.perf_counter()
        response = client.chat([
            {"role": "system", "content": f"You are an expert {test['lang']} developer. Output ONLY code."},
            {"role": "user", "content": test['prompt']}
        ], max_tokens=2048, temperature=0.3)
        elapsed = time.perf_counter() - start
        
        content = response.choices[0].message.content
        code = extract_code(content, test['lang'])
        validation = test['validator'](code)
        
        # Check requirements
        req_met = sum(1 for r in test['requirements'] if r.lower() in code.lower())
        req_score = req_met / len(test['requirements']) * 100
        
        # Score: 50% syntax, 50% requirements, penalty for errors
        score = max(0, (100 if validation.valid else 0) * 0.5 + req_score * 0.5 - len(validation.errors) * 10)
        passed = validation.valid and req_score >= 75
        
        return TestResult(name=test['name'], passed=passed, score=round(score, 1),
            details={'req_met': f"{req_met}/{len(test['requirements'])}", 'errors': len(validation.errors)},
            validation=validation, completion_time=elapsed, tokens_used=response.usage.completion_tokens)
    except Exception as e:
        return TestResult(name=test['name'], passed=False, score=0, details={'error': str(e)})

# Run tests
print("üöÄ Running Code Correctness Tests")
print("=" * 60)
results = BenchmarkResults(notebook="06_code_correctness", timestamp=datetime.now().isoformat())

for test in TESTS:
    result = run_test(test)
    results.tests.append(result)
    status = "‚úÖ" if result.passed else "‚ùå"
    print(f"    {status} Score: {result.score}/100 | {result.completion_time:.1f}s")

print(f"\n{'='*60}\n‚úÖ Completed {len(results.tests)} tests")


üöÄ Running Code Correctness Tests
  Running: Python - Binary Search...


    ‚úÖ Score: 100.0/100 | 4.8s
  Running: Python - Async HTTP...


    ‚úÖ Score: 100.0/100 | 5.0s
  Running: Python - Dataclass...


    ‚ùå Score: 50.0/100 | 21.4s
  Running: TypeScript - React Component...


    ‚úÖ Score: 100.0/100 | 10.9s
  Running: TypeScript - API Service...


    ‚úÖ Score: 100.0/100 | 21.6s
  Running: TypeScript - Custom Hook...


    ‚úÖ Score: 100.0/100 | 15.5s
  Running: JSON - User Profile...


    ‚úÖ Score: 100.0/100 | 3.7s
  Running: JSON - Config File...


    ‚úÖ Score: 100.0/100 | 3.4s
  Running: SQL - Top Customers...


    ‚úÖ Score: 100.0/100 | 4.6s
  Running: SQL - Create Table...


    ‚úÖ Score: 100.0/100 | 13.4s

‚úÖ Completed 10 tests


In [6]:
# Results summary
display(Markdown("## üìä Results Summary"))

passed = sum(1 for t in results.tests if t.passed)
total = len(results.tests)
syntax_valid = sum(1 for t in results.tests if t.validation and t.validation.valid)

print(f"\nüìà Overall Statistics:")
print(f"   Pass Rate: {passed}/{total} ({results.pass_rate:.1f}%)")
print(f"   Syntax Valid: {syntax_valid}/{total} ({syntax_valid/total*100:.1f}%)")
print(f"   Average Score: {results.avg_score:.1f}/100")
print(f"   Total Time: {sum(t.completion_time for t in results.tests):.1f}s")

# By language
print(f"\nüìä By Language:")
langs = {}
for t in results.tests:
    lang = t.validation.language if t.validation else 'unknown'
    if lang not in langs: langs[lang] = {'passed': 0, 'total': 0, 'scores': []}
    langs[lang]['total'] += 1
    langs[lang]['scores'].append(t.score)
    if t.passed: langs[lang]['passed'] += 1

for lang, s in langs.items():
    print(f"   {lang.upper():12} | {s['passed']}/{s['total']} passed | Avg: {sum(s['scores'])/len(s['scores']):.0f}")

# Detailed table
print(f"\n{'Test':<30} {'Pass':^6} {'Score':^8} {'Errors':^8}")
print("-" * 55)
for t in results.tests:
    print(f"{t.name:<30} {'‚úÖ' if t.passed else '‚ùå':^6} {t.score:>5.0f}   {t.details.get('errors', 'N/A'):^8}")


## üìä Results Summary


üìà Overall Statistics:
   Pass Rate: 9/10 (90.0%)
   Syntax Valid: 10/10 (100.0%)
   Average Score: 95.0/100
   Total Time: 104.4s

üìä By Language:
   PYTHON       | 2/3 passed | Avg: 83
   TYPESCRIPT   | 3/3 passed | Avg: 100
   JSON         | 2/2 passed | Avg: 100
   SQL          | 2/2 passed | Avg: 100

Test                            Pass   Score    Errors 
-------------------------------------------------------
Python - Binary Search           ‚úÖ      100      0    
Python - Async HTTP              ‚úÖ      100      0    
Python - Dataclass               ‚ùå       50      0    
TypeScript - React Component     ‚úÖ      100      0    
TypeScript - API Service         ‚úÖ      100      0    
TypeScript - Custom Hook         ‚úÖ      100      0    
JSON - User Profile              ‚úÖ      100      0    
JSON - Config File               ‚úÖ      100      0    
SQL - Top Customers              ‚úÖ      100      0    
SQL - Create Table               ‚úÖ      100      0    


In [7]:
# Save results
os.makedirs("benchmark_results", exist_ok=True)

output = {
    'notebook': results.notebook, 'timestamp': results.timestamp,
    'summary': {'pass_rate': results.pass_rate, 'avg_score': results.avg_score,
                'syntax_valid_rate': syntax_valid/total*100, 'total': total, 'passed': passed},
    'by_language': {l: {'passed': s['passed'], 'total': s['total'], 'avg_score': sum(s['scores'])/len(s['scores'])} 
                   for l, s in langs.items()},
    'tests': [{'name': t.name, 'passed': t.passed, 'score': t.score, 'details': t.details,
               'validation': asdict(t.validation) if t.validation else None,
               'time': t.completion_time, 'tokens': t.tokens_used} for t in results.tests]
}

with open("benchmark_results/06_code_correctness.json", 'w') as f:
    json.dump(output, f, indent=2, default=str)
print("‚úÖ Results saved to benchmark_results/06_code_correctness.json")

# Summary for feedback
display(Markdown(f"""
## üìã Feedback Summary

**Model**: {client.model} | **Date**: {results.timestamp[:10]}

| Metric | Value |
|--------|-------|
| Syntax Validity | {syntax_valid}/{total} ({syntax_valid/total*100:.0f}%) |
| Pass Rate | {passed}/{total} ({results.pass_rate:.0f}%) |
| Average Score | {results.avg_score:.0f}/100 |

**Observations**: Model {'generates syntactically valid code consistently' if syntax_valid/total > 0.8 else 'has some syntax issues'}.
"""))


‚úÖ Results saved to benchmark_results/06_code_correctness.json



## üìã Feedback Summary

**Model**: MiniMax-M2.1 | **Date**: 2025-12-30

| Metric | Value |
|--------|-------|
| Syntax Validity | 10/10 (100%) |
| Pass Rate | 9/10 (90%) |
| Average Score | 95/100 |

**Observations**: Model generates syntactically valid code consistently.
