# Evaluation

Auswertung der Runs - Durchschnittswerte über alle Runs pro Model+Strategy

In [71]:
import sys
sys.path.append('..')

import sqlite3
from survey import scoring
import pandas as pd

connection = sqlite3.connect('../data/survey.db')
cursor = connection.cursor()

In [72]:
# Alle verfügbaren Strategien ermitteln

cursor.execute("""
    SELECT DISTINCT s.name
    FROM runs r
    JOIN strategies s ON r.strategy_id = s.id
    ORDER BY s.name
""")

strategies = [row[0] for row in cursor.fetchall()]
print(f"Verfügbare Strategien: {strategies}")

Verfügbare Strategien: ['oneshot_llm_explicit', 'oneshot_llm_opinion', 'oneshot_none', 'oneshot_test', 'questionbyquestion_llm_explicit', 'questionbyquestion_llm_opinion', 'questionbyquestion_none', 'questionbyquestion_test']


In [73]:
# Evaluation für jede Strategie separat

for strategy_name in strategies:
    
    print(f"\n{'='*80}")
    print(f"STRATEGIE: {strategy_name}")
    print(f"{'='*80}\n")
    
    # Model-Kombinationen für diese Strategie
    cursor.execute("""
        SELECT 
            m.id as model_id,
            m.name as model_name,
            s.id as strategy_id,
            s.name as strategy_name,
            COUNT(DISTINCT r.id) as run_count
        FROM runs r
        JOIN models m ON r.model_id = m.id
        JOIN strategies s ON r.strategy_id = s.id
        WHERE s.name = ?
        GROUP BY m.id, s.id
        ORDER BY m.name
    """, (strategy_name,))
    
    combinations = cursor.fetchall()
    
    results = []
    
    for model_id, model_name, strategy_id, strategy_name_db, run_count in combinations:
        
        # Durchschnittliche Antworten über alle Runs
        cursor.execute("""
            SELECT 
                resp.question_id,
                AVG(resp.answer) as avg_answer
            FROM responses resp
            JOIN runs r ON resp.run_id = r.id
            WHERE r.model_id = ? AND r.strategy_id = ?
            GROUP BY resp.question_id
            ORDER BY resp.question_id
        """, (model_id, strategy_id))
        
        rows = cursor.fetchall()
        responses = {q_id: avg_answer for q_id, avg_answer in rows}
        
        # Scoring
        all_scores = scoring.calculate_all_milieu_scores(responses)
        result = scoring.get_primary_milieu(all_scores)
        
        results.append({
            'Model': model_name,
            'Runs': run_count,
            'Primary Milieu': result['primary_milieu'],
            'Probability': f"{result['primary_probability']:.3f}" if result['primary_probability'] != '---' else '---',
            'Secondary Milieu': result['secondary_milieu'],
            'Sec. Prob.': f"{result['secondary_probability']:.3f}" if result['secondary_probability'] != '---' else '---',
            'Unclear Candidates': result['candidates'],
            'Unclear Probabilities': result['probabilities'],
            'Confidence': result['confidence']
        })
    
    # Output Tabelle
    df = pd.DataFrame(results)
    print(df.to_string(index=False))
    print()


STRATEGIE: oneshot_llm_explicit

            Model  Runs       Primary Milieu Probability Secondary Milieu Sec. Prob.              Unclear Candidates                    Unclear Probabilities Confidence
   Claude Haikaku     4 Adaptiv-Pragmatische       0.562          Prekäre      0.454                             ---                                      ---   moderate
      Claude Opus     4 Adaptiv-Pragmatische       0.659              ---        ---                             ---                                      ---       high
    Deepseek Chat     4                  ---         ---              ---        --- [Adaptiv-Pragmatische, Prekäre] [0.5043603545751304, 0.4869821093076748]        low
Deepseek Reasoner     4 Adaptiv-Pragmatische       0.681          Prekäre      0.590                             ---                                      ---   moderate
          GPT-5.2     4                  ---         ---              ---        --- [Adaptiv-Pragmatische, Prekäre] [0.4

## Sinus-Milieus: Kurzbeschreibungen

### Obere Schicht / Liberal-Intellektuell
- **Konservativ-Etablierte**: Klassisches Establishment, Exklusivität, Statusbewusstsein, Verantwortungsethik
- **Postmaterielle**: Bildungselite, intellektuell, Umweltbewusstsein, Kulturorientierung
- **Performer**: Effizienzorientiert, technokratisch, Leistung, globalwirtschaftlich

### Moderne Mitte
- **Expeditive**: Kreativ, urban, Digital Natives, Trendsetter, kosmopolitisch
- **Neo-Ökologische**: Transformations-Treiber, optimistischer Environmentalismus, systemischer Wandel
- **Adaptiv-Pragmatische**: Moderne Mitte, flexibel, pragmatisch, moderate Werte **OHNE Extreme**

### Untere Mitte / Unterschicht
- **Konsum-Hedonistische**: Unterhaltung, Fun, Shopping, Markenbewusst, Work-Life-Balance
- **Prekäre**: Ökonomische Restriktionen, Anerkennungssuche, Teilhabe-Wunsch
- **Nostalgisch-Bürgerliche**: Harmonie, Sicherheit, Gemeinschaft, Stabilität, traditionell
- **Traditionelle**: Ältere Generation, kleinbürgerlich, Ordnung, Sicherheit, Anti-Modern