# Paper 5: Can Perplexity Detect AI-Generated Text?

**Forschungsfrage**: Kann Perplexity als Metrik genutzt werden, um AI-generierten Text von humanem Text zu unterscheiden?

**Autoren**: Mimar Sinan Yildiz, [Kommilitone Name]

**Datum**: 2026-02-07

**Daten**: 30 Human-Texte + 30 AI-Texte

---

## 📋 Notebook-Struktur
1. **Setup & Konfiguration**
2. **Daten laden & validieren**
3. **Perplexity berechnen**
4. **Experiment 1: Statistischer Verteilungsvergleich**
5. **Experiment 2: Klassifikations-Performance**
6. **Experiment 3: Error Analysis**
7. **Zusammenfassung & Export**



---

## 1. Setup & Konfiguration

Importiere alle benötigten Bibliotheken und setze Konstanten.

In [None]:
# ============================================================================
# IMPORTS & INSTALL
# ============================================================================

# Bibliotheken installieren
%pip install -r requirements.txt

# Standard-Bibliotheken
import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import warnings

warnings.filterwarnings('ignore')

# PyTorch für GPT-2
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Statistische Analysen
from scipy import stats
from scipy.stats import ttest_ind

# Machine Learning Metriken
from sklearn.metrics import (
    roc_curve, auc, accuracy_score,
    precision_score, recall_score, f1_score,
    confusion_matrix
)

# Visualisierung
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style.
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*";
# fall back to the older name so the notebook also runs on earlier installs.
_PLOT_STYLE = 'seaborn-v0_8-darkgrid'
if _PLOT_STYLE not in plt.style.available:
    _PLOT_STYLE = 'seaborn-darkgrid'
plt.style.use(_PLOT_STYLE)
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6), 'font.size': 11})

print("✓ Alle Bibliotheken erfolgreich importiert!")

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Directories
DATA_DIR = Path("data")
RESULTS_DIR = Path("results")
FIGURES_DIR = Path("figures")

# Create the output directories; parents=True keeps this working if one of
# them is later pointed at a nested path.
for _out_dir in (RESULTS_DIR, FIGURES_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Input / output files
HUMAN_TEXTS_FILE = DATA_DIR / "human_texts.json"
AI_TEXTS_FILE = DATA_DIR / "ai_texts.json"
COMBINED_CSV_FILE = DATA_DIR / "combined_data.csv"

# Model configuration
MODEL_NAME = "gpt2"  # Alternative: "gpt2-medium"
MAX_TOKEN_LENGTH = 512  # texts are truncated to this many tokens before scoring

# Text validation: allowed word-count range (inclusive)
MIN_WORD_COUNT = 50
MAX_WORD_COUNT = 150

# Reproducibility: seed NumPy and PyTorch
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is available, otherwise the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("✓ Konfiguration geladen")
print(f"  - Gerät: {DEVICE}")
print(f"  - Modell: {MODEL_NAME}")
print(f"  - Max Token Length: {MAX_TOKEN_LENGTH}")

---

## 2. Daten laden & validieren

Lade die JSON-Dateien und prüfe Grundanforderungen (Wortanzahl, Felder).

In [None]:
def load_texts(path: Path) -> List[Dict]:
    """Load the list of text records from a JSON file.

    Two layouts are accepted: a JSON object whose ``"texts"`` key holds the
    list of records, or a bare top-level list of records.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        The list of records. An object without a ``"texts"`` key yields an
        empty list.

    Raises:
        ValueError: If the document is neither an object nor a list, or if
            ``"texts"`` is present but not a list.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    # Only objects have .get(); a bare list is taken as the records themselves.
    texts = data.get("texts", []) if isinstance(data, dict) else data
    if not isinstance(texts, list):
        raise ValueError(f"Ungültiges Format in {path}")
    return texts


def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in ``text``."""
    # str.split() without arguments already ignores leading/trailing whitespace.
    words = text.split()
    return len(words)


human_texts = load_texts(HUMAN_TEXTS_FILE)
ai_texts = load_texts(AI_TEXTS_FILE)

print(f"Human-Texte: {len(human_texts)}")
print(f"AI-Texte:    {len(ai_texts)}")

# Basic validation: collect (id, word count) for every text whose length lies
# outside [MIN_WORD_COUNT, MAX_WORD_COUNT]. A record without a "text" field is
# counted as zero words.
bad_items = []
for item in human_texts + ai_texts:
    wc = word_count(item.get("text", ""))
    if not MIN_WORD_COUNT <= wc <= MAX_WORD_COUNT:
        bad_items.append((item.get("id"), wc))

if bad_items:
    # Print the configured bounds so the message cannot drift from the check.
    print(f"Warnung: Texte außerhalb der Wortanzahl {MIN_WORD_COUNT}–{MAX_WORD_COUNT}:")
    for tid, wc in bad_items:
        print(f"  - {tid}: {wc} Wörter")
else:
    print("✓ Alle Texte im erlaubten Wortbereich")

---

## 3. Perplexity berechnen

Berechne Perplexity für alle Texte und speichere `data/combined_data.csv`.

In [None]:
# Load the language model, move it to DEVICE and put it into evaluation mode.
print(f"Lade Modell: {MODEL_NAME} auf {DEVICE}")
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Tokenizer belonging to the same checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)


def calculate_perplexity(text: str) -> float:
    """Return the perplexity of ``text`` under the loaded model.

    The text is tokenized (truncated to ``MAX_TOKEN_LENGTH`` tokens), the model
    is run with the input ids as labels, and the exponential of the returned
    loss is reported.

    Args:
        text: The text to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    batch = tokenizer(
        text,
        truncation=True,
        max_length=MAX_TOKEN_LENGTH,
        return_tensors="pt",
    )
    # Move every tensor of the encoding onto DEVICE.
    inputs = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
    with torch.no_grad():  # scoring only, no gradients needed
        result = model(labels=inputs["input_ids"], **inputs)
    perplexity = torch.exp(result.loss).item()
    return float(perplexity)


# Score every text and collect one row per text. Both corpora go through the
# same loop so the row layout cannot drift apart between them.
RECORD_COLUMNS = ["id", "label", "text", "source", "topic", "word_count", "perplexity"]
records = []

for label, display_name, items in (
    ("human", "Human", human_texts),
    ("ai", "AI", ai_texts),
):
    for item in items:
        text = item.get("text", "")
        if not text:
            # Records without text cannot be scored and are skipped.
            continue
        ppl = calculate_perplexity(text)
        records.append({
            "id": item.get("id"),
            "label": label,
            "text": text,
            "source": item.get("source"),
            "topic": item.get("topic"),
            "word_count": word_count(text),
            "perplexity": ppl,
        })
        print(f"{display_name} {item.get('id')}: {ppl:.2f}")


# Passing the column list keeps the expected columns present even when no
# record was collected.
df = pd.DataFrame(records, columns=RECORD_COLUMNS)
df.to_csv(COMBINED_CSV_FILE, index=False)
print(f"\n✓ Gespeichert: {COMBINED_CSV_FILE} ({len(df)} Zeilen)")

---

## 4. Experiment 1: Statistischer Verteilungsvergleich

Vergleicht die Perplexity-Verteilungen von Human- und AI-Texten:
- Deskriptive Statistik (Mean, Std, Min, Max, Median)
- 95% Konfidenzintervalle
- t-Test für signifikante Unterschiede
- Cohen's d Effektgröße
- Boxplot Visualisierung

In [None]:
# ===========================================================================
# EXPERIMENT 1: STATISTICAL COMPARISON OF THE DISTRIBUTIONS
# ===========================================================================

print("="*70)
print("EXPERIMENT 1: STATISTISCHER VERTEILUNGSVERGLEICH")
print("="*70)


# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------

def calculate_confidence_interval(data, confidence=0.95):
    """Return the t-based confidence interval for the mean of ``data``.

    Args:
        data: Sample values.
        confidence: Two-sided confidence level (default 0.95).

    Returns:
        Tuple ``(lower, upper)`` centred on the sample mean.
    """
    n = len(data)
    mean = np.mean(data)
    se = stats.sem(data)  # standard error of the mean
    # Two-sided critical value of the t distribution with n - 1 degrees of freedom.
    t_value = stats.t.ppf((1 + confidence) / 2, n - 1)
    margin = t_value * se
    return (mean - margin, mean + margin)


def cohens_d(group1, group2):
    """Return Cohen's d for two samples, based on the pooled standard deviation.

    The sign follows ``mean(group1) - mean(group2)``.
    """
    n1, n2 = len(group1), len(group2)
    var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    return (np.mean(group1) - np.mean(group2)) / pooled_std


def interpret_cohens_d(d):
    """Map ``|d|`` to a label using the cut-offs 0.2, 0.5 and 0.8."""
    abs_d = abs(d)
    if abs_d < 0.2:
        return "Negligible"
    elif abs_d < 0.5:
        return "Small"
    elif abs_d < 0.8:
        return "Medium"
    else:
        return "Large"


def _describe(values):
    """Return the descriptive statistics of one group (sample std, ddof=1)."""
    return {
        'n': len(values),
        'mean': float(np.mean(values)),
        'std': float(np.std(values, ddof=1)),
        'min': float(np.min(values)),
        'max': float(np.max(values)),
        'median': float(np.median(values)),
        'ci': calculate_confidence_interval(values),
    }


# ---------------------------------------------------------------------------
# Descriptive statistics
# ---------------------------------------------------------------------------

# Split the data by label
df_human = df[df['label'] == 'human'].copy()
df_ai = df[df['label'] == 'ai'].copy()

human_ppls = df_human['perplexity'].values
ai_ppls = df_ai['perplexity'].values

# Compute the statistics per group
stats_results = {
    'human': _describe(human_ppls),
    'ai': _describe(ai_ppls),
}

# Table 1: descriptive statistics
print("\n" + "="*70)
print("TABELLE 1: Deskriptive Statistik")
print("-"*70)
print(f"{'Source':<12} {'n':<5} {'Mean':>8} {'Std':>8} {'95% CI':>20} {'Min':>8} {'Max':>8}")
print("-"*70)

for label in ['human', 'ai']:
    s = stats_results[label]
    ci_str = f"[{s['ci'][0]:.2f}, {s['ci'][1]:.2f}]"
    print(f"{label.capitalize():<12} {s['n']:<5} {s['mean']:>8.2f} {s['std']:>8.2f} {ci_str:>20} {s['min']:>8.2f} {s['max']:>8.2f}")

print("-"*70)

# ---------------------------------------------------------------------------
# Statistical test
# ---------------------------------------------------------------------------

print("\n" + "="*70)
print("STATISTISCHER HYPOTHESENTEST")
print("="*70)
print("\nNull-Hypothese (H₀): μ_human = μ_AI")
print("Alternativ-Hypothese (H₁): μ_human ≠ μ_AI")
print("Signifikanz-Level: α = 0.05\n")

# Levene test for homogeneity of variances; its outcome selects between the
# pooled-variance t-test and Welch's t-test below.
levene_stat, levene_p = stats.levene(human_ppls, ai_ppls)
equal_var = levene_p > 0.05
print(f"Levene-Test: p = {levene_p:.4f} → Varianzen {'gleich' if equal_var else 'ungleich'}")

# t-test
t_statistic, p_value = ttest_ind(human_ppls, ai_ppls, equal_var=equal_var)

# Degrees of freedom: n1 + n2 - 2 for the pooled test, otherwise the
# Welch-Satterthwaite approximation.
if equal_var:
    df_test = len(human_ppls) + len(ai_ppls) - 2
else:
    n1, n2 = len(human_ppls), len(ai_ppls)
    var1, var2 = np.var(human_ppls, ddof=1), np.var(ai_ppls, ddof=1)
    df_test = ((var1/n1 + var2/n2)**2) / ((var1/n1)**2/(n1-1) + (var2/n2)**2/(n2-1))

# Cohen's d
effect_size = cohens_d(human_ppls, ai_ppls)
effect_interp = interpret_cohens_d(effect_size)

is_significant = p_value < 0.05

# Table 2: test results
print("\nTABELLE 2: Statistische Test-Ergebnisse")
print("-"*70)
print(f"t-Statistik:     {t_statistic:.3f}")
print(f"p-Wert:          {p_value:.4f}  {'→ Signifikant!' if is_significant else '→ Nicht signifikant'}")
print(f"Freiheitsgrade:  {df_test:.1f}")
print(f"Cohen's d:       {effect_size:.3f}  → {effect_interp}")
print("-"*70)

# Interpretation
print("\nINTERPRETATION:")
if is_significant:
    direction = "höher" if stats_results['human']['mean'] > stats_results['ai']['mean'] else "niedriger"
    print(f"✓ Es gibt einen statistisch signifikanten Unterschied (p < 0.05).")
    print(f"  Human-Texte haben {direction}e Perplexity als AI-Texte.")
    print(f"  Effektgröße: {effect_interp}")
else:
    print(f"✗ Kein statistisch signifikanter Unterschied (p > 0.05).")

# Save the results. The JSON payload is built as a separate structure so that
# stats_results itself is not modified by the export.
save_stats = {
    label: {**group, 'ci': [float(bound) for bound in group['ci']]}
    for label, group in stats_results.items()
}
save_stats['test'] = {
    't_statistic': float(t_statistic),
    'p_value': float(p_value),
    'cohens_d': float(effect_size),
    'is_significant': bool(is_significant)
}
with open(RESULTS_DIR / "experiment1_statistics.json", 'w', encoding="utf-8") as f:
    json.dump(save_stats, f, indent=2)

print(f"\n✓ Ergebnisse gespeichert: {RESULTS_DIR / 'experiment1_statistics.json'}")

In [None]:
# ---------------------------------------------------------------------------
# Visualization: box plot
# ---------------------------------------------------------------------------

print("\nErstelle Boxplot...")

fig, ax = plt.subplots(figsize=(10, 7))

# Box plot; the group means are drawn as red diamonds.
data_for_plot = [human_ppls, ai_ppls]
labels = ['Human', 'AI']
bp = ax.boxplot(data_for_plot, patch_artist=True,
                showmeans=True, meanprops={'marker': 'D', 'markerfacecolor': 'red', 'markersize': 8})
# Name the boxes through the axis instead of boxplot(labels=...): that keyword
# was renamed to tick_labels in matplotlib 3.9 and the old spelling is
# deprecated, whereas set_xticklabels works on old and new versions alike.
ax.set_xticklabels(labels)

# Colors
colors = ['lightblue', 'lightcoral']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

# Confidence intervals: errorbar expects a (2, n) array holding the distances
# below and above each mean.
means = [stats_results['human']['mean'], stats_results['ai']['mean']]
cis = [stats_results['human']['ci'], stats_results['ai']['ci']]
errors = np.array([[mean - low, high - mean] for mean, (low, high) in zip(means, cis)]).T
ax.errorbar([1, 2], means, yerr=errors, fmt='none', ecolor='darkred',
            elinewidth=2, capsize=5, capthick=2, label='95% CI')

# Titles and axis labels
ax.set_title('Perplexity Distribution: Human vs. AI Text', fontsize=14, fontweight='bold')
ax.set_ylabel('Perplexity', fontsize=12)
ax.set_xlabel('Source', fontsize=12)
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3, axis='y')

# Box with the test statistics in the upper-left corner
stats_text = (
    f"t = {t_statistic:.3f}\n"
    f"p = {p_value:.4f}\n"
    f"d = {effect_size:.3f} ({effect_interp})\n"
    f"{'Signifikant ✓' if is_significant else 'Nicht signifikant ✗'}"
)
ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=10,
        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig(FIGURES_DIR / "experiment1_boxplot.png", dpi=300, bbox_inches='tight')
print(f"✓ Plot gespeichert: {FIGURES_DIR / 'experiment1_boxplot.png'}")
plt.show()

---

## 5. Experiment 2: Klassifikations-Performance

Testet wie gut Perplexity als AI-Detektor funktioniert:
- ROC-Kurve für alle Thresholds
- AUC (Area Under Curve)
- Optimalen Threshold finden (Youden-Index)
- Accuracy, Precision, Recall, F1-Score
- Confusion Matrix

In [None]:
# ===========================================================================
# EXPERIMENT 2: CLASSIFICATION PERFORMANCE
# ===========================================================================

print("\n\n" + "="*70)
print("EXPERIMENT 2: KLASSIFIKATIONS-PERFORMANCE")
print("="*70)

# Prepare labels and scores
labels_binary = (df['label'] == 'human').astype(int).values  # 1 = human, 0 = AI
scores = df['perplexity'].values  # higher perplexity -> more likely human

# ROC curve
fpr, tpr, thresholds = roc_curve(labels_binary, scores)
roc_auc = auc(fpr, tpr)

# Optimal threshold: the ROC point that maximizes Youden's J = TPR - FPR
youden_index = tpr - fpr
optimal_idx = np.argmax(youden_index)
optimal_threshold = thresholds[optimal_idx]
optimal_tpr = tpr[optimal_idx]
optimal_fpr = fpr[optimal_idx]

# Predictions at the optimal threshold.
# NOTE: the threshold is selected and evaluated on the same scores, so the
# metrics below are in-sample figures.
predictions = (scores >= optimal_threshold).astype(int)

# Metrics (positive class = human)
accuracy = accuracy_score(labels_binary, predictions)
precision = precision_score(labels_binary, predictions, zero_division=0)
recall = recall_score(labels_binary, predictions, zero_division=0)
f1 = f1_score(labels_binary, predictions, zero_division=0)

# Confusion matrix. labels=[0, 1] forces a 2x2 matrix even if only one class
# occurs in the data, so the four-way unpacking below cannot fail.
cm = confusion_matrix(labels_binary, predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Table 3: classification performance
print("\nTABELLE 3: Klassifikations-Performance")
print("-"*70)
print(f"ROC-AUC:           {roc_auc:.3f}")
print(f"Optimal Threshold: {optimal_threshold:.2f}")
print(f"Accuracy:          {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"Precision (Human): {precision:.3f}")
print(f"Recall (Human):    {recall:.3f}")
print(f"F1-Score:          {f1:.3f}")
print("-"*70)
print(f"\nConfusion Matrix:")
print(f"  True Negatives (TN):  {tn:>3}")
print(f"  False Positives (FP): {fp:>3}")
print(f"  False Negatives (FN): {fn:>3}")
print(f"  True Positives (TP):  {tp:>3}")
print("-"*70)

# Save the results (NumPy scalars are converted to plain Python numbers)
classification_results = {
    'roc_auc': float(roc_auc),
    'optimal_threshold': float(optimal_threshold),
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1_score': float(f1),
    'confusion_matrix': {'tn': int(tn), 'fp': int(fp), 'fn': int(fn), 'tp': int(tp)}
}

with open(RESULTS_DIR / "experiment2_classification.json", 'w', encoding="utf-8") as f:
    json.dump(classification_results, f, indent=2)

print(f"\n✓ Ergebnisse gespeichert: {RESULTS_DIR / 'experiment2_classification.json'}")

In [None]:
# Plot the ROC curve
print("\nErstelle ROC-Kurve...")

fig, ax = plt.subplots(figsize=(8, 8))

# ROC curve
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')

# Diagonal (random classifier)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')

# Operating point at the optimal threshold
ax.plot(optimal_fpr, optimal_tpr, 'ro', markersize=10,
        label=f'Optimal (Threshold={optimal_threshold:.2f})')

# Axis limits, labels and title
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve: Perplexity as AI-Text Detector', fontsize=14, fontweight='bold')
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(FIGURES_DIR / "experiment2_roc_curve.png", dpi=300, bbox_inches='tight')
print(f"✓ Plot gespeichert: {FIGURES_DIR / 'experiment2_roc_curve.png'}")
plt.show()

---

## 6. Experiment 3: Error Analysis

Analysiert welche Texte falsch klassifiziert werden:
- False Positives (AI als Human klassifiziert)
- False Negatives (Human als AI klassifiziert)
- Beispiele anzeigen
- Muster nach Topics analysieren

In [None]:
# ===========================================================================
# EXPERIMENT 3: ERROR ANALYSIS
# ===========================================================================

def _print_error_examples(errors, title, hint, none_message, limit=3):
    """Print a titled section with up to ``limit`` rows of ``errors``."""
    print("\n" + "-"*70)
    print(title)
    print("-"*70)
    if len(errors) == 0:
        print(none_message)
        return
    for rank, (_, row) in enumerate(errors.head(limit).iterrows(), 1):
        print(f"\n{rank}. ID: {row['id']} | PPL: {row['perplexity']:.2f} | Topic: {row['topic']}")
        print(f"   Text: {row['text'][:150]}...")
        print(f"   {hint}")


def _error_summary(errors, limit=5):
    """Return the count and the first ``limit`` rows of ``errors`` for export."""
    columns = ['id', 'perplexity', 'topic', 'text']
    return {
        'count': len(errors),
        'examples': errors.head(limit)[columns].to_dict('records'),
    }


print("\n\n" + "="*70)
print("EXPERIMENT 3: ERROR ANALYSIS")
print("="*70)

# Attach the predictions from Experiment 2 and mark the misclassifications
df['prediction'] = predictions
df['predicted_label'] = df['prediction'].map({1: 'human', 0: 'ai'})
df['correct'] = df['label'] == df['predicted_label']

# False positives: AI texts classified as human, highest perplexity first
false_positives = df[(df['label'] == 'ai') & (df['predicted_label'] == 'human')].copy()
false_positives = false_positives.sort_values('perplexity', ascending=False)

# False negatives: human texts classified as AI, lowest perplexity first
false_negatives = df[(df['label'] == 'human') & (df['predicted_label'] == 'ai')].copy()
false_negatives = false_negatives.sort_values('perplexity', ascending=True)

n_total = len(df)
n_correct = int(df['correct'].sum())
accuracy_pct = float(n_correct / n_total * 100)

print(f"\nFehleranalyse:")
print(f"  - Korrekt klassifiziert: {n_correct}/{n_total} ({accuracy_pct:.1f}%)")
print(f"  - False Positives (AI→Human): {len(false_positives)}")
print(f"  - False Negatives (Human→AI): {len(false_negatives)}")

# Show examples
_print_error_examples(
    false_positives,
    "FALSE POSITIVES: AI-Texte fälschlich als Human klassifiziert",
    "→ Warum hohe PPL? Ungewöhnliche Formulierung für AI?",
    "\n✓ Keine False Positives!",
)
_print_error_examples(
    false_negatives,
    "FALSE NEGATIVES: Human-Texte fälschlich als AI klassifiziert",
    "→ Warum niedrige PPL? Sehr generischer/Standard-Text?",
    "\n✓ Keine False Negatives!",
)

# Pattern analysis: misclassifications per topic
print("\n" + "-"*70)
print("MUSTER-ANALYSE")
print("-"*70)

for error_name, errors in (
    ("False Positives", false_positives),
    ("False Negatives", false_negatives),
):
    if len(errors) > 0:
        print(f"\n{error_name} nach Topic:")
        print(errors['topic'].value_counts().to_string())

# Save the results
error_analysis = {
    'total_samples': n_total,
    'correct': n_correct,
    'accuracy_pct': accuracy_pct,
    'false_positives': _error_summary(false_positives),
    'false_negatives': _error_summary(false_negatives),
}

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly instead of relying on the platform default.
with open(RESULTS_DIR / "experiment3_error_analysis.json", 'w', encoding="utf-8") as f:
    json.dump(error_analysis, f, indent=2, ensure_ascii=False)

print(f"\n✓ Ergebnisse gespeichert: {RESULTS_DIR / 'experiment3_error_analysis.json'}")

---

## 7. Finale Zusammenfassung

Übersicht über alle Ergebnisse

In [None]:
# ===========================================================================
# SUMMARY
# ===========================================================================
# Prints an overview of the values computed by the three experiment cells and
# lists the files they wrote.

print("\n\n" + "="*70)
print("FINALE ZUSAMMENFASSUNG")
print("="*70)

print(f"\n📊 DATEN:")
print(f"  - Gesamt: {len(df)} Texte ({len(df_human)} Human + {len(df_ai)} AI)")
print(f"  - Wortanzahl: Human {df_human['word_count'].mean():.1f}±{df_human['word_count'].std():.1f}, AI {df_ai['word_count'].mean():.1f}±{df_ai['word_count'].std():.1f}")

print(f"\n📈 EXPERIMENT 1: Statistischer Vergleich")
print(f"  - Human PPL: {stats_results['human']['mean']:.2f} ± {stats_results['human']['std']:.2f}")
print(f"  - AI PPL: {stats_results['ai']['mean']:.2f} ± {stats_results['ai']['std']:.2f}")
print(f"  - t-Test: t={t_statistic:.3f}, p={p_value:.4f} → {'Signifikant' if is_significant else 'Nicht signifikant'}")
print(f"  - Cohen's d: {effect_size:.3f} ({effect_interp})")

print(f"\n🎯 EXPERIMENT 2: Klassifikation")
print(f"  - ROC-AUC: {roc_auc:.3f}")
print(f"  - Accuracy: {accuracy*100:.1f}%")
print(f"  - Precision: {precision:.3f}, Recall: {recall:.3f}")
print(f"  - Optimaler Threshold: {optimal_threshold:.2f}")

print(f"\n🔍 EXPERIMENT 3: Error Analysis")
print(f"  - Korrekt: {df['correct'].sum()}/{len(df)} ({df['correct'].sum()/len(df)*100:.1f}%)")
print(f"  - False Positives: {len(false_positives)}")
print(f"  - False Negatives: {len(false_negatives)}")

# Paths are joined with the Path operator, matching how the experiment cells
# build and report the same file names.
print(f"\n📁 ERGEBNISSE:")
for result_name in (
    "experiment1_statistics.json",
    "experiment2_classification.json",
    "experiment3_error_analysis.json",
):
    print(f"  - {RESULTS_DIR / result_name}")

print(f"\n📊 PLOTS:")
for figure_name in ("experiment1_boxplot.png", "experiment2_roc_curve.png"):
    print(f"  - {FIGURES_DIR / figure_name}")

print(f"\n💾 DATEN:")
print(f"  - {COMBINED_CSV_FILE} (mit allen Perplexity-Werten)")

print("\n" + "="*70)
print("✅ ALLE EXPERIMENTE ABGESCHLOSSEN!")
print("="*70)
print("\nNächste Schritte:")
print(f"  1. Prüfe die Plots in {FIGURES_DIR}/")
print(f"  2. Öffne die JSON-Ergebnisse in {RESULTS_DIR}/")
print("  3. Nutze die Tabellen & Plots für euer Paper")
print("  4. Schreibt das Paper im IEEE-Format")
print("\n🎓 Viel Erfolg beim Paper-Schreiben!")