# VDEH Language Detection Pipeline

**Fokus:** Professionelle Sprach-Erkennung mit langdetect

## 🎯 Ziel
- Sprach-Erkennung für alle VDEH Titel
- Confidence-Scoring und Qualitätsfilter
- Sprachname-Mapping und Kategorisierung
- Export der sprach-angereicherten Daten

## 📚 Input/Output
- **Input**: `data/vdeh/processed/02_preprocessed_data.parquet`
- **Output**: `data/vdeh/processed/03_language_detected_data.parquet`

In [None]:
# 🛠️ SETUP: Initialize notebook environment
import sys
from pathlib import Path

# Add src to path (temporary until utils is imported)
project_root = Path.cwd()
while not (project_root / 'config.yaml').exists() and project_root.parent != project_root:
    project_root = project_root.parent
sys.path.insert(0, str(project_root / 'src'))

# Now use the utility function
from utils.notebook_utils import setup_notebook

project_root, config = setup_notebook()
print(f"✅ Project root: {project_root}")
print(f"✅ Project: {config.get('project.name')} v{config.get('project.version')}")

In [2]:
# 📂 DATEN AUS VORHERIGER STUFE LADEN
import pandas as pd
import json

processed_dir = config.project_root / config.get('paths.data.vdeh.processed')
input_path = processed_dir / '02_preprocessed_data.parquet'
metadata_path = processed_dir / '02_metadata.json'

if not input_path.exists():
    raise FileNotFoundError(f"Input-Datei nicht gefunden: {input_path}\n"
                          "Bitte führen Sie zuerst 02_vdeh_data_preprocessing.ipynb aus.")

# Daten laden
df_vdeh = pd.read_parquet(input_path)

# Vorherige Metadaten laden
with open(metadata_path, 'r') as f:
    prev_metadata = json.load(f)

print(f"📂 Daten geladen aus: {input_path}")
print(f"📊 Records: {len(df_vdeh):,}")
print(f"📋 Spalten: {list(df_vdeh.columns)}")
print(f"📅 Vorherige Verarbeitung: {prev_metadata['processing_date']}")

📂 Daten geladen aus: /media/sz/Data/Bibo/analysis/data/vdeh/processed/02_preprocessed_data.parquet
📊 Records: 58,760
📋 Spalten: ['id', 'title', 'authors', 'authors_affiliation', 'year', 'publisher', 'isbn', 'issn', 'authors_str', 'num_authors', 'authors_affiliation_str', 'num_authors_affiliation', 'isbn_valid', 'isbn_status', 'issn_valid', 'issn_status']
📅 Vorherige Verarbeitung: 2025-11-06T12:51:10.287634


In [None]:
# 🌍 SPRACH-ERKENNUNG SETUP
import pandas as pd
import numpy as np
import json

# Import language detection and progress bar
from langdetect import detect_langs, LangDetectException
from tqdm import tqdm

print("🌍 === SPRACH-ERKENNUNG SETUP ===")
print("✅ langdetect imported")
print("✅ tqdm imported für Progress-Anzeige")
print("✅ Sprach-Erkennung Setup abgeschlossen")

In [4]:
# 🎯 TITEL FILTERN 
print("🔍 Filtere Titel für Spracherkennung...")

# Minimum Textlänge aus Config
min_length = config.get('data_processing.language_detection.min_text_length', 10)

# Leere oder zu kurze Titel filtern
mask = df_vdeh['title'].notna() & (df_vdeh['title'].str.len() >= min_length)
titles_to_process = df_vdeh[mask]

print(f"📊 Ursprüngliche Anzahl Titel: {len(df_vdeh):,}")
print(f"📊 Titel für Spracherkennung: {len(titles_to_process):,}")
print(f"⏭️  Überspringe {len(df_vdeh) - len(titles_to_process):,} leere/zu kurze Titel")

🔍 Filtere Titel für Spracherkennung...
📊 Ursprüngliche Anzahl Titel: 58,760
📊 Titel für Spracherkennung: 40,544
⏭️  Überspringe 18,216 leere/zu kurze Titel


In [5]:
# 🎯 SPRACH-ERKENNUNG FUNKTION
def detect_language_professional(text):
    """
    Professionelle Sprach-Erkennung mit langdetect (konfigurationsbasiert)
    
    Returns:
        tuple: (language_code, confidence, full_language_name)
    """
    min_length = config.get('data_processing.language_detection.min_text_length', 10)
    
    if not text or pd.isna(text) or len(str(text).strip()) < min_length:
        return 'unknown', 0.0, 'Unknown'
    
    try:
        # Text bereinigen
        clean_text = str(text).encode('utf-8', errors='ignore').decode('utf-8').strip()
        
        if len(clean_text) < min_length:
            return 'unknown', 0.0, 'Unknown'
        
        # Sprach-Erkennung
        lang_probs = detect_langs(clean_text)
        best_match = lang_probs[0]
        
        lang_code = best_match.lang
        confidence = round(best_match.prob, 3)
        
        # Standard-Mapping für häufige Sprachen
        supported_langs = {
            'de': 'German',
            'en': 'English', 
            'fr': 'French',
            'es': 'Spanish',
            'it': 'Italian',
            'nl': 'Dutch',
            'pl': 'Polish',
            'ru': 'Russian',
            'ja': 'Japanese',
            'zh': 'Chinese',
            'uk': 'Ukrainian',
            'cs': 'Czech',
            'hu': 'Hungarian',
            'da': 'Danish',
            'fi': 'Finnish',
            'no': 'Norwegian',
            'sv': 'Swedish'  
        }
        
        # Ergänze durch Konfiguration
        config_langs = config.get('data_processing.language_detection.supported_languages', {})
        if isinstance(config_langs, dict):
            supported_langs.update(config_langs)
        elif isinstance(config_langs, list):
            for lang_item in config_langs:
                if isinstance(lang_item, dict):
                    supported_langs.update(lang_item)
        
        # Finde den vollständigen Namen
        full_name = supported_langs.get(lang_code, lang_code.upper())
        
        return lang_code, confidence, full_name
        
    except (LangDetectException, Exception):
        return 'unknown', 0.0, 'Unknown'

print("✅ Sprach-Erkennungsfunktion definiert")
print(f"📏 Min. Textlänge: {config.get('data_processing.language_detection.min_text_length')}")

# Test der Funktion
test_titles = ["Das ist ein deutscher Titel", "This is an English title", "Ceci est un titre français"]
print(f"\n🧪 === FUNKTIONSTEST ===")
for title in test_titles:
    lang_code, confidence, lang_name = detect_language_professional(title)
    print(f"   '{title[:30]}...' → {lang_code} ({confidence:.3f}) - {lang_name}")

✅ Sprach-Erkennungsfunktion definiert
📏 Min. Textlänge: 10

🧪 === FUNKTIONSTEST ===
   'Das ist ein deutscher Titel...' → de (1.000) - German
   'This is an English title...' → en (1.000) - English
   'Ceci est un titre français...' → fr (1.000) - French


In [6]:
# 🌍 SPRACHERKENNUNG DURCHFÜHREN
print("🌍 Führe Spracherkennung durch...")

# Arrays für Ergebnisse initialisieren
language_codes = []
confidence_scores = []
language_names = []

# Progress Bar
with tqdm(titles_to_process['title'], desc="🌍 Sprach-Erkennung") as pbar:
    for title in pbar:
        lang_code, confidence, lang_name = detect_language_professional(title)
        language_codes.append(lang_code)
        confidence_scores.append(confidence)
        language_names.append(lang_name)

print(f"\n✅ Spracherkennung für {len(titles_to_process):,} Titel abgeschlossen")

🌍 Führe Spracherkennung durch...


🌍 Sprach-Erkennung: 100%|██████████| 40544/40544 [01:31<00:00, 445.26it/s]


✅ Spracherkennung für 40,544 Titel abgeschlossen





In [7]:
# 📊 SPRACH-DATEN ZUM DATAFRAME HINZUFÜGEN
print("📊 === SPRACH-DATEN INTEGRATION ===")

# Neue Spalten initialisieren
df_vdeh['lang_code'] = 'unknown'
df_vdeh['lang_confidence'] = 0.0
df_vdeh['lang_name'] = 'Unknown'

# Ergebnisse einfügen (nur für verarbeitete Titel)
df_vdeh.loc[titles_to_process.index, 'lang_code'] = language_codes
df_vdeh.loc[titles_to_process.index, 'lang_confidence'] = confidence_scores
df_vdeh.loc[titles_to_process.index, 'lang_name'] = language_names

print(f"✅ Sprach-Spalten hinzugefügt")
print(f"📊 DataFrame: {len(df_vdeh.columns)} Spalten, {len(df_vdeh):,} Zeilen")

# Qualitäts-Statistiken
min_confidence = config.get('analysis.quality_filters.min_confidence_score', 0.3)
high_confidence_count = sum(1 for c in confidence_scores if c >= min_confidence)
print(f"🎯 Hohe Konfidenz (>={min_confidence}): {high_confidence_count:,} Titel")

# Sprach-Verteilung
lang_dist = pd.Series(language_names).value_counts().head(10)
print(f"\n🌍 Top 10 Sprachen:")
for lang, count in lang_dist.items():
    pct = count/len(language_names)*100 if len(language_names) > 0 else 0
    print(f"   {lang:15}: {count:6,} ({pct:5.1f}%)")

📊 === SPRACH-DATEN INTEGRATION ===
✅ Sprach-Spalten hinzugefügt
📊 DataFrame: 19 Spalten, 58,760 Zeilen
🎯 Hohe Konfidenz (>=0.3): 40,526 Titel

🌍 Top 10 Sprachen:
   German         : 25,184 ( 62.1%)
   English        : 10,916 ( 26.9%)
   French         :    639 (  1.6%)
   Italian        :    533 (  1.3%)
   CA             :    319 (  0.8%)
   SL             :    303 (  0.7%)
   Dutch          :    279 (  0.7%)
   AF             :    270 (  0.7%)
   RO             :    267 (  0.7%)
   ET             :    204 (  0.5%)


In [8]:
# 💾 LANGUAGE-DETECTED DATEN EXPORTIEREN
output_path = processed_dir / '03_language_detected_data.parquet'
df_vdeh.to_parquet(output_path, index=False)

print(f"💾 === LANGUAGE DETECTION ABGESCHLOSSEN ===")
print(f"✅ Language-detected Daten exportiert: {output_path}")
print(f"📊 Records: {len(df_vdeh):,}")
print(f"📋 Spalten: {len(df_vdeh.columns)} (inkl. {len([c for c in df_vdeh.columns if 'lang' in c])} Sprach-Spalten)")
print(f"💾 Dateigröße: {df_vdeh.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Detaillierte Sprach-Statistiken für Metadaten
lang_stats = {
    'total_titles_analyzed': int(len(titles_to_process)),  # titles_to_process statt all_titles
    'high_confidence_count': int(high_confidence_count),
    'confidence_threshold': float(min_confidence),
    'top_languages': {k: int(v) for k,v in dict(lang_dist.head(5)).items()},
    'confidence_distribution': {
        'mean': float(np.mean(confidence_scores)),
        'median': float(np.median(confidence_scores)),
        'std': float(np.std(confidence_scores))
    }
}

💾 === LANGUAGE DETECTION ABGESCHLOSSEN ===
✅ Language-detected Daten exportiert: /media/sz/Data/Bibo/analysis/data/vdeh/processed/03_language_detected_data.parquet
📊 Records: 58,760
📋 Spalten: 19 (inkl. 3 Sprach-Spalten)
💾 Dateigröße: 52.8 MB


In [9]:
# Statistische Auswertung erstellen
from datetime import datetime

lang_stats = {
    'processing_date': datetime.now().isoformat(),
    'num_total': int(len(df_vdeh)),
    'num_processed': int(df_vdeh['lang_code'].notna().sum()),
    'num_errors': int(df_vdeh.get('detect_error', pd.Series(dtype='bool')).notna().sum()),
    'language_analysis': {
        'total_titles_analyzed': int(len(titles_to_process)),
        'high_confidence_count': int(sum(df_vdeh['lang_confidence'] >= config.get('data_processing.language_detection.confidence_threshold', 0.5)))
    },
    'lang_distribution': {
        str(lang): int(count.item()) if hasattr(count, 'item') else int(count)  # Konvertiere NumPy Typen
        for lang, count in df_vdeh['lang_code'].value_counts().to_dict().items()
        if not pd.isna(lang)
    }
}

# Metadaten speichern
metadata_path = processed_dir / '03_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(lang_stats, f, indent=2)

print(f"📄 Metadaten gespeichert: {metadata_path}")
print(f"\n➡️  Nächster Schritt: 04_vdeh_quality_analysis.ipynb")

📄 Metadaten gespeichert: /media/sz/Data/Bibo/analysis/data/vdeh/processed/03_metadata.json

➡️  Nächster Schritt: 04_vdeh_quality_analysis.ipynb
