# VDEH Data Enrichment - Library of Congress (LoC)

**Fokus:** Datenanreicherung englischsprachiger Literatur über Library of Congress (LoC) API

## 🎯 Ziel
- Anreicherung englischsprachiger Datensätze via LoC API (ISBN/ISSN)
- Ergänzung zu DNB-Daten für internationale Literatur
- Fokus auf Records mit `detected_language = 'en'`
- Validierung und Qualitätsverbesserung

## 📚 Input/Output
- **Input**: `data/vdeh/processed/03_language_detected_data.parquet`
- **Output**: `data/vdeh/processed/04b_loc_enriched_data.parquet`

## 🔗 API
- **LoC SRU API**: https://www.loc.gov/z3950/
- **Endpoint**: http://lx2.loc.gov:210/lcdb
- **Abfrage**: ISBN/ISSN basierte Suche, Titel/Autor


In [1]:
# 🛠️ SETUP UND DATEN LADEN
import sys
from pathlib import Path
import time
import pandas as pd
import json

# Locate the project checkout: the closest directory at or above the working
# directory that contains config.yaml. If none does, the topmost directory is
# used. Its src/ folder goes first on the import path so the project modules
# below can be imported.
_start_dir = Path.cwd()
_search_dirs = [_start_dir, *_start_dir.parents]
project_root = next(
    (folder for folder in _search_dirs if (folder / 'config.yaml').exists()),
    _search_dirs[-1],
)
sys.path.insert(0, str(project_root / 'src'))

from utils.notebook_utils import setup_notebook

# setup_notebook() supplies the authoritative project root and the config object
project_root, config = setup_notebook()
print(f"‚úÖ Project root: {project_root}")
_project_name = config.get('project.name')
_project_version = config.get('project.version')
print(f"‚úÖ Project: {_project_name} v{_project_version}")

# Load the LoC client functions from src/loc_api.py
from loc_api import (
    LOC_SRU_BASE,
    query_loc_by_isbn,
    query_loc_by_issn,
    query_loc_by_title_author,
)
print("‚úÖ LoC API Funktionen geladen")

2025-12-25 21:57:58 - utils.notebook_utils - INFO - Searching for project root...


2025-12-25 21:57:58 - utils.notebook_utils - INFO - Project root found: /media/sz/Data/Bibo/analysis


2025-12-25 21:57:58 - utils.notebook_utils - INFO - Loading configuration...


2025-12-25 21:57:58 - config_loader - INFO - Configuration loaded from /media/sz/Data/Bibo/analysis/config.yaml


2025-12-25 21:57:58 - utils.notebook_utils - INFO - Configuration loaded successfully: Dual-Source Bibliothek Bestandsvergleich


‚úÖ Project root: /media/sz/Data/Bibo/analysis
‚úÖ Project: Dual-Source Bibliothek Bestandsvergleich v2.2.0
‚úÖ LoC API Funktionen geladen


In [2]:
# Load the output of the previous pipeline stage (language detection)
processed_dir = config.project_root / config.get('paths.data.vdeh.processed')
input_path = processed_dir / '03_language_detected_data.parquet'
metadata_path = processed_dir / '03_metadata.json'

# Fail early with a pointer to the notebook that produces the input file
if not input_path.exists():
    raise FileNotFoundError(f"Input-Datei nicht gefunden: {input_path}\n"
                          "Bitte f√ºhren Sie zuerst 03_vdeh_language_detection.ipynb aus.")

# Load the records
df_vdeh = pd.read_parquet(input_path)

# Load the metadata of the previous stage. The encoding is given explicitly:
# this pipeline writes its metadata JSON as UTF-8 with non-ASCII characters
# kept, so relying on the locale default could fail on non-UTF-8 systems.
with open(metadata_path, 'r', encoding='utf-8') as f:
    prev_metadata = json.load(f)

print(f"üìÇ Daten geladen aus: {input_path}")
print(f"üìä Records: {len(df_vdeh):,}")
print(f"üìã Spalten: {list(df_vdeh.columns)}")
print(f"üíæ Memory: {df_vdeh.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Language distribution: the ten most frequent detected languages
if 'detected_language' in df_vdeh.columns:
    lang_dist = df_vdeh['detected_language'].value_counts()
    print(f"\nüìä Sprachverteilung:")
    for lang, count in lang_dist.head(10).items():
        print(f"   {lang}: {count:,} ({count/len(df_vdeh)*100:.1f}%)")

üìÇ Daten geladen aus: /media/sz/Data/Bibo/analysis/data/vdeh/processed/03_language_detected_data.parquet
üìä Records: 58,305
üìã Spalten: ['id', 'title', 'authors', 'authors_affiliation', 'year', 'publisher', 'isbn', 'issn', 'pages', 'language', 'authors_str', 'num_authors', 'authors_affiliation_str', 'num_authors_affiliation', 'isbn_valid', 'isbn_status', 'issn_valid', 'issn_status', 'detected_language', 'detected_language_confidence', 'detected_language_name']


üíæ Memory: 54.5 MB

üìä Sprachverteilung:
   de: 32,543 (55.8%)
   en: 16,516 (28.3%)
   unknown: 1,446 (2.5%)
   fr: 929 (1.6%)
   it: 773 (1.3%)
   sl: 654 (1.1%)
   ro: 625 (1.1%)
   af: 520 (0.9%)
   nl: 497 (0.9%)
   ca: 493 (0.8%)


In [3]:
# Select the records that are worth sending to the Library of Congress
print("üîç === KANDIDATEN-IDENTIFIKATION (ENGLISCHSPRACHIGE LITERATUR) ===\n")

# Only records whose detected language is English are considered
_is_english = df_vdeh['detected_language'] == 'en'
df_english = df_vdeh[_is_english].copy()

_english_share = len(df_english) / len(df_vdeh) * 100
print(f"üìö Englischsprachige Records: {len(df_english):,} ({_english_share:.1f}%)")

# 1. ISBN-based candidates: every record that has an ISBN
if 'isbn' in df_english.columns:
    has_isbn = df_english['isbn'].notna()
    isbn_candidates = df_english.loc[has_isbn].copy()

    print("\nüìã Kriterium 1: ISBN vorhanden")
    print(f"   ISBN-Kandidaten: {len(isbn_candidates):,}")

# 2. ISSN-based candidates: only records missing title, authors or year
if 'issn' in df_english.columns:
    has_issn = df_english['issn'].notna()
    missing_title = df_english['title'].isna()
    _authors_col = df_english['authors_str']
    missing_authors = _authors_col.isna() | (_authors_col == '')
    missing_year = df_english['year'].isna()

    _is_incomplete = missing_title | missing_authors | missing_year
    issn_candidates = df_english.loc[has_issn & _is_incomplete].copy()

    print("\nüìã Kriterium 2: ISSN vorhanden (nur unvollst√§ndig)")
    print(f"   ISSN-Kandidaten: {len(issn_candidates):,}")

# 3. Title/author candidates: no ISBN, but a title and a non-empty author string
_isbn_absent = df_english['isbn'].isna()
_title_known = df_english['title'].notna()
_authors_known = df_english['authors_str'].notna() & (df_english['authors_str'] != '')
no_isbn_but_searchable = df_english.loc[_isbn_absent & _title_known & _authors_known].copy()

print("\nüìã Kriterium 3: Ohne ISBN aber mit Titel + Autoren")
print(f"   Kandidaten: {len(no_isbn_but_searchable):,}")

# Final candidate list: anything with an ISBN, or, without one, a title plus
# either authors or a year
_year_known = df_english['year'].notna()
_searchable_without_isbn = _isbn_absent & _title_known & (_authors_known | _year_known)
final_candidates = df_english.loc[~_isbn_absent | _searchable_without_isbn].copy()

print(f"\nüéØ Finale LoC-Anreicherungs-Kandidaten: {len(final_candidates):,}")
print(f"   Mit ISBN: {final_candidates['isbn'].notna().sum():,}")
print(f"   Mit ISSN: {final_candidates['issn'].notna().sum():,}")
_with_title_and_authors = (
    final_candidates['title'].notna()
    & final_candidates['authors_str'].notna()
    & (final_candidates['authors_str'] != '')
)
print(f"   Mit Titel+Autoren: {_with_title_and_authors.sum():,}")


üîç === KANDIDATEN-IDENTIFIKATION (ENGLISCHSPRACHIGE LITERATUR) ===

üìö Englischsprachige Records: 16,516 (28.3%)

üìã Kriterium 1: ISBN vorhanden
   ISBN-Kandidaten: 3,695

üìã Kriterium 2: ISSN vorhanden (nur unvollst√§ndig)
   ISSN-Kandidaten: 224

üìã Kriterium 3: Ohne ISBN aber mit Titel + Autoren
   Kandidaten: 2,112

üéØ Finale LoC-Anreicherungs-Kandidaten: 9,900
   Mit ISBN: 3,695
   Mit ISSN: 103
   Mit Titel+Autoren: 4,770


In [4]:
# Report which LoC client is loaded and how it is configured
print("üåê === LIBRARY OF CONGRESS API STATUS ===\n")

print("‚úÖ LoC API Funktionen aus src/loc_api.py geladen")
_status_details = (
    f"   Base URL: {LOC_SRU_BASE}",
    "   Schema: MARC21-xml",
    "   Verf√ºgbare Funktionen:",
    "     - query_loc_by_isbn(isbn, max_records=1)",
    "     - query_loc_by_issn(issn, max_records=1)",
    "     - query_loc_by_title_author(title, author=None, max_records=1)",
)
for _detail in _status_details:
    print(_detail)


üåê === LIBRARY OF CONGRESS API STATUS ===

‚úÖ LoC API Funktionen aus src/loc_api.py geladen
   Base URL: http://lx2.loc.gov:210/lcdb
   Schema: MARC21-xml
   Verf√ºgbare Funktionen:
     - query_loc_by_isbn(isbn, max_records=1)
     - query_loc_by_issn(issn, max_records=1)
     - query_loc_by_title_author(title, author=None, max_records=1)


In [5]:
# LoC lookup by identifier: ISBN where available, ISSN otherwise.
# Results are cached in a parquet file so an interrupted run can be resumed.
print("üöÄ === LOC DATENABFRAGE (ISBN/ISSN) ===\n")

# Configuration
RATE_LIMIT_DELAY = 10.0  # seconds between requests (raised because of connection problems)
SAVE_INTERVAL = 50  # write a checkpoint every N queries
LOC_DATA_FILE = processed_dir / 'loc_raw_data.parquet'


def _loc_id_result_row(query_row, loc_result):
    """Flatten one identifier lookup into a cache row.

    A missing or empty response counts as "not found", so the ``loc_found``
    flag, the stored fields and the run statistics always agree.
    """
    found = bool(loc_result)
    record = loc_result if found else {}
    return {
        'vdeh_id': query_row['vdeh_id'],
        'query_type': query_row['query_type'],
        'query_value': query_row['query_value'],
        'loc_found': found,
        'loc_title': record.get('title'),
        # 'authors' may be absent or None; both are treated as an empty list
        'loc_authors': ', '.join(record.get('authors') or []) if found else None,
        'loc_year': record.get('year'),
        'loc_publisher': record.get('publisher'),
        'loc_isbn': record.get('isbn'),
        'loc_issn': record.get('issn'),
        'loc_pages': record.get('pages'),
    }


def _store_loc_id_rows(existing, rows, target):
    """Append freshly fetched rows to the cache frame and rewrite the parquet file."""
    fresh = pd.DataFrame(rows)
    # With no earlier rows the fresh frame already has the full column layout
    combined = fresh if existing.empty else pd.concat([existing, fresh], ignore_index=True)
    combined.to_parquet(target, index=False)
    return combined


print(f"‚öôÔ∏è  Konfiguration:")
print(f"   Rate Limit: {RATE_LIMIT_DELAY}s pro Anfrage")
print(f"   Save Interval: Alle {SAVE_INTERVAL} Queries")
print(f"   Output: {LOC_DATA_FILE.name}")

# Load the existing cache, if any
if LOC_DATA_FILE.exists():
    print(f"\nüìÇ Lade vorhandene LoC-Daten...")
    loc_data_df = pd.read_parquet(LOC_DATA_FILE)
    print(f"   Bereits abgefragt: {len(loc_data_df):,}")
    print(f"   Davon erfolgreich: {(loc_data_df['loc_found'] == True).sum():,}")
else:
    print(f"\nüìÇ Keine vorhandenen LoC-Daten gefunden - starte neue Abfrage")
    loc_data_df = pd.DataFrame(columns=[
        'vdeh_id', 'query_type', 'query_value',
        'loc_found', 'loc_title', 'loc_authors', 'loc_year', 'loc_publisher',
        'loc_isbn', 'loc_issn', 'loc_pages'
    ])

# Build the query list from the English-language candidates
print(f"\nüìã Extrahiere ISBN/ISSN aus {len(final_candidates):,} Kandidaten...")

queries_isbn = final_candidates[final_candidates['isbn'].notna()][['id', 'isbn']].copy()
queries_isbn.columns = ['vdeh_id', 'query_value']
queries_isbn['query_type'] = 'ISBN'

# The ISSN is only used when the record has no ISBN
queries_issn = final_candidates[
    final_candidates['isbn'].isna() & final_candidates['issn'].notna()
][['id', 'issn']].copy()
queries_issn.columns = ['vdeh_id', 'query_value']
queries_issn['query_type'] = 'ISSN'

all_queries = pd.concat([queries_isbn, queries_issn], ignore_index=True)

print(f"   ISBN-Queries: {len(queries_isbn):,}")
print(f"   ISSN-Queries: {len(queries_issn):,}")
print(f"   Gesamt: {len(all_queries):,}")

# Skip what is already cached. The key is the (record, identifier) pair, not
# the identifier alone: results are merged back per vdeh_id later, so a record
# sharing an ISBN/ISSN with an already cached record still needs its own row.
if len(loc_data_df) > 0:
    already_queried = set(zip(loc_data_df['vdeh_id'], loc_data_df['query_value']))
    is_new = pd.Series(
        [
            pair not in already_queried
            for pair in zip(all_queries['vdeh_id'], all_queries['query_value'])
        ],
        index=all_queries.index,
        dtype=bool,
    )
    new_queries = all_queries.loc[is_new].copy()

    print(f"\nüîç Abgleich mit vorhandenen Daten:")
    print(f"   Bereits vorhanden: {len(all_queries) - len(new_queries):,}")
    print(f"   Neu abzufragen: {len(new_queries):,}")
else:
    new_queries = all_queries
    print(f"\nüîç Alle {len(new_queries):,} Queries sind neu")

# Only hit the API when something is left to query
if len(new_queries) > 0:
    print(f"\nüîÑ Starte LoC-Abfrage f√ºr {len(new_queries):,} neue Queries...\n")

    from tqdm.auto import tqdm

    lookup_by_type = {'ISBN': query_loc_by_isbn, 'ISSN': query_loc_by_issn}
    results = []
    stats = {'found': 0, 'not_found': 0}
    query_count = 0

    try:
        for _, row in tqdm(new_queries.iterrows(), total=len(new_queries), desc="üîç LoC API", unit="queries"):
            # API request; an unknown query type yields no result
            lookup = lookup_by_type.get(row['query_type'])
            loc_result = lookup(row['query_value']) if lookup is not None else None

            result_row = _loc_id_result_row(row, loc_result)
            results.append(result_row)
            stats['found' if result_row['loc_found'] else 'not_found'] += 1
            query_count += 1

            # Periodic checkpoint
            if query_count % SAVE_INTERVAL == 0:
                loc_data_df = _store_loc_id_rows(loc_data_df, results, LOC_DATA_FILE)
                results = []
                print(f"üíæ Zwischenspeicherung: {query_count}/{len(new_queries)} Queries abgefragt")

            # Rate limiting; no need to wait after the last request
            if query_count < len(new_queries):
                time.sleep(RATE_LIMIT_DELAY)
    finally:
        # Also runs on Ctrl-C or an API error, so rows fetched since the last
        # checkpoint are not lost
        if results:
            loc_data_df = _store_loc_id_rows(loc_data_df, results, LOC_DATA_FILE)
            results = []

    print(f"\nüíæ LoC-Daten gespeichert: {LOC_DATA_FILE.name}")

    # Summary of this run
    print(f"\nüìä === NEUE ABFRAGEN ===")
    print(f"   Neue Queries: {len(new_queries):,}")
    print(f"   ‚úÖ Gefunden: {stats['found']:,} ({stats['found']/len(new_queries)*100:.1f}%)")
    print(f"   ‚ùå Nicht gefunden: {stats['not_found']:,} ({stats['not_found']/len(new_queries)*100:.1f}%)")

else:
    print(f"\n‚úÖ Alle ISBN/ISSN bereits in LoC-Daten vorhanden - keine neuen Abfragen n√∂tig")

# Overall statistics across all runs
print(f"\nüìä === GESAMT LOC-DATEN ===")
print(f"   Total Records: {len(loc_data_df):,}")
print(f"   Erfolgreich: {(loc_data_df['loc_found'] == True).sum():,}")
print(f"   Nicht gefunden: {(loc_data_df['loc_found'] == False).sum():,}")

üöÄ === LOC DATENABFRAGE (ISBN/ISSN) ===

‚öôÔ∏è  Konfiguration:
   Rate Limit: 10.0s pro Anfrage
   Save Interval: Alle 50 Queries
   Output: loc_raw_data.parquet

üìÇ Lade vorhandene LoC-Daten...
   Bereits abgefragt: 3,785
   Davon erfolgreich: 797

üìã Extrahiere ISBN/ISSN aus 9,900 Kandidaten...
   ISBN-Queries: 3,695
   ISSN-Queries: 90
   Gesamt: 3,785

üîç Abgleich mit vorhandenen Daten:
   Bereits vorhanden: 3,785
   Neu abzufragen: 0

‚úÖ Alle ISBN/ISSN bereits in LoC-Daten vorhanden - keine neuen Abfragen n√∂tig

üìä === GESAMT LOC-DATEN ===
   Total Records: 3,785
   Erfolgreich: 797
   Nicht gefunden: 2,988


In [6]:
# LoC lookup by title and author.
# Results are cached in their own parquet file, keyed by VDEH record id.
# Relies on RATE_LIMIT_DELAY and SAVE_INTERVAL from the identifier lookup cell.
print("üîç === LOC TITEL/AUTOR-SUCHE ===\n")

LOC_TITLE_DATA_FILE = processed_dir / 'loc_title_author_data.parquet'
# True: search every English record with title + authors.
# False: only those that have neither ISBN nor ISSN.
ALWAYS_TA_FOR_ALL_WITH_TITLE_AUTHORS = True


def _loc_ta_result_row(query_row, loc_result):
    """Flatten one title/author lookup into a cache row.

    A missing or empty response counts as "not found", so the ``loc_found``
    flag, the stored fields and the run statistics always agree.
    """
    found = bool(loc_result)
    record = loc_result if found else {}
    return {
        'vdeh_id': query_row['vdeh_id'],
        'query_type': query_row['query_type'],
        'title': query_row['title'],
        'author': query_row['author'],
        'loc_found': found,
        'loc_title': record.get('title'),
        # 'authors' may be absent or None; both are treated as an empty list
        'loc_authors': ', '.join(record.get('authors') or []) if found else None,
        'loc_year': record.get('year'),
        'loc_publisher': record.get('publisher'),
        'loc_isbn': record.get('isbn'),
        'loc_issn': record.get('issn'),
        'loc_pages': record.get('pages'),
    }


def _store_loc_ta_rows(existing, rows, target):
    """Append freshly fetched rows to the cache frame and rewrite the parquet file."""
    fresh = pd.DataFrame(rows)
    # With no earlier rows the fresh frame already has the full column layout
    combined = fresh if existing.empty else pd.concat([existing, fresh], ignore_index=True)
    combined.to_parquet(target, index=False)
    return combined


print(f"‚öôÔ∏è  Konfiguration:")
print(f"   Rate Limit: {RATE_LIMIT_DELAY}s pro Anfrage")
print(f"   Save Interval: Alle {SAVE_INTERVAL} Queries")
print(f"   Output: {LOC_TITLE_DATA_FILE.name}")
print(f"   TA f√ºr alle Titel+Autoren: {ALWAYS_TA_FOR_ALL_WITH_TITLE_AUTHORS}")

# Load the existing title/author cache, if any
if LOC_TITLE_DATA_FILE.exists():
    print(f"\nüìÇ Lade vorhandene Titel/Autor-Suchdaten...")
    loc_title_df = pd.read_parquet(LOC_TITLE_DATA_FILE)
    print(f"   Bereits abgefragt: {len(loc_title_df):,}")
    print(f"   Davon erfolgreich: {(loc_title_df['loc_found'] == True).sum():,}")
else:
    print(f"\nüìÇ Keine vorhandenen Titel/Autor-Suchdaten gefunden")
    loc_title_df = pd.DataFrame(columns=[
        'vdeh_id', 'query_type', 'title', 'author',
        'loc_found', 'loc_title', 'loc_authors', 'loc_year', 'loc_publisher',
        'loc_isbn', 'loc_issn', 'loc_pages'
    ])

# Pick the candidates (English records only): a title and a non-empty author
# string are always required; the identifier restriction is optional
is_searchable = (
    df_english['title'].notna()
    & df_english['authors_str'].notna()
    & (df_english['authors_str'] != '')
)
if not ALWAYS_TA_FOR_ALL_WITH_TITLE_AUTHORS:
    is_searchable = is_searchable & df_english['isbn'].isna() & df_english['issn'].isna()
title_author_candidates = df_english.loc[is_searchable].copy()

print(f"\nüìã Titel/Autor-Kandidaten (englisch): {len(title_author_candidates):,}")

# Build the query list
title_queries = title_author_candidates[['id', 'title', 'authors_str']].copy()
title_queries.columns = ['vdeh_id', 'title', 'author']
title_queries['query_type'] = 'TITLE_AUTHOR'

# Skip records that already have a cached answer
if len(loc_title_df) > 0:
    already_queried = set(loc_title_df['vdeh_id'])
    new_title_queries = title_queries[~title_queries['vdeh_id'].isin(already_queried)].copy()

    print(f"\nüîç Abgleich mit vorhandenen Daten:")
    print(f"   Bereits vorhanden: {len(title_queries) - len(new_title_queries):,}")
    print(f"   Neu abzufragen: {len(new_title_queries):,}")
else:
    new_title_queries = title_queries
    print(f"\nüîç Alle {len(new_title_queries):,} Titel/Autor-Queries sind neu")

# Only hit the API when something is left to query
if len(new_title_queries) > 0:
    print(f"\nüîÑ Starte LoC Titel/Autor-Abfrage f√ºr {len(new_title_queries):,} neue Queries...\n")

    from tqdm.auto import tqdm

    results = []
    stats = {'found': 0, 'not_found': 0}
    query_count = 0

    try:
        for _, row in tqdm(new_title_queries.iterrows(), total=len(new_title_queries), desc="üîç LoC Titel/Autor", unit="queries"):
            # API request
            loc_result = query_loc_by_title_author(row['title'], row['author'])

            result_row = _loc_ta_result_row(row, loc_result)
            results.append(result_row)
            stats['found' if result_row['loc_found'] else 'not_found'] += 1
            query_count += 1

            # Periodic checkpoint
            if query_count % SAVE_INTERVAL == 0:
                loc_title_df = _store_loc_ta_rows(loc_title_df, results, LOC_TITLE_DATA_FILE)
                results = []

                current_rate = stats['found'] / query_count * 100
                print(f"üíæ Zwischenstand: {query_count}/{len(new_title_queries)} | Erfolgsrate: {current_rate:.1f}%")

            # Rate limiting; no need to wait after the last request
            if query_count < len(new_title_queries):
                time.sleep(RATE_LIMIT_DELAY)
    finally:
        # Also runs on Ctrl-C or an API error, so rows fetched since the last
        # checkpoint are not lost
        if results:
            loc_title_df = _store_loc_ta_rows(loc_title_df, results, LOC_TITLE_DATA_FILE)
            results = []

    print(f"\nüíæ LoC Titel/Autor-Daten gespeichert: {LOC_TITLE_DATA_FILE.name}")

    print(f"\nüìä === NEUE TITEL/AUTOR-ABFRAGEN ===")
    print(f"   Neue Queries: {len(new_title_queries):,}")
    print(f"   ‚úÖ Gefunden: {stats['found']:,} ({stats['found']/len(new_title_queries)*100:.1f}%)")
    print(f"   ‚ùå Nicht gefunden: {stats['not_found']:,} ({stats['not_found']/len(new_title_queries)*100:.1f}%)")

else:
    print(f"\n‚úÖ Alle Titel/Autor-Kombinationen bereits abgefragt")

# Overall statistics across all runs
print(f"\nüìä === GESAMT TITEL/AUTOR-DATEN ===")
print(f"   Total Records: {len(loc_title_df):,}")
print(f"   Erfolgreich: {(loc_title_df['loc_found'] == True).sum():,}")
print(f"   Nicht gefunden: {(loc_title_df['loc_found'] == False).sum():,}")
if len(loc_title_df) > 0:
    print(f"   üìà Erfolgsrate: {(loc_title_df['loc_found'] == True).sum()/len(loc_title_df)*100:.1f}%")

üîç === LOC TITEL/AUTOR-SUCHE ===

‚öôÔ∏è  Konfiguration:
   Rate Limit: 10.0s pro Anfrage
   Save Interval: Alle 50 Queries
   Output: loc_title_author_data.parquet
   TA f√ºr alle Titel+Autoren: True

üìÇ Lade vorhandene Titel/Autor-Suchdaten...
   Bereits abgefragt: 4,770
   Davon erfolgreich: 853

üìã Titel/Autor-Kandidaten (englisch): 4,770

üîç Abgleich mit vorhandenen Daten:
   Bereits vorhanden: 4,770
   Neu abzufragen: 0

‚úÖ Alle Titel/Autor-Kombinationen bereits abgefragt

üìä === GESAMT TITEL/AUTOR-DATEN ===
   Total Records: 4,770
   Erfolgreich: 853
   Nicht gefunden: 3,917
   üìà Erfolgsrate: 17.9%


In [7]:
# Merge both LoC lookup caches back onto the VDEH records.
# Identifier-based hits keep the plain loc_* column names; title/author hits
# are stored alongside them with a _ta suffix.
print("üîó === LOC-DATEN MERGE ===\n")

# Start from the full VDEH data set (all languages)
df_enriched = df_vdeh.copy()

_loc_core_cols = ['loc_title', 'loc_authors', 'loc_year', 'loc_publisher']
# Taken along only where the cache actually has them; each is checked individually
_loc_optional_cols = ['loc_isbn', 'loc_issn', 'loc_pages']
_ta_probe_cols = ['loc_title_ta', 'loc_authors_ta', 'loc_year_ta', 'loc_publisher_ta']

# 1. Identifier-based (ISBN/ISSN) LoC data
if len(loc_data_df) > 0:
    cols_to_merge = (
        ['vdeh_id', 'query_type']
        + _loc_core_cols
        + [col for col in _loc_optional_cols if col in loc_data_df.columns]
    )

    loc_isbn_issn = (
        loc_data_df.loc[loc_data_df['loc_found'] == True, cols_to_merge]
        # One hit per record: a duplicated vdeh_id would multiply rows in the
        # left merge below. The most recently cached hit wins.
        .drop_duplicates(subset='vdeh_id', keep='last')
        .rename(columns={'query_type': 'loc_query_method'})
    )

    df_enriched = df_enriched.merge(
        loc_isbn_issn,
        left_on='id',
        right_on='vdeh_id',
        how='left',
        suffixes=('', '_dup')
    )
    if 'vdeh_id' in df_enriched.columns:
        df_enriched = df_enriched.drop(columns=['vdeh_id'])

    print(f"‚úÖ ISBN/ISSN-basierte LoC-Daten (ID) gemerged")
    print(f"   ID-Matches: {df_enriched['loc_query_method'].notna().sum():,}")

# 2. Title/author-based LoC data as a separate variant (_ta)
if len(loc_title_df) > 0:
    cols_to_merge_ta = (
        ['vdeh_id']
        + _loc_core_cols
        + [col for col in _loc_optional_cols if col in loc_title_df.columns]
    )

    # Every data column gets the _ta suffix; the join key keeps its name
    rename_map = {col: f'{col}_ta' for col in cols_to_merge_ta if col != 'vdeh_id'}

    loc_title_matches = (
        loc_title_df.loc[loc_title_df['loc_found'] == True, cols_to_merge_ta]
        # Same one-hit-per-record guarantee as for the identifier lookups
        .drop_duplicates(subset='vdeh_id', keep='last')
        .rename(columns=rename_map)
    )

    df_enriched = df_enriched.merge(
        loc_title_matches,
        left_on='id',
        right_on='vdeh_id',
        how='left'
    )
    if 'vdeh_id' in df_enriched.columns:
        df_enriched = df_enriched.drop(columns=['vdeh_id'])

    print(f"‚úÖ Titel/Autor-basierte LoC-Daten (TA) gemerged")
    print(f"   TA-Matches: {df_enriched[_ta_probe_cols].notna().any(axis=1).sum():,}")

# Summary; either variant may be missing when its cache was empty
_id_matches = df_enriched['loc_query_method'].notna().sum() if 'loc_query_method' in df_enriched.columns else 0
_ta_matches = df_enriched[_ta_probe_cols].notna().any(axis=1).sum() if 'loc_title_ta' in df_enriched.columns else 0

print(f"\nüìä === MERGE ZUSAMMENFASSUNG ===")
print(f"   Total Records: {len(df_enriched):,}")
print(f"   Mit ID-LoC: {_id_matches:,}")
print(f"   Mit TA-LoC: {_ta_matches:,}")


üîó === LOC-DATEN MERGE ===



‚úÖ ISBN/ISSN-basierte LoC-Daten (ID) gemerged
   ID-Matches: 797


‚úÖ Titel/Autor-basierte LoC-Daten (TA) gemerged
   TA-Matches: 853

üìä === MERGE ZUSAMMENFASSUNG ===
   Total Records: 58,305
   Mit ID-LoC: 797
   Mit TA-LoC: 853


In [8]:
# Normalise the year columns to the nullable integer dtype
print("üîß === DATENTYP-NORMALISIERUNG ===\n")

# Year columns to convert to Int64; columns that do not exist are skipped
year_columns = ['year', 'loc_year', 'loc_year_ta']

for year_col in year_columns:
    if year_col not in df_enriched.columns:
        continue

    filled_before = df_enriched[year_col].notna().sum()
    # Values that cannot be parsed as numbers become missing
    as_numbers = pd.to_numeric(df_enriched[year_col], errors='coerce')
    df_enriched[year_col] = as_numbers.astype('Int64')
    filled_after = df_enriched[year_col].notna().sum()

    print(f"   {year_col}: {filled_before:,} ‚Üí {filled_after:,} (Int64)")

    # Report how many values were lost in the conversion
    lost_values = filled_before - filled_after
    if lost_values != 0:
        print(f"      ‚ö†Ô∏è  {lost_values:,} Werte konnten nicht konvertiert werden")

print("\n‚úÖ Datentypen normalisiert")

üîß === DATENTYP-NORMALISIERUNG ===

   year: 33,313 ‚Üí 33,313 (Int64)
   loc_year: 311 ‚Üí 311 (Int64)
   loc_year_ta: 454 ‚Üí 454 (Int64)

‚úÖ Datentypen normalisiert


In [9]:
# Save the enriched data set and the metadata describing this step
print("üíæ === DATEN SPEICHERN ===\n")


def _loc_query_counts(cache_df):
    """Return total/successful/failed counts of one lookup cache as plain ints."""
    if len(cache_df) == 0:
        return {'total_queries': 0, 'successful': 0, 'failed': 0}
    return {
        'total_queries': len(cache_df),
        'successful': int((cache_df['loc_found'] == True).sum()),
        'failed': int((cache_df['loc_found'] == False).sum()),
    }


# Output paths
output_path = processed_dir / '04b_loc_enriched_data.parquet'
metadata_output = processed_dir / '04b_metadata.json'

# 1. Write the parquet file
df_enriched.to_parquet(output_path, index=False)

print(f"‚úÖ LoC-angereicherte Daten gespeichert: {output_path.name}")
print(f"   Records: {len(df_enriched):,}")
print(f"   Spalten: {len(df_enriched.columns)}")
print(f"   Gr√∂√üe: {output_path.stat().st_size / 1024**2:.1f} MB")

# 2. Build the metadata. Counts are cast to int so json can serialise them.
_ta_probe_cols = ['loc_title_ta', 'loc_authors_ta', 'loc_year_ta', 'loc_publisher_ta']

metadata = {
    'step': '04b_loc_enrichment',
    'input_file': '03_language_detected_data.parquet',
    'output_file': '04b_loc_enriched_data.parquet',
    'timestamp': pd.Timestamp.now().isoformat(),
    'record_count': len(df_enriched),
    'english_records': len(df_english),
    'columns': list(df_enriched.columns),

    'loc_queries': {
        'isbn_issn': _loc_query_counts(loc_data_df),
        'title_author': _loc_query_counts(loc_title_df),
    },

    # Records for which each LoC variant is available; a variant's columns
    # are absent when its cache was empty at merge time
    'loc_variants': {
        'id_available': int(df_enriched['loc_query_method'].notna().sum()) if 'loc_query_method' in df_enriched.columns else 0,
        'ta_available': int(df_enriched[_ta_probe_cols].notna().any(axis=1).sum()) if 'loc_title_ta' in df_enriched.columns else 0,
    },
}

# Write the metadata as UTF-8 and keep non-ASCII characters readable
with open(metadata_output, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print(f"\n‚úÖ Metadaten gespeichert: {metadata_output.name}")

# 3. Summary
print(f"\nüìä === LOC ENRICHMENT ABGESCHLOSSEN ===")
print(f"   Input: {len(df_vdeh):,} VDEH Records")
print(f"   Englischsprachig: {len(df_english):,}")
print(f"   Output: {len(df_enriched):,} Records mit LoC-Daten")
print(f"   ID-Variante verf√ºgbar: {metadata['loc_variants']['id_available']:,}")
print(f"   TA-Variante verf√ºgbar: {metadata['loc_variants']['ta_available']:,}")
print(f"\n‚û°Ô∏è  N√§chster Schritt: Integration mit DNB-Daten in 05_vdeh_data_fusion.ipynb")
print(f"\nüéâ LoC Enrichment erfolgreich abgeschlossen!")