# 01 - Datenaufbereitung fuer Surrogat & Generator

Dieses Notebook fuehrt den vollstaendigen Vorverarbeitungs-Workflow fuer die
organischen Halbleiter-Daten aus:

1. Konfiguration laden und Datenquelle validieren
2. Rohdaten pruefen, bereinigen und dokumentieren
3. Splits sowie Normalisierung berechnen und speichern

> Ergebnisdateien landen in `data/processed` und werden von Training/Generierung verwendet.


In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import os
import sys

def _find_project_root(marker: str = "src") -> Path:
    candidates = []
    env_root = os.environ.get("PROJECT_ROOT")
    if env_root:
        candidates.append(Path(env_root).expanduser())

    try:
        notebook_path = Path(__vsc_ipynb_file__).resolve()  # type: ignore[name-defined]
        candidates.append(notebook_path.parent)
    except Exception:
        pass

    cwd = Path().resolve()
    candidates.append(cwd)
    candidates.extend(cwd.parents)

    for drive_letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        drive_root = Path(f"{drive_letter}:/Ackern/BLLAmen")
        if drive_root.exists():
            candidates.append(drive_root)


    unique_candidates = []
    seen = set()
    for cand in candidates:
        if cand is None:
            continue
        try:
            resolved = cand.resolve()
        except FileNotFoundError:
            continue
        if resolved in seen:
            continue
        seen.add(resolved)
        unique_candidates.append(resolved)

    for base in unique_candidates:
        for candidate in [base, *base.parents]:
            if (candidate / marker).exists():
                return candidate

    raise RuntimeError(f"Could not locate project root containing {marker}/")

# Resolve the repository root once and make both it and its ``src`` folder
# importable for the project imports that follow.
PROJECT_ROOT = _find_project_root()
SRC_PATH = PROJECT_ROOT / "src"
# Each missing entry is pushed to the front; SRC_PATH is handled last, so it
# ends up ahead of PROJECT_ROOT when both are inserted.
for _import_dir in map(str, (PROJECT_ROOT, SRC_PATH)):
    if _import_dir not in sys.path:
        sys.path.insert(0, _import_dir)

import json
from typing import Sequence

import numpy as np
import pandas as pd
from rdkit import Chem
from IPython.display import display

import importlib
import src.data.dataset as dataset_module
dataset_module = importlib.reload(dataset_module)
from src.data.dataset import (
    load_dataframe,
    split_dataframe,
    compute_normalization,
    apply_normalization,
)
from src.data.featurization import mol_to_graph
from src.utils.config import load_config
from src.utils.log import setup_logging, get_logger
from src.utils.plot import plot_property_histogram

# Notebook-wide defaults for how DataFrames are rendered.
for _option, _value in (("display.max_columns", 40), ("display.precision", 4)):
    pd.set_option(_option, _value)

# Configure logging through the project helper, then fetch this notebook's logger.
setup_logging()
logger = get_logger(__name__)


## 1. Konfiguration und Pfade

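Wir lesen je nach Umgebungsvariable `DATA_PREP_CONFIG` entweder
`configs/train_conf.yaml` (`surrogate`, Standard) oder `configs/gen_conf.yaml`
(`generator`), passen den Datensatzpfad bei Bedarf an und legen
alle Ausgabeverzeichnisse an.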


In [None]:
# Map each supported preparation target to its YAML configuration file.
CONFIG_MAP = {
    'surrogate': PROJECT_ROOT / 'configs/train_conf.yaml',
    'generator': PROJECT_ROOT / 'configs/gen_conf.yaml',
}
# The DATA_PREP_CONFIG environment variable selects the entry
# (case-insensitive); without it the surrogate configuration is used.
CONFIG_KIND = os.environ.get('DATA_PREP_CONFIG', 'surrogate').lower()
if CONFIG_KIND not in CONFIG_MAP:
    raise KeyError(f"Unbekannte DATA_PREP_CONFIG '{CONFIG_KIND}'; verfuegbar: {list(CONFIG_MAP)}")
CONFIG_PATH = CONFIG_MAP[CONFIG_KIND]
config = load_config(CONFIG_PATH)
logger.info("Konfiguration geladen (%s): %s", CONFIG_KIND, CONFIG_PATH)

# A relative dataset path is taken relative to the project root; an absolute
# one replaces it (pathlib join semantics).
raw_path = (PROJECT_ROOT / config.dataset.path).resolve()
if not raw_path.exists():
    raise FileNotFoundError(f"Datensatz nicht gefunden: {raw_path}")
logger.info("Arbeite mit Datensatz: %s", raw_path)

# Output locations used by the later cells: split CSVs and the normalization
# JSON go to PROCESSED_DIR, histogram PNGs to PLOTS_DIR. They were referenced
# but never defined, which made those cells fail with NameError.
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
PLOTS_DIR = PROCESSED_DIR / "figures"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)  # also creates PROCESSED_DIR
logger.info("Ausgabeverzeichnisse: %s, %s", PROCESSED_DIR, PLOTS_DIR)

## 2. Rohdaten laden

Wir lesen die Datei ein, validieren Pflichtspalten und schauen auf die ersten Zeilen.


In [None]:
# Read the raw table through the project loader and report its dimensions.
raw_df = load_dataframe(raw_path)
logger.info("Rohdaten gelesen: %d Zeilen, %d Spalten", *raw_df.shape)

# The SMILES column and every configured target column are mandatory.
expected_cols = {"smiles", *config.dataset.target_columns}
missing_cols = expected_cols - set(raw_df.columns)
if missing_cols:
    raise ValueError(f"Rohdaten fehlen Spalten: {sorted(missing_cols)}")

# Keep only SMILES followed by the targets, in configuration order.
raw_df = raw_df.loc[:, ["smiles", *config.dataset.target_columns]].copy()
raw_df.head()


## 3. SMILES und Zielgroessen bereinigen

Wir entfernen invalide SMILES, Zeilen mit fehlenden Targets und Duplikate.


In [None]:
def sanitize_dataframe(df: pd.DataFrame, target_cols: Sequence[str]) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    The steps run in this order:

    1. Cast the ``smiles`` column to ``str`` and strip surrounding whitespace.
    2. Drop rows whose SMILES is empty or cannot be parsed by RDKit.
    3. Drop rows with a missing value in any of ``target_cols``.
    4. Drop rows with a duplicate SMILES string (first occurrence is kept)
       and reset the index.

    Args:
        df: Table with a ``smiles`` column and all columns in ``target_cols``.
        target_cols: Names of the target columns; any sequence is accepted.

    Returns:
        The cleaned table with a fresh ``0..n-1`` index.
    """
    # pandas reads a tuple as ONE (multi-level) key, so normalise to a list
    # before using the names as a column selector.
    target_cols = list(target_cols)

    working = df.copy()
    working["smiles"] = working["smiles"].astype(str).str.strip()

    mols = working["smiles"].apply(Chem.MolFromSmiles)
    # RDKit returns an empty molecule rather than None for an empty string,
    # so blank SMILES have to be flagged explicitly.
    invalid_mask = mols.isna() | working["smiles"].eq("")
    if invalid_mask.any():
        logger.warning("Ignoriere %d invalide SMILES", int(invalid_mask.sum()))
    working = working[~invalid_mask].copy()

    missing_mask = working[target_cols].isna().any(axis=1)
    if missing_mask.any():
        logger.warning("Ignoriere %d Zeilen mit fehlenden Zielwerten", int(missing_mask.sum()))
    working = working[~missing_mask].copy()

    # NOTE(review): duplicates are detected on the stripped string, so two
    # different notations of the same molecule are both kept. Consider
    # comparing canonical SMILES if that matters for the splits.
    before = len(working)
    working = working.drop_duplicates(subset="smiles").reset_index(drop=True)
    removed = before - len(working)
    if removed:
        logger.info("Entfernte %d Duplikate anhand SMILES", removed)

    return working

# Run the cleaning step on the raw table and report how many rows remain.
clean_df = sanitize_dataframe(df=raw_df, target_cols=config.dataset.target_columns)
logger.info("Bereinigte Tabelle: %d Zeilen", len(clean_df.index))
clean_df.head()


## 4. Zielgroessen analysieren

Deskriptive Statistik und Histogramme fuer die Targets.


In [None]:
# Descriptive statistics of the targets, transposed so each target is a row.
summary = clean_df[config.dataset.target_columns].describe().transpose()
summary


In [None]:
# Plot one histogram per target, pass a PNG path under PLOTS_DIR to the helper,
# and keep every figure object the helper hands back.
created_figures = {}
for target_name in config.dataset.target_columns:
    values = clean_df[target_name].to_numpy(dtype=float)
    png_path = PLOTS_DIR / f"{target_name.lower()}_hist.png"
    figure = plot_property_histogram(
        values,
        title=f"Verteilung {target_name}",
        xlabel=f"{target_name} [eV]",
        save_path=str(png_path),
    )
    if figure is None:
        continue  # nothing returned, so nothing to keep
    created_figures[target_name] = figure
created_figures


## 5. Graph-Featurisierung testen

Mit Stichproben pruefen wir die Ausgabe von `mol_to_graph`.


In [None]:
# Featurize up to three reproducibly sampled rows and record basic size
# information about each resulting graph.
sample = clean_df.sample(n=min(len(clean_df), 3), random_state=42)
graph_checks = []
for _, record in sample.iterrows():
    targets = record[config.dataset.target_columns].to_numpy(dtype=float)
    graph = mol_to_graph(record["smiles"], y=targets)
    graph_checks.append(
        {
            "smiles": record["smiles"],
            "num_nodes": int(graph.num_nodes),
            "num_edges": int(graph.num_edges),
            "target_shape": None if graph.y is None else tuple(graph.y.shape),
        }
    )
graph_checks


## 6. Train/Val/Test Split speichern

Deterministischer Split mit den Helfern aus `src.data.dataset`.


In [None]:
# Split the cleaned table. The validation share comes from the configuration;
# the test share and the seed are fixed in this cell.
split = split_dataframe(
    clean_df,
    val_fraction=float(config.dataset.val_fraction),
    test_fraction=0.1,
    seed=42,
)
# Row count per partition, for a quick sanity check.
split_sizes = {}
for part in ("train", "val", "test"):
    split_sizes[part] = len(getattr(split, part))
split_sizes


In [None]:
# Write every non-empty split to PROCESSED_DIR as "<dataset stem>_<split>.csv"
# and remember where each file went.
base_name = Path(config.dataset.path).stem
split_paths = {}
for split_name in ("train", "val", "test"):
    part_df = getattr(split, split_name)
    if not part_df.empty:
        csv_path = PROCESSED_DIR / f"{base_name}_{split_name}.csv"
        part_df.to_csv(csv_path, index=False)
        # Stored relative to the project root.
        split_paths[split_name] = str(csv_path.relative_to(PROJECT_ROOT))
    else:
        logger.warning("Split '%s' ist leer und wird nicht gespeichert.", split_name)
logger.info("Persistierte Splits: %s", split_paths)
split_paths


## 7. Normalisierung berechnen

Mean/Std werden ausschliesslich auf dem Train-Split berechnet und fuer die spaetere Inferenz als JSON abgelegt.


In [None]:
# Statistics are computed from the training split, so it must not be empty.
if split.train.empty:
    raise RuntimeError("Train-Split ist leer; pruefe Datensatz oder Split-Parameter.")

norm_stats = compute_normalization(split.train, config.dataset.target_columns)
norm_payload = dict(
    mean=norm_stats.mean.to_dict(),
    std=norm_stats.std.to_dict(),
)

# Persist the statistics next to the split files.
norm_path = PROCESSED_DIR / f"{base_name}_normalization.json"
with norm_path.open("w", encoding="utf-8") as json_file:
    json.dump(norm_payload, json_file, indent=2)

# Apply the statistics to the first three training rows as a visual check.
normalized_preview = apply_normalization(
    split.train.head(3), norm_stats, config.dataset.target_columns
)
display(norm_payload)
normalized_preview


## 8. Zusammenfassung

- Rohdaten validiert und bereinigt
- Splits + Normalisierung in `data/processed` abgelegt
- Histogramme in `data/processed/figures` gespeichert

Weiter geht es mit `02_surrogate_training.ipynb` oder den Skripten in `src/`.
