In [2]:
from pathlib import Path
import pandas as pd
import torch
import pickle
from models.lstm.lstm_utils.species_predictor import SpeciesPredictor
from models.lstm.lstm_utils.utility_functions import df_to_sequences

# === Importiere deine Processing-Klassen ===
from pipelines.processing.processing_pipeline import ProcessingPipeline
from pipelines.processing.features.basic_features import BasicFeatures
from pipelines.processing.features.temporal_features import TemporalFeatures
from pipelines.processing.features.spectral_indices import CalculateIndices
from pipelines.processing.processing_steps.interpolation import Interpolation
from pipelines.processing.processing_steps.aggregation import TimeSeriesAggregate
from pipelines.processing.processing_steps.interpolate_nans import InterpolateNaNs
from pipelines.processing.processing_steps.smoothing import Smooth





RAW_NEW_PATH = "../../../data/val/FINAL_Validierungs_Datensatz.csv"
PROCESSED_NEW_PATH = "../../../data/val/val_processed.csv"
CHECKPOINT_PATH = "../../../data/lstm_training/results/epochepoch=89.ckpt"
ENCODER_DIR = Path("../../../data/lstm_training/results/encoders")
OUTPUT_PATH = "../../../data/val/val_predictions.csv"

In [None]:
# # === 1️⃣ Dieselben Preprocessing-Steps wie für Testdaten ===
# test_steps = [
#     BasicFeatures(on=True),
#     TimeSeriesAggregate(on=True, freq=2, method="mean"),
#     InterpolateNaNs(on=True, method="linear"),
#     Smooth(on=True, overwrite=True),
#     CalculateIndices(on=True),
#     TemporalFeatures(on=True),
#     Interpolation(on=True),
# ]

# print("→ Running preprocessing on new data...")
# pipeline = ProcessingPipeline(path=RAW_NEW_PATH, steps=test_steps)
# df_processed = pipeline.run()
# df_processed.to_csv(PROCESSED_NEW_PATH, index=False)
# print(f"✓ Saved processed new data ({df_processed.shape})")


→ Running preprocessing on new data...
(2183037, 13)
✓ Saved processed new data ((2342956, 36))


In [None]:
df_processed = pd.read_csv(PROCESSED_NEW_PATH)

In [5]:

# === 2️⃣ Lade Encoder, Scaler, Feature Columns ===
with open(ENCODER_DIR / "label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

with open(ENCODER_DIR / "scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open(ENCODER_DIR / "feature_columns.pkl", "rb") as f:
    feature_columns = pickle.load(f)

print(label_encoder.classes_)
print(f"✓ Loaded encoders & feature columns ({len(feature_columns)} features)")
feature_columns

['Norway_spruce' 'Norway_spruce_mixed' 'Scots_pine' 'beech' 'disturbed'
 'oak' 'soil']
✓ Loaded encoders & feature columns (72 features)


['b2',
 'b3',
 'b4',
 'b5',
 'b6',
 'b7',
 'b8',
 'b8a',
 'b11',
 'b12',
 'ndvi',
 'gndvi',
 'wdvi',
 'tndvi',
 'savi',
 'ipvi',
 'mcari',
 'reip',
 'masvi2',
 'dvi',
 'ndmi',
 'nbr',
 'ndwi',
 'mtci',
 'rendvi',
 'biweek_sin',
 'biweek_cos',
 'season_1.0',
 'season_2.0',
 'season_3.0',
 'season_4.0',
 'is_growing_season_0.0',
 'is_growing_season_1.0',
 'month_num_1.0',
 'month_num_2.0',
 'month_num_3.0',
 'month_num_4.0',
 'month_num_5.0',
 'month_num_6.0',
 'month_num_7.0',
 'month_num_8.0',
 'month_num_9.0',
 'month_num_10.0',
 'month_num_11.0',
 'month_num_12.0',
 'biweek_of_year_1.0',
 'biweek_of_year_2.0',
 'biweek_of_year_3.0',
 'biweek_of_year_4.0',
 'biweek_of_year_5.0',
 'biweek_of_year_6.0',
 'biweek_of_year_7.0',
 'biweek_of_year_8.0',
 'biweek_of_year_9.0',
 'biweek_of_year_10.0',
 'biweek_of_year_11.0',
 'biweek_of_year_12.0',
 'biweek_of_year_13.0',
 'biweek_of_year_14.0',
 'biweek_of_year_15.0',
 'biweek_of_year_16.0',
 'biweek_of_year_17.0',
 'biweek_of_year_18.0',
 'b

In [6]:
df_processed.columns

Index(['time', 'id', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'b8a', 'b11',
       'b12', 'is_disturbed', 'ndvi', 'gndvi', 'wdvi', 'tndvi', 'savi', 'ipvi',
       'mcari', 'reip', 'masvi2', 'dvi', 'ndmi', 'nbr', 'ndwi', 'mtci',
       'rendvi', 'month_num', 'year', 'season', 'biweek_of_year', 'biweek_sin',
       'biweek_cos', 'date_diff', 'is_growing_season'],
      dtype='object')

In [12]:
exclude_columns = [
    "time", "id", "disturbance_year", "is_disturbed",
    "date_diff", "year", "doy"
]

categorical_cols = ["season", "is_growing_season", "month_num", "biweek_of_year"]
categorical_cols = [c for c in categorical_cols if c in df_processed.columns]

df_encoded = pd.get_dummies(df_processed, columns=categorical_cols)
feature_columns = [
    c for c in df_encoded.columns
    if c not in exclude_columns
]

df_encoded[feature_columns] = scaler.fit_transform(df_encoded[feature_columns])
df_encoded = df_encoded.drop(columns=["time","year","date_diff","is_disturbed" ], axis=1)
print("\n✅ Transformation abgeschlossen!")
print(f"Anzahl Feature Columns: {len(df_encoded.columns)}")
df_encoded


✅ Transformation abgeschlossen!
Anzahl Feature Columns: 73


Unnamed: 0,id,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,biweek_of_year_18,biweek_of_year_19,biweek_of_year_20,biweek_of_year_21,biweek_of_year_22,biweek_of_year_23,biweek_of_year_24,biweek_of_year_25,biweek_of_year_26,biweek_of_year_27
0,3,,,,,,,,,,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,5.458938
1,3,,,,,,,,,,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2,3,,,,,,,,,,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
3,3,,,,,,,,,,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
4,3,,,,,,,,,,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2342951,83996,-0.756825,-0.816457,-0.753256,-0.949713,-0.671991,-0.430938,-0.440882,-0.576141,-0.946861,...,-0.201347,4.966555,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2342952,83996,-0.877214,-0.976451,-0.845662,-1.100717,-0.855977,-0.584586,-0.657020,-0.742423,-1.115028,...,-0.201347,-0.201347,4.966555,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2342953,83996,-0.937409,-1.082174,-0.924267,-1.194074,-0.984445,-0.794854,-0.873765,-0.859763,-1.229751,...,-0.201347,-0.201347,-0.201347,4.966555,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2342954,83996,-0.891009,-1.110367,-0.937168,-1.216520,-1.119752,-0.899663,-0.991962,-0.935282,-1.246540,...,-0.201347,-0.201347,-0.201347,-0.201347,4.966555,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186


In [13]:
new_sequences = df_to_sequences(df_encoded, feature_columns)

In [None]:
X_sequences = [x for x, _ in new_sequences]  
X_tensor = torch.tensor(X_sequences, dtype=torch.float32)

In [None]:

# === 5️⃣ Modell laden ===
model = SpeciesPredictor.load_from_checkpoint(
    CHECKPOINT_PATH,
    n_features=len(feature_columns),
    n_classes=len(label_encoder.classes_),
    lr=1e-3,
    class_weights=None,
)
model.eval()


ValueError: too many values to unpack (expected 2)

In [None]:

# === 6️⃣ Vorhersage ===
with torch.no_grad():
    outputs = model(X_tensor)
    preds = torch.argmax(outputs, dim=1).numpy()

# === 7️⃣ Label zurückübersetzen ===
pred_labels = label_encoder.inverse_transform(preds)
df_processed["predicted_species"] = pred_labels

# === 8️⃣ Ergebnisse speichern ===
df_processed.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Predictions saved to: {OUTPUT_PATH}")
print(df_processed[["predicted_species"]].value_counts())