# Predictions Validationset

In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from models.lstm.lstm_utils.species_predictor import SpeciesPredictor
from models.lstm.lstm_utils.utility_functions import df_to_sequences
from models.lstm.lstm_utils.data_loader import CSVDataLoader
from pipelines.processing.processing_pipeline import ProcessingPipeline
from pipelines.processing.features.basic_features import BasicFeatures
from pipelines.processing.features.temporal_features import TemporalFeatures
from pipelines.processing.features.spectral_indices import CalculateIndices
from pipelines.processing.processing_steps.interpolation import Interpolation
from pipelines.processing.processing_steps.aggregation import TimeSeriesAggregate
from pipelines.processing.processing_steps.interpolate_nans import InterpolateNaNs
from pipelines.processing.processing_steps.smoothing import Smooth


RAW_NEW_PATH = "../../../data/val/FINAL_Validierungs_Datensatz.csv"
PROCESSED_NEW_PATH = "../../../data/val/val_processed.csv"
CHECKPOINT_PATH = "../../../data/lstm_training/results/epochepoch=89.ckpt"
ENCODER_DIR = Path("../../../data/lstm_training/results/encoders")
OUTPUT_PATH = "../../../data/val/val_predictions.csv"
data_loader = CSVDataLoader()

In [2]:
def preprocess_data(raw_path, processed_path):
    """Preprocess raw CSV and optionally sample a subset of IDs."""
    pipeline_steps = [
        BasicFeatures(on=True),
        TimeSeriesAggregate(on=True, freq=2, method="mean"),
        InterpolateNaNs(on=True, method="linear"),
        Smooth(on=True, overwrite=True),
        CalculateIndices(on=True),
        TemporalFeatures(on=True),
        Interpolation(on=True),
    ]

    pipeline = ProcessingPipeline(path=raw_path, steps=pipeline_steps)
    df_processed = pipeline.run()

    df_processed.to_csv(processed_path, index=False)
    print(f"Processed data saved ({df_processed.shape})")
    return df_processed

df_processed = preprocess_data(RAW_NEW_PATH, PROCESSED_NEW_PATH)
df_processed

(2183037, 13)
Processed data saved ((2342956, 36))


Unnamed: 0,time,id,b2,b3,b4,b5,b6,b7,b8,b8a,...,mtci,rendvi,month_num,year,season,biweek_of_year,biweek_sin,biweek_cos,date_diff,is_growing_season
0,2016-12-26,3,,,,,,,,,...,,,12,2016,1,27,0.239316,0.970942,,0
1,2017-01-09,3,,,,,,,,,...,,,1,2017,1,2,0.464723,0.885456,14.0,0
2,2017-01-23,3,,,,,,,,,...,,,1,2017,1,3,0.663123,0.748511,14.0,0
3,2017-02-06,3,,,,,,,,,...,,,2,2017,1,4,0.822984,0.568065,14.0,0
4,2017-02-20,3,,,,,,,,,...,,,2,2017,1,5,0.935016,0.354605,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2342951,2022-09-12,83996,113.250000,218.833333,123.416667,340.333333,1180.583333,1605.000000,1702.583333,1706.416667,...,948.708333,0.650103,9,2022,4,19,-0.992709,-0.120537,14.0,1
2342952,2022-09-26,83996,89.250000,181.000000,97.750000,291.000000,1066.250000,1486.500000,1524.750000,1568.250000,...,871.875000,0.672574,9,2022,4,20,-0.992709,0.120537,14.0,1
2342953,2022-10-10,83996,77.250000,156.000000,75.916667,260.500000,986.416667,1324.333333,1346.416667,1470.750000,...,818.208333,0.671259,10,2022,4,21,-0.935016,0.354605,14.0,1
2342954,2022-10-24,83996,86.500000,149.333333,72.333333,253.166667,902.333333,1243.500000,1249.166667,1408.000000,...,739.583333,0.661693,10,2022,4,22,-0.822984,0.568065,14.0,1


In [3]:
df_processed = data_loader.load_transform(PROCESSED_NEW_PATH)
cols_to_float = ["biweek_of_year", "is_growing_season", "month_num", "season"]
df_processed[cols_to_float] = df_processed[cols_to_float].astype(float)

In [4]:
enc_files = {
    "label_encoder": "label_encoder.pkl",
    "scaler": "scaler.pkl",
    "feature_columns": "feature_columns.pkl"
}

label_encoder, scaler, feature_columns = (
    pickle.load(open(ENCODER_DIR / fname, "rb")) for fname in enc_files.values()
)

print(label_encoder.classes_)
print(len(feature_columns))
print(feature_columns)
print(df_processed.columns)

['Norway_spruce' 'Norway_spruce_mixed' 'Scots_pine' 'beech' 'disturbed'
 'oak' 'soil']
72
['b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'b8a', 'b11', 'b12', 'ndvi', 'gndvi', 'wdvi', 'tndvi', 'savi', 'ipvi', 'mcari', 'reip', 'masvi2', 'dvi', 'ndmi', 'nbr', 'ndwi', 'mtci', 'rendvi', 'biweek_sin', 'biweek_cos', 'season_1.0', 'season_2.0', 'season_3.0', 'season_4.0', 'is_growing_season_0.0', 'is_growing_season_1.0', 'month_num_1.0', 'month_num_2.0', 'month_num_3.0', 'month_num_4.0', 'month_num_5.0', 'month_num_6.0', 'month_num_7.0', 'month_num_8.0', 'month_num_9.0', 'month_num_10.0', 'month_num_11.0', 'month_num_12.0', 'biweek_of_year_1.0', 'biweek_of_year_2.0', 'biweek_of_year_3.0', 'biweek_of_year_4.0', 'biweek_of_year_5.0', 'biweek_of_year_6.0', 'biweek_of_year_7.0', 'biweek_of_year_8.0', 'biweek_of_year_9.0', 'biweek_of_year_10.0', 'biweek_of_year_11.0', 'biweek_of_year_12.0', 'biweek_of_year_13.0', 'biweek_of_year_14.0', 'biweek_of_year_15.0', 'biweek_of_year_16.0', 'biweek_of_year_17.0'

In [5]:
exclude_columns = [
    "time",
    "id",
    "disturbance_year",
    "is_disturbed",
    "date_diff",
    "year",
    "doy",
]

categorical_cols = ["season", "is_growing_season", "month_num", "biweek_of_year"]
categorical_cols = [c for c in categorical_cols if c in df_processed.columns]

df_encoded = pd.get_dummies(df_processed, columns=categorical_cols)

feature_columns = [c for c in df_encoded.columns if c not in exclude_columns]

df_encoded[feature_columns] = scaler.transform(df_encoded[feature_columns])
df_encoded = df_encoded.drop(
    columns=["time", "year", "date_diff", "is_disturbed"], axis=1
)
print(f"Anzahl Feature Columns: {len(df_encoded.columns)}")
df_encoded

Anzahl Feature Columns: 73


Unnamed: 0,id,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,biweek_of_year_18.0,biweek_of_year_19.0,biweek_of_year_20.0,biweek_of_year_21.0,biweek_of_year_22.0,biweek_of_year_23.0,biweek_of_year_24.0,biweek_of_year_25.0,biweek_of_year_26.0,biweek_of_year_27.0
0,3,-1.231138,-1.614885,-1.146575,-1.855265,-2.365447,-2.333896,-2.332886,-2.422894,-1.764417,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,5.458938
1,3,-1.231138,-1.614885,-1.146575,-1.855265,-2.365447,-2.333896,-2.332886,-2.422894,-1.764417,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2,3,-1.231138,-1.614885,-1.146575,-1.855265,-2.365447,-2.333896,-2.332886,-2.422894,-1.764417,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
3,3,-1.231138,-1.614885,-1.146575,-1.855265,-2.365447,-2.333896,-2.332886,-2.422894,-1.764417,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
4,3,-1.231138,-1.614885,-1.146575,-1.855265,-2.365447,-2.333896,-2.332886,-2.422894,-1.764417,...,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2342951,83996,-0.684310,-0.728809,-0.700750,-0.845614,-0.562669,-0.333268,-0.339042,-0.460298,-0.862105,...,-0.201347,4.966555,-0.201347,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2342952,83996,-0.800194,-0.882000,-0.793467,-0.991969,-0.737258,-0.480978,-0.547297,-0.619208,-1.032261,...,-0.201347,-0.201347,4.966555,-0.201347,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2342953,83996,-0.858136,-0.983227,-0.872337,-1.082452,-0.859166,-0.683118,-0.756138,-0.731345,-1.148342,...,-0.201347,-0.201347,-0.201347,4.966555,-0.201347,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186
2342954,83996,-0.813473,-1.010221,-0.885281,-1.104207,-0.987563,-0.783876,-0.870024,-0.803515,-1.165329,...,-0.201347,-0.201347,-0.201347,-0.201347,4.966555,-0.201347,-0.183186,-0.183186,-0.183186,-0.183186


In [6]:
new_sequences = df_to_sequences(df_encoded, feature_columns)
X_tensor = torch.tensor(new_sequences, dtype=torch.float32)

  X_tensor = torch.tensor(new_sequences, dtype=torch.float32)


In [7]:
model = SpeciesPredictor.load_from_checkpoint(
    CHECKPOINT_PATH,
    n_features=len(feature_columns),
    n_classes=len(label_encoder.classes_),
    lr=1e-3,
)
model.eval()

SpeciesPredictor(
  (model): ImprovedSequenceModel(
    (lstm): LSTM(72, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (fc): Sequential(
      (0): Linear(in_features=512, out_features=128, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.3, inplace=False)
      (3): Linear(in_features=128, out_features=7, bias=True)
    )
  )
  (criterion): CrossEntropyLoss()
  (train_acc): MulticlassAccuracy()
  (val_acc): MulticlassAccuracy()
  (test_acc): MulticlassAccuracy()
)

In [8]:
with torch.no_grad():
    outputs = model(X_tensor)
    preds = torch.argmax(outputs, dim=1).numpy()

pred_labels = label_encoder.inverse_transform(preds)

In [9]:
ids = df_encoded["id"].unique() 
print(len(ids), len(pred_labels))
pred_df = pd.DataFrame({"id": ids, "predicted_species": pred_labels})
df_with_preds = df_processed.merge(pred_df, on="id", how="left")
df_with_preds.to_csv(OUTPUT_PATH, index=False)
counts_per_id = df_with_preds.groupby("id")["predicted_species"].value_counts().sort_index()
print(counts_per_id)

15214 15214
id     predicted_species  
3      soil                   154
6      soil                   154
9      soil                   154
13     soil                   154
21     soil                   154
                             ... 
83978  Norway_spruce_mixed    154
83986  Norway_spruce          154
83990  disturbed              154
83992  Norway_spruce          154
83996  Norway_spruce_mixed    154
Name: count, Length: 15214, dtype: int64
