In [None]:
from processing_pipeline import ProcessingPipeline

from features.spectral_indices import CalculateIndices
from features.basic_features import BasicFeatures
from features.temporal_features import TemporalFeatures

from processing_steps.interpolation import Interpolation
from processing_steps.data_augmentation import DataAugmentation
from processing_steps.adjust_labels import AdjustLabels
from processing_steps.aggregation import TimeSeriesAggregate
from processing_steps.interpolate_nans import InterpolateNaNs

from data_reduction.old_disturbance_pruner import OldDisturbancePruner
from data_reduction.detect_disturbed_trees import DetectDisturbedTrees

import pandas as pd

# Relative path to the preprocessed CSV. It is read directly with pandas and
# is also the `path` argument handed to ProcessingPipeline further down.
PATH = "../../../data/preprocessed/testset.csv"

In [9]:
# Load the preprocessed CSV untouched by any pipeline step; the "time" column
# is parsed into datetimes while reading.
test_set = pd.read_csv(
    PATH,
    parse_dates=["time"],
)

In [10]:
# Step list for test data: the reduced counterpart of `train_steps`, without
# the pruning, disturbance-detection, label-adjustment and augmentation steps.
#
# The steps after aggregation follow the same order as in `train_steps`
# (InterpolateNaNs -> CalculateIndices -> TemporalFeatures -> Interpolation).
# Previously CalculateIndices ran *before* InterpolateNaNs here, so indices at
# gap positions would be interpolated index values, whereas the training data
# gets indices computed from already-interpolated inputs.
# NOTE(review): assumes CalculateIndices derives its values from columns that
# InterpolateNaNs fills -- confirm against the step implementations.
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=True, method="quadratic"),
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
    Interpolation(on=True),
]

In [23]:
# TODO: remove ids that are not useful during preprocessing

# Forwarded to DataAugmentation below. Per the original author's note,
# ids with size < 150 will be augmented.
threshold = 150

# Steps handed to ProcessingPipeline.
# NOTE(review): presumably applied in list order -- confirm in ProcessingPipeline.
train_steps = [
    BasicFeatures(on=True),
    OldDisturbancePruner(on=True),
    CalculateIndices(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=True),
    DataAugmentation(on=True, threshold=threshold),
    # TODO: delete disturbance_year here, inside the class code
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=True, method="quadratic"),
    # Indices are calculated a second time because of the augmentation
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
    Interpolation(on=True),
]

pipeline = ProcessingPipeline(path=PATH, steps=train_steps)

# Run the pipeline and leave the resulting frame as the cell's displayed value.
df_processed = pipeline.run()
df_processed

# TODO: class for order of columns
# TODO: test AdjustLabels

Confusion Matrix:
 [[1473    1]
 [  12  186]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1474
           1       0.99      0.94      0.97       198

    accuracy                           0.99      1672
   macro avg       0.99      0.97      0.98      1672
weighted avg       0.99      0.99      0.99      1672

1 ids have been removed due to predicted disturbance
Confusion Matrix:
[[294   5]
 [  1 357]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       299
           1       0.99      1.00      0.99       358

    accuracy                           0.99       657
   macro avg       0.99      0.99      0.99       657
weighted avg       0.99      0.99      0.99       657

Target number of IDs per species: 883


Augmenting species: 100%|██████████| 9/9 [02:35<00:00, 17.27s/it]


Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
0,2016-12-26,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,12,2016,1,-2.449294e-16,1.000000e+00,,0
1,2017-01-09,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
2,2017-01-23,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
3,2017-02-06,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
4,2017-02-20,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649489,2022-09-12,,,799.05083,,,,,,,...,999aug4,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1649490,2022-09-26,,,799.05083,,,,,,,...,999aug4,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1649491,2022-10-10,,,799.05083,,,,,,,...,999aug4,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
1649492,2022-10-24,,,799.05083,,,,,,,...,999aug4,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [24]:
# Keep only rows whose id contains the literal substring "aug"
# (e.g. "10031aug1"); presumably these are the samples created by the
# augmentation step -- confirm against DataAugmentation.
# regex=False -> plain substring match rather than a regular expression.
# na=False    -> missing / non-string ids count as "no match" instead of
#                yielding NaN, which would make the boolean mask raise.
df_processed[df_processed["id"].str.contains("aug", regex=False, na=False)]

Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
1386,2016-12-26,,,,,,,,,,...,10031aug1,Norway_spruce,,12,2016,1,-2.449294e-16,1.000000e+00,,0
1387,2017-01-09,101.027268,42.767919,-18.270124,191.705097,668.191326,724.722801,840.618621,967.920218,194.577183,...,10031aug1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
1388,2017-01-23,103.209488,68.444874,3.327448,219.733571,739.993912,828.301645,938.107962,1048.742306,240.062042,...,10031aug1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
1389,2017-02-06,105.391708,94.121828,24.925020,247.762044,811.796498,931.880490,1035.597304,1129.564394,285.546901,...,10031aug1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
1390,2017-02-20,107.573927,119.798783,46.522592,275.790517,883.599084,1035.459334,1133.086645,1210.386482,331.031760,...,10031aug1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649489,2022-09-12,,,799.050830,,,,,,,...,999aug4,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1649490,2022-09-26,,,799.050830,,,,,,,...,999aug4,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1649491,2022-10-10,,,799.050830,,,,,,,...,999aug4,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
1649492,2022-10-24,,,799.050830,,,,,,,...,999aug4,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [12]:
# Disabled: write the processed frame to CSV and reload it from the same file
# (with "time" parsed as dates). Both lines are intentionally commented out.
# df_processed.to_csv("../../../data/processed/testset_processed_notadjustlabels.csv", index=False)
# df_processed = pd.read_csv("../../../data/processed/testset_processed_notadjustlabels.csv", parse_dates=["time"])

In [25]:
# Number of rows per species label in the frame loaded straight from the CSV.
test_set.groupby(by="species").size()

species
Norway_spruce          126620
Norway_spruce_mixed    225330
Scots_pine             179670
beech                   40916
disturbed               92661
oak                     13731
soil                   105606
dtype: int64

In [26]:
# Number of rows per species label after the pipeline has run.
df_processed.groupby(by="species").size()

species
Norway_spruce              186032
Norway_spruce_disturbed    151844
Norway_spruce_mixed        255948
Scots_pine                 262416
Scots_pine_disturbed       153384
beech                      158774
oak                        145684
soil                       197736
soil_disturbed             137676
dtype: int64