In [1]:
from processing_pipeline import ProcessingPipeline

from features.spectral_indices import CalculateIndices
from features.basic_features import BasicFeatures
from features.temporal_features import TemporalFeatures

from processing_steps.interpolation import Interpolation
from processing_steps.detect_disturbed_trees import DetectDisturbedTrees
from processing_steps.data_augmentation import DataAugmentation
from processing_steps.adjust_labels import AdjustLabels
from processing_steps.aggregation import TimeSeriesAggregate
from processing_steps.interpolate_nans import InterpolateNaNs

from data_reduction.old_disturbance_pruner import OldDisturbancePruner

import pandas as pd

In [2]:
# Relative path to the preprocessed CSV used by this notebook: it is read directly
# with pd.read_csv and is also passed as `path=` to ProcessingPipeline further down.
# Being relative, it resolves against the kernel's working directory.
PATH = "../../../data/preprocessed/testset.csv"

In [3]:
# Read the CSV at PATH, parsing the "time" column as datetimes,
# then show the resulting frame as the cell output.
test_set = pd.read_csv(
    PATH,
    parse_dates=["time"],
)
test_set

Unnamed: 0,time,id,disturbance_year,doy,b2,b3,b4,b5,b6,b7,b8,b8a,b11,b12,species
0,2017-03-13,1,0.0,72.0,147.0,204.0,145.0,400.0,1193.0,1435.0,1593.0,1642.0,672.0,347.0,Norway_spruce_mixed
1,2017-03-27,1,0.0,86.0,137.0,222.0,154.0,389.0,1247.0,1561.0,1557.0,1747.0,743.0,388.0,Norway_spruce_mixed
2,2017-04-09,1,0.0,99.0,160.0,248.0,169.0,395.0,1209.0,1454.0,1530.0,1786.0,674.0,327.0,Norway_spruce_mixed
3,2017-04-22,1,0.0,112.0,146.0,247.0,152.0,391.0,1354.0,1628.0,1672.0,1878.0,673.0,305.0,Norway_spruce_mixed
4,2017-04-29,1,0.0,119.0,171.0,264.0,186.0,419.0,1353.0,1731.0,1806.0,1938.0,743.0,325.0,Norway_spruce_mixed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784529,2022-08-12,28201,0.0,224.0,154.0,292.0,166.0,445.0,1584.0,1792.0,2077.0,2051.0,737.0,333.0,Norway_spruce
784530,2022-08-17,28201,0.0,229.0,151.0,313.0,183.0,440.0,1632.0,1990.0,2255.0,2248.0,873.0,377.0,Norway_spruce
784531,2022-08-27,28201,0.0,239.0,122.0,272.0,138.0,374.0,1426.0,1856.0,2035.0,1859.0,642.0,243.0,Norway_spruce
784532,2022-09-06,28201,0.0,249.0,116.0,241.0,119.0,357.0,1275.0,1616.0,1970.0,1997.0,586.0,235.0,Norway_spruce


In [4]:
# Step list for the test data (presumably applied in list order by ProcessingPipeline).
# It mirrors the shared feature steps of `train_steps`; the pruning, augmentation and
# label-related steps appear only in the training list.
#
# InterpolateNaNs is placed before CalculateIndices so that these two steps run in the
# same order as in `train_steps`; otherwise train and test features would be derived
# through different step sequences.
# NOTE(review): CalculateIndices used to come first here — confirm that the training
# order is the intended one for both lists.
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=True, method="quadratic"),
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
    Interpolation(on=True),
]

In [None]:
# Build the training step list, run it through ProcessingPipeline and display the result.
# TODO: df.time.min() for aggregation in Augmentation & TimeSeriesAggregate (or maybe not needed?)

threshold = 150 # ids with size <150 will be augmented

# Steps are listed in the order they are handed to the pipeline
# (presumably also the execution order — confirm in ProcessingPipeline).
train_steps = [
    BasicFeatures(on=True), # TODO: remove the sampling that is done inside BasicFeatures
    OldDisturbancePruner(on=True),
    DataAugmentation(on=True, threshold=threshold),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),    
    InterpolateNaNs(on=True, method="quadratic"),
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
    Interpolation(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=False),  # passed on=False; presumably this disables the step — confirm
]

# NOTE(review): PATH points at "testset.csv", yet the training step list (pruning,
# augmentation, disturbance detection) is applied to it — confirm this input is intended.
pipeline = ProcessingPipeline(path=PATH, steps=train_steps)

df_processed = pipeline.run()
df_processed

# TODO: class for order of columns
# TODO: analyse more closely how ids change after augmentation
# -> check DetectDisturbedTrees & AdjustLabels with respect to augmentation
# Does disturbance_year only stay non-zero if the id was not augmented? (i.e. size > threshold)

Target number of IDs per species: 883


Augmenting species: 100%|██████████| 7/7 [03:30<00:00, 30.04s/it]


Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,mtci,rendvi,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season,is_disturbed
0,2016-12-26,,,,,,,,,,...,,,12,2016,1,-2.449294e-16,1.000000e+00,,0,True
1,2017-01-09,,,,,,,,,,...,,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0,True
2,2017-01-23,,,,,,,,,,...,,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0,True
3,2017-02-06,,,,,,,,,,...,,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0,True
4,2017-02-20,,,,,,,,,,...,,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858083,2022-09-12,,,806.101214,,,,,,,...,,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1,True
858084,2022-09-26,,,806.101214,,,,,,,...,,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1,True
858085,2022-10-10,,,806.101214,,,,,,,...,,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1,True
858086,2022-10-24,,,806.101214,,,,,,,...,,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1,True


In [6]:
# Row count per value of the "species" column in the processed frame.
# NOTE(review): the saved output lists labels such as `disturbed_disturbed` and
# `soil_disturbed`; presumably a "_disturbed" suffix is appended by an upstream step —
# confirm that suffixing these classes is intended.
df_processed.groupby("species").size()

species
Norway_spruce                      1694
Norway_spruce_disturbed          151382
Norway_spruce_mixed               13552
Norway_spruce_mixed_disturbed    242396
Scots_pine                        89936
Scots_pine_disturbed              93786
beech                              9086
beech_disturbed                   36344
disturbed_disturbed               94556
oak                                7392
oak_disturbed                      6930
soil                              21868
soil_disturbed                    89166
dtype: int64