In [1]:
from processing_pipeline import ProcessingPipeline

from features.spectral_indices import CalculateIndices
from features.basic_features import BasicFeatures
from features.temporal_features import TemporalFeatures

from processing_steps.interpolation import Interpolation
from processing_steps.data_augmentation import DataAugmentation
from processing_steps.adjust_labels import AdjustLabels
from processing_steps.aggregation import TimeSeriesAggregate
from processing_steps.interpolate_nans import InterpolateNaNs

from data_reduction.old_disturbance_pruner import OldDisturbancePruner
from data_reduction.detect_disturbed_trees import DetectDisturbedTrees
from data_reduction.timeseries_filter import TimeSeriesFilter

import pandas as pd

# Location of the input CSV; read directly with pd.read_csv and also passed
# to ProcessingPipeline(path=...) in the cells below.
PATH = "../../../data/preprocessed/testset.csv"

In [2]:
# Load the unprocessed table from PATH, parsing the "time" column as dates.
test_set = pd.read_csv(
    PATH,
    parse_dates=["time"],
)

In [3]:
# Processing steps intended for test-time data (presumably applied in list order).
# NOTE(review): no cell visible in this notebook passes `test_steps` to a
# pipeline -- confirm it is still needed.
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(
        on=True,
        freq=2,
        method="mean",
    ),
    CalculateIndices(on=True),
    InterpolateNaNs(
        on=True,
        method="quadratic",
    ),
    TemporalFeatures(on=True),
    Interpolation(on=True),
]

In [4]:
# Cut-off forwarded to DataAugmentation: ids with size < 150 will be augmented.
threshold = 150

# Step list handed to the pipeline below.
# NOTE(review): PATH points at "testset.csv" while this list is named
# `train_steps` -- confirm that running the training steps on this file is intended.
train_steps = [
    TimeSeriesFilter(on=True),
    BasicFeatures(on=True),
    OldDisturbancePruner(on=True),
    CalculateIndices(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=True),
    DataAugmentation(
        on=True,
        threshold=threshold,
    ),
    TimeSeriesAggregate(
        on=True,
        freq=2,
        method="mean",
    ),
    InterpolateNaNs(
        on=True,
        method="quadratic",
    ),
    # Indices are calculated a second time because of the augmentation step.
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
    Interpolation(on=True),
]

# Build the pipeline over the file at PATH and execute every step.
pipeline = ProcessingPipeline(
    path=PATH,
    steps=train_steps,
)

df_processed = pipeline.run()

# Bare last expression: render the processed frame as the cell output.
df_processed

0 ids have been removed due to predicted disturbance
Target number of IDs per species: 810


Augmenting species: 100%|██████████| 9/9 [03:06<00:00, 20.72s/it]


Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
0,2016-12-26,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,12,2016,1,-2.449294e-16,1.000000e+00,,0
1,2017-01-09,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
2,2017-01-23,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
3,2017-02-06,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
4,2017-02-20,,,,,,,,,,...,1,Norway_spruce_mixed,0.0,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548311,2022-09-12,,,808.443462,,,,,,,...,999_aug_3,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1548312,2022-09-26,,,808.443462,,,,,,,...,999_aug_3,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1548313,2022-10-10,,,808.443462,,,,,,,...,999_aug_3,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
1548314,2022-10-24,,,808.443462,,,,,,,...,999_aug_3,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [5]:
# Show only the augmented series; their ids contain "aug" (e.g. "10004_aug_1").
# na=False treats a missing id as "not augmented" instead of yielding a NaN
# entry in the mask, which boolean indexing would reject with an error.
# regex=False matches "aug" as a plain substring; no regex features are needed.
df_processed[
    df_processed["id"].str.contains("aug", na=False, regex=False)
]

Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
616,2016-12-26,,,,,,,,,,...,10004_aug_1,Norway_spruce,,12,2016,1,-2.449294e-16,1.000000e+00,,0
617,2017-01-09,36.313282,117.654502,94.030435,227.770011,752.487143,1149.826260,1222.180474,1336.526119,276.750766,...,10004_aug_1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
618,2017-01-23,55.689641,139.982201,104.750511,249.457611,807.801039,1147.995634,1232.226424,1332.508676,319.840374,...,10004_aug_1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
619,2017-02-06,75.065999,162.309900,115.470586,271.145211,863.114935,1146.165008,1242.272375,1328.491233,362.929982,...,10004_aug_1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
620,2017-02-20,94.442358,184.637600,126.190662,292.832810,918.428832,1144.334382,1252.318325,1324.473790,406.019590,...,10004_aug_1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548311,2022-09-12,,,808.443462,,,,,,,...,999_aug_3,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1548312,2022-09-26,,,808.443462,,,,,,,...,999_aug_3,soil,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1548313,2022-10-10,,,808.443462,,,,,,,...,999_aug_3,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
1548314,2022-10-24,,,808.443462,,,,,,,...,999_aug_3,soil,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [6]:
# Optional cache of the pipeline result (disabled): uncomment the first line to
# write `df_processed` to disk, or the second to reload it instead of re-running
# the pipeline.
# NOTE(review): the file name says "notadjustlabels", but the pipeline cell in
# this notebook runs AdjustLabels(on=True) -- confirm which configuration this
# file is meant to hold before re-enabling.
# df_processed.to_csv("../../../data/processed/testset_processed_notadjustlabels.csv", index=False)
# df_processed = pd.read_csv("../../../data/processed/testset_processed_notadjustlabels.csv", parse_dates=["time"])

In [7]:
# Number of rows per species in the table loaded from PATH.
test_set.groupby(by="species").size()

species
Norway_spruce          126620
Norway_spruce_mixed    225330
Scots_pine             179670
beech                   40916
disturbed               92661
oak                     13731
soil                   105606
dtype: int64

In [8]:
# Number of rows per species in the pipeline output, for comparison with the
# counts of the unprocessed table.
df_processed.groupby(by="species").size()

species
Norway_spruce              174790
Norway_spruce_disturbed    140140
Norway_spruce_mixed        244706
Scots_pine                 251174
Scots_pine_disturbed       142604
beech                      147532
oak                        134442
soil                       186494
soil_disturbed             126434
dtype: int64