In [1]:
from processing_pipeline import ProcessingPipeline

from features.spectral_indices import CalculateIndices
from features.basic_features import BasicFeatures
from features.temporal_features import TemporalFeatures

from processing_steps.interpolation import Interpolation
from processing_steps.data_augmentation import DataAugmentation
from processing_steps.adjust_labels import AdjustLabels
from processing_steps.aggregation import TimeSeriesAggregate
from processing_steps.interpolate_nans import InterpolateNaNs
from processing_steps.smoothing import Smooth

from data_reduction.old_disturbance_pruner import OldDisturbancePruner
from data_reduction.detect_disturbed_trees import DetectDisturbedTrees
from data_reduction.timeseries_filter import TimeSeriesFilter

import pandas as pd

# Input CSV used by the cells below: it is read directly into `test_set` and is
# also the `path` handed to ProcessingPipeline together with `train_steps`.
# NOTE(review): the filename says "testset", yet the training pipeline consumes
# the same path — confirm the training run is really meant to read this file.
PATH = "../../../data/preprocessed/testset.csv"

In [16]:
# Read the CSV at PATH into a DataFrame, parsing the "time" column as datetimes.
test_set = pd.read_csv(
    PATH,
    parse_dates=["time"],
)

In [None]:
# Step objects intended for the test set.
# NOTE(review): this list is not passed to a ProcessingPipeline in the cells
# visible here, and the cell has no execution count — confirm where it is used.
# NOTE(review): unlike `train_steps`, there is no Smooth step in this list —
# confirm downstream consumers do not expect the `*_smooth` columns.
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(
        on=True,
        freq=2,
        method="mean",
    ),
    InterpolateNaNs(
        on=True,
        method="quadratic",
    ),
    Interpolation(on=True),
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
]

In [3]:
# Augmentation cut-off passed to DataAugmentation; per the original author's
# note, ids with size < 150 will be augmented.
threshold = 150

# Step objects for the training run, listed in the order they are handed to
# ProcessingPipeline.
train_steps = [
    TimeSeriesFilter(on=True),
    BasicFeatures(on=True),
    OldDisturbancePruner(on=True),
    CalculateIndices(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=True),
    DataAugmentation(
        on=True,
        threshold=threshold,
    ),
    TimeSeriesAggregate(
        on=True,
        freq=2,
        method="mean",
    ),
    InterpolateNaNs(
        on=True,
        method="quadratic",
    ),
    Smooth(on=True),
    Interpolation(on=True),
    # CalculateIndices appears a second time because of the augmentation step.
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
]

# NOTE(review): PATH is the same file the test-set cell loads — confirm it is
# the intended input for the training steps.
pipeline = ProcessingPipeline(
    path=PATH,
    steps=train_steps,
)

# Run the pipeline and display the resulting frame as the cell output.
df_processed = pipeline.run()
df_processed

0 ids have been removed due to predicted disturbance
Target number of IDs per species: 810


Augmenting species: 100%|██████████| 9/9 [05:51<00:00, 39.08s/it]


Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,b8a_smooth,b11_smooth,b12_smooth,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
0,2016-12-26,,,,,,,,,,...,,,,12,2016,1,-2.449294e-16,1.000000e+00,,0
1,2017-01-09,,,,,,,,,,...,,,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
2,2017-01-23,,,,,,,,,,...,,,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
3,2017-02-06,,,,,,,,,,...,,,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
4,2017-02-20,,,,,,,,,,...,,,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548311,2022-09-12,,,806.828496,,,,,,,...,2896.792008,2455.380492,1435.595497,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1548312,2022-09-26,,,806.828496,,,,,,,...,2803.744263,2407.457697,1299.984563,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
1548313,2022-10-10,,,806.828496,,,,,,,...,,,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
1548314,2022-10-24,,,806.828496,,,,,,,...,,,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [5]:
# Show only the rows whose "id" string contains "aug".
df_processed.loc[lambda frame: frame["id"].str.contains("aug")]

Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
1078,2016-12-26,,,,,,,,,,...,10013_aug_1,Norway_spruce,,12,2016,1,-2.449294e-16,1.000000e+00,,0
1079,2017-01-09,138.303437,180.141957,77.915287,379.691490,968.459647,1190.129565,1610.312327,1508.636431,451.685955,...,10013_aug_1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
1080,2017-01-23,134.875416,198.455386,99.868836,380.018263,1014.348015,1226.747991,1611.490484,1507.744777,451.664576,...,10013_aug_1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
1081,2017-02-06,131.447394,216.768816,121.822385,380.345036,1060.236382,1263.366416,1612.668642,1506.853123,451.643196,...,10013_aug_1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
1082,2017-02-20,128.019373,235.082245,143.775935,380.671809,1106.124749,1299.984842,1613.846799,1505.961469,451.621817,...,10013_aug_1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5236149,2022-09-12,706.421766,827.074650,1045.540397,1248.234006,1456.734187,1711.972047,1973.093499,2027.656519,2421.268337,...,9996_aug_3,Norway_spruce_disturbed,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
5236150,2022-09-26,724.204336,809.645045,965.655459,1122.206712,1415.695604,1678.315506,1873.031400,1897.487421,2233.797898,...,9996_aug_3,Norway_spruce_disturbed,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
5236151,2022-10-10,553.208653,683.598834,780.959591,964.693487,1235.096021,1376.723941,1581.428246,1579.112108,2011.583542,...,9996_aug_3,Norway_spruce_disturbed,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
5236152,2022-10-24,476.062012,634.233187,692.904500,849.657452,1079.862200,1187.980764,1448.344143,1401.789803,1519.491493,...,9996_aug_3,Norway_spruce_disturbed,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [12]:
# On-disk copy of the processed training frame. Reloading it avoids re-running
# the pipeline cell (its saved progress bar shows close to six minutes).
PROCESSED_PATH = "../../../data/processed/trainset_processed.csv"

try:
    # Normal case: the processed CSV already exists, so just load it.
    df_processed = pd.read_csv(PROCESSED_PATH, parse_dates=["time"])
except FileNotFoundError:
    # No saved copy yet: persist the in-memory `df_processed` produced by the
    # pipeline cell instead of relying on a manually un-commented write, then
    # reload it so the frame is the CSV round-tripped version either way.
    # Raises NameError if the pipeline cell has not been run in this kernel.
    df_processed.to_csv(PROCESSED_PATH, index=False)
    df_processed = pd.read_csv(PROCESSED_PATH, parse_dates=["time"])

In [7]:
# Number of rows per "species" value in the test set.
test_set.groupby(by="species").size()

species
Norway_spruce          428955
Norway_spruce_mixed    776877
Scots_pine             645680
beech                  156941
disturbed              327678
oak                     50976
soil                   365678
dtype: int64

In [14]:
# Number of rows per "species" value in the processed training frame.
# NOTE(review): the saved outputs show a single "disturbed" label in the test
# set but per-species "*_disturbed" labels here — confirm the label sets are
# reconciled before evaluating on the test set.
df_processed.groupby(by="species").size()

species
Norway_spruce              578578
Norway_spruce_disturbed    471548
Norway_spruce_mixed        833910
Scots_pine                 865018
Scots_pine_disturbed       478632
beech                      503888
oak                        450450
soil                       630784
soil_disturbed             423500
dtype: int64