In [1]:
from pipelines.processing.processing_pipeline import ProcessingPipeline

from pipelines.processing.features.spectral_indices import CalculateIndices
from pipelines.processing.features.basic_features import BasicFeatures
from pipelines.processing.features.temporal_features import TemporalFeatures

from pipelines.processing.processing_steps.interpolation import Interpolation
from pipelines.processing.processing_steps.data_augmentation import DataAugmentation
from pipelines.processing.processing_steps.adjust_labels import AdjustLabels
from pipelines.processing.processing_steps.aggregation import TimeSeriesAggregate
from pipelines.processing.processing_steps.interpolate_nans import InterpolateNaNs

from pipelines.processing.data_reduction.old_disturbance_pruner import OldDisturbancePruner
from pipelines.processing.data_reduction.detect_disturbed_trees import DetectDisturbedTrees
from pipelines.processing.data_reduction.timeseries_filter import TimeSeriesFilter

import pandas as pd

# Relative path of the input CSV (resolved against the kernel's working directory).
# Used below both by pd.read_csv and as the `path` argument of ProcessingPipeline.
PATH = "../../../data/preprocessed/trainset.csv"

In [2]:
# Load the raw CSV at PATH, parsing the "time" column as datetimes; used further
# down for per-species row counts of the unprocessed data.
# NOTE(review): the variable is named `test_set`, but PATH points at trainset.csv —
# confirm which split is intended, or rename consistently across the notebook.
test_set = pd.read_csv(PATH, parse_dates=["time"])

In [3]:
# Step list presumably meant for a test-set run: it contains the feature,
# aggregation and interpolation steps that also appear in `train_steps`, but none
# of the filtering, pruning, disturbance-detection, label-adjustment or
# augmentation steps.
# NOTE(review): no visible cell passes `test_steps` to a ProcessingPipeline —
# confirm it is still needed.
test_steps = [
    # Every step is switched on; only the extra keyword arguments differ per step.
    step_class(on=True, **extra_kwargs)
    for step_class, extra_kwargs in (
        (BasicFeatures, {}),
        (TimeSeriesAggregate, {"freq": 2, "method": "mean"}),
        (InterpolateNaNs, {"method": "quadratic"}),
        (CalculateIndices, {}),
        (TemporalFeatures, {}),
        (Interpolation, {}),
    )
]

In [4]:
# Handed to DataAugmentation below; per the original author's note, ids with
# size < 150 will be augmented.
threshold = 150

# Ordered step configuration for the training run; the pipeline receives the
# steps in exactly this order.
train_steps = [
    TimeSeriesFilter(on=True),
    BasicFeatures(on=True),
    OldDisturbancePruner(on=True),
    CalculateIndices(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=True),
    DataAugmentation(on=True, threshold=threshold),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=True, method="quadratic"),
    # Indices are calculated a second time because augmentation ran in between.
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
    Interpolation(on=True),
]

# Build the pipeline on the CSV at PATH and execute every configured step.
pipeline = ProcessingPipeline(steps=train_steps, path=PATH)
df_processed = pipeline.run()

# Bare last expression: show the processed frame as the cell output.
df_processed

0 ids have been removed due to predicted disturbance
Target number of IDs per species: 35


Augmenting species: 100%|██████████| 8/8 [00:05<00:00,  1.43it/s]


Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
0,2016-12-26,,,,,,,,,,...,101,soil,0.0,12,2016,1,-2.449294e-16,1.000000e+00,,0
1,2017-01-09,,,,,,,,,,...,101,soil,0.0,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
2,2017-01-23,,,,,,,,,,...,101,soil,0.0,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
3,2017-02-06,,,,,,,,,,...,101,soil,0.0,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
4,2017-02-20,732.000000,1035.000000,1476.000000,1668.000000,1994.000000,2169.000000,2445.000000,2488.000000,1701.000000,...,101,soil,0.0,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65137,2022-09-12,267.255586,381.795602,291.640830,593.185901,1434.677584,1620.975596,1713.091766,1892.502258,856.742639,...,99,Norway_spruce_mixed,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
65138,2022-09-26,275.563454,371.762036,305.528323,581.676834,1282.632866,1434.245113,1504.570156,1680.245747,760.424979,...,99,Norway_spruce_mixed,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
65139,2022-10-10,283.871322,361.728470,319.415815,570.167766,1130.588147,1247.514629,1296.048545,1467.989236,664.107319,...,99,Norway_spruce_mixed,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
65140,2022-10-24,292.179190,351.694904,333.303308,558.658699,978.543429,1060.784145,1087.526935,1255.732726,567.789660,...,99,Norway_spruce_mixed,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [5]:
# Show only the rows whose id carries the "aug" marker (e.g. "113_aug_1").
# The id column is cast to str first: raw ids look numeric (101, 99, ...), so
# without the cast `.str` raises on a purely numeric column and yields NaN mask
# entries (which cannot be used for boolean indexing) on a mixed int/str column.
# regex=False makes the match a plain substring test rather than a regex search.
is_augmented = df_processed["id"].astype(str).str.contains("aug", regex=False)
df_processed[is_augmented]

Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,id,species,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
1386,2016-12-26,,,,,,,,,,...,113_aug_1,Norway_spruce,,12,2016,1,-2.449294e-16,1.000000e+00,,0
1387,2017-01-09,337.569513,546.272348,418.453797,611.124742,574.114295,995.882050,763.934724,1264.951566,664.585443,...,113_aug_1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
1388,2017-01-23,319.711142,522.407387,398.031167,598.624668,656.698935,1077.072106,862.624162,1332.450759,670.682934,...,113_aug_1,Norway_spruce,,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
1389,2017-02-06,301.852772,498.542426,377.608538,586.124595,739.283576,1158.262163,961.313601,1399.949952,676.780425,...,113_aug_1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
1390,2017-02-20,283.994402,474.677466,357.185908,573.624521,821.868216,1239.452220,1060.003040,1467.449145,682.877916,...,113_aug_1,Norway_spruce,,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64983,2022-09-12,,,352.611774,,,,,,,...,96_aug_4,Scots_pine,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
64984,2022-09-26,,,352.611774,,,,,,,...,96_aug_4,Scots_pine,,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
64985,2022-10-10,,,352.611774,,,,,,,...,96_aug_4,Scots_pine,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
64986,2022-10-24,,,352.611774,,,,,,,...,96_aug_4,Scots_pine,,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [6]:
# Summary statistics of the processed frame, used as a quick sanity check.
# NOTE(review): the recorded output shows negative minima for the band columns
# (e.g. b2 min is about -2989) and maxima above 15000 — possibly overshoot from the
# quadratic NaN interpolation; confirm whether values should be clipped.
df_processed.describe()

Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,mtci,rendvi,disturbance_year,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
count,65142,63648.0,63648.0,64327.0,63648.0,63648.0,63648.0,63648.0,63648.0,63648.0,...,63648.0,63648.0,22022.0,65142.0,65142.0,65142.0,65142.0,65142.0,64719.0,65142.0
mean,2019-12-02 00:00:00,291.696874,451.458746,383.706139,710.975779,1661.550603,2002.911176,2149.058299,2264.59596,1214.466002,...,1114.262561,0.476459,169.370629,6.402597,2019.428571,2.512987,-0.0006368585,-0.02185731,14.0,0.603896
min,2016-12-26 00:00:00,-2989.023014,-2694.218184,-2495.583669,-1675.278897,-8599.039151,-11401.927539,-9970.429419,-10534.340849,-1595.093555,...,-10594.731097,-434.230085,0.0,1.0,2016.0,1.0,-1.0,-1.0,14.0,0.0
25%,2018-06-11 00:00:00,160.0,284.008491,181.0,465.426495,1221.380361,1463.846135,1570.019056,1681.883995,727.5,...,742.149099,0.388322,0.0,3.0,2018.0,2.0,-0.8660254,-0.5,14.0,0.0
50%,2019-12-02 00:00:00,231.983509,383.557592,274.258991,614.003834,1545.164353,1840.001342,1972.058467,2094.544188,1091.0,...,1025.170313,0.513212,0.0,6.0,2019.0,2.5,1.224647e-16,-1.83697e-16,14.0,1.0
75%,2021-05-24 00:00:00,347.928192,544.710303,470.572096,884.224546,2007.325295,2411.715281,2586.1932,2722.520175,1599.672772,...,1364.877326,0.593093,0.0,9.0,2021.0,3.0,0.8660254,0.5,14.0,1.0
max,2022-11-07 00:00:00,16501.536788,15081.100096,15220.942611,15348.910448,13800.200969,13368.591492,12046.345249,12893.243848,10523.573729,...,7297.064966,78.6858,2020.0,12.0,2022.0,4.0,1.0,1.0,14.0,1.0
std,,272.108659,307.417709,362.607473,400.135392,709.819759,891.159578,936.676641,929.82882,649.441848,...,630.579167,1.841667,559.619794,3.391228,1.705075,1.100404,0.7230041,0.6905087,0.0,0.48909


In [7]:
# Disabled export / reload of the processed frame, kept for manual use.
# NOTE(review): the file name says "testset" and "notadjustlabels", but this
# notebook reads trainset.csv and runs AdjustLabels(on=True) — confirm the target
# file name before re-enabling, to avoid overwriting a different artefact.
# df_processed.to_csv("../../../data/processed/testset_processed_notadjustlabels.csv", index=False)
# df_processed = pd.read_csv("../../../data/processed/testset_processed_notadjustlabels.csv", parse_dates=["time"])

In [8]:
# Number of rows per species in the raw frame loaded from PATH (before any pipeline step).
test_set.groupby("species").size()

species
Norway_spruce           4842
Norway_spruce_mixed    12068
Scots_pine              9148
beech                   1923
disturbed               4973
oak                      602
soil                    6149
dtype: int64

In [9]:
# Number of rows per species after the pipeline run, for comparison with the raw counts;
# the recorded output additionally contains "*_disturbed" classes.
df_processed.groupby("species").size()

species
Norway_spruce               7392
Norway_spruce_disturbed     5698
Norway_spruce_mixed        12320
Scots_pine                 11550
Scots_pine_disturbed        6776
beech                       6622
oak                         5852
soil                        8778
soil_disturbed               154
dtype: int64