In [None]:
import pandas as pd

In [1]:
from pipelines.processing.processing_pipeline import ProcessingPipeline

from pipelines.processing.features.spectral_indices import CalculateIndices
from pipelines.processing.features.basic_features import BasicFeatures
from pipelines.processing.features.temporal_features import TemporalFeatures

from pipelines.processing.processing_steps.interpolation import Interpolation
from pipelines.processing.processing_steps.data_augmentation import DataAugmentation
from pipelines.processing.processing_steps.adjust_labels import AdjustLabels
from pipelines.processing.processing_steps.aggregation import TimeSeriesAggregate
from pipelines.processing.processing_steps.interpolate_nans import InterpolateNaNs
from pipelines.processing.processing_steps.smoothing import Smooth

from pipelines.processing.data_reduction.old_disturbance_pruner import (
    OldDisturbancePruner,
)
from pipelines.processing.data_reduction.detect_disturbed_trees import (
    DetectDisturbedTrees,
)
from pipelines.processing.data_reduction.timeseries_filter import TimeSeriesFilter

import pandas as pd

PATH = "../../../data/processed/trainset.csv"

In [2]:
test_set = pd.read_csv(PATH, parse_dates=["time"])

In [3]:
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=False, method="linear"),
    Interpolation(on=True),
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
]

In [4]:
threshold = 150  # ids with size <150 will be augmented

train_steps = [
    TimeSeriesFilter(on=True),
    BasicFeatures(on=True),
    OldDisturbancePruner(on=True),
    CalculateIndices(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=True),
    DataAugmentation(on=False, threshold=threshold),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=False, method="quadratic"),
    Smooth(on=False),
    Interpolation(on=True),
    CalculateIndices(on=True),  # Second time because of augmentation
    TemporalFeatures(on=True),
]

pipeline = ProcessingPipeline(path=PATH, steps=train_steps)

df_processed = pipeline.run()
df_processed.describe()

7599 ids have been removed due to predicted disturbance


ValueError: You are trying to merge on object and float64 columns for key 'id'. If you wish to proceed you should use pd.concat

In [None]:
df_processed[df_processed["id"].str.contains("aug")]

Unnamed: 0,time,b2,b3,b4,b5,b6,b7,b8,b8a,b11,...,b8a_smooth,b11_smooth,b12_smooth,month_num,year,season,month_sin,month_cos,date_diff,is_growing_season
616,2016-12-26,,,,,,,,,,...,,,,12,2016,1,-2.449294e-16,1.000000e+00,,0
617,2017-01-09,-677.271849,-413.821854,-415.245594,-439.261008,-290.590671,-46.626412,-137.752327,-191.414628,-205.180896,...,-191.414628,-205.180896,-15.277812,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
618,2017-01-23,-476.862842,-249.333666,-272.880578,-254.324632,19.494733,279.472429,229.395476,218.839697,-48.862601,...,13.712535,-127.021749,19.156994,1,2017,1,5.000000e-01,8.660254e-01,14.0,0
619,2017-02-06,-276.453834,-84.845478,-130.515562,-69.388255,329.580136,605.571269,596.543279,629.094023,107.455694,...,218.839697,-48.862601,53.591799,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
620,2017-02-20,-76.044826,79.642710,11.849454,115.548121,639.665540,931.670110,963.691082,1039.348348,263.773990,...,629.094023,107.455694,122.461411,2,2017,1,8.660254e-01,5.000000e-01,14.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14163,2022-09-12,267.199120,377.182200,289.451819,592.879528,1473.918935,1588.164249,1713.235195,1898.370647,873.826452,...,2109.934102,970.382391,457.582854,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
14164,2022-09-26,275.304897,366.965763,303.148368,580.803070,1321.657143,1401.342615,1507.774598,1686.807193,777.270514,...,1898.370647,873.826452,429.415645,9,2022,4,-1.000000e+00,-1.836970e-16,14.0,1
14165,2022-10-10,283.410674,356.749326,316.844917,568.726613,1169.395351,1214.520981,1302.314000,1475.243738,680.714575,...,1686.807193,777.270514,401.248437,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1
14166,2022-10-24,291.516451,346.532889,330.541466,556.650155,1017.133559,1027.699348,1096.853403,1263.680283,584.158636,...,1475.243738,680.714575,373.081228,10,2022,4,-8.660254e-01,5.000000e-01,14.0,1


In [6]:
# df_processed.to_csv("../../../data/processed/trainset_processed.csv", index=False)
df_processed = pd.read_csv(
    "../../../data/processed/trainset_processed.csv", parse_dates=["time"]
)

FileNotFoundError: [Errno 2] No such file or directory: '../../../data/processed/trainset_processed.csv'

In [None]:
test_set.groupby("species").size()

In [None]:
df_processed.groupby("species").size()