In [1]:
import pandas as pd

In [2]:
from pipelines.processing.processing_pipeline import ProcessingPipeline

from pipelines.processing.features.spectral_indices import CalculateIndices
from pipelines.processing.features.basic_features import BasicFeatures
from pipelines.processing.features.temporal_features import TemporalFeatures

from pipelines.processing.processing_steps.interpolation import Interpolation
from pipelines.processing.processing_steps.data_augmentation import DataAugmentation
from pipelines.processing.processing_steps.adjust_labels import AdjustLabels
from pipelines.processing.processing_steps.aggregation import TimeSeriesAggregate
from pipelines.processing.processing_steps.interpolate_nans import InterpolateNaNs
from pipelines.processing.processing_steps.smoothing import Smooth

from pipelines.processing.data_reduction.old_disturbance_pruner import (
    OldDisturbancePruner,
)
from pipelines.processing.data_reduction.detect_disturbed_trees import (
    DetectDisturbedTrees,
)
from pipelines.processing.data_reduction.timeseries_filter import TimeSeriesFilter

import pandas as pd

# Relative path to the raw training-split CSV; read into `train_raw` and handed to
# ProcessingPipeline in the cells below.
PATH = "../../../data/raw/splits/trainset.csv"

In [3]:
# Load the unprocessed training split, parsing the "time" column as datetimes.
# The frame is kept around for the unique-id count at the end of the notebook.
train_raw = pd.read_csv(PATH, parse_dates=["time"])

In [None]:
# Step list named for test-time processing, kept in list order.
# Every step is constructed with an `on` flag; here only InterpolateNaNs has on=False
# (presumably the pipeline skips steps whose flag is off — TODO confirm).
# NOTE(review): no visible cell passes this list to a pipeline before `test_steps`
# is rebound in the modelling cell further down.
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(on=True, freq=2, method="mean"),
    InterpolateNaNs(on=False, method="linear"),
    Interpolation(on=True),
    CalculateIndices(on=True),
    TemporalFeatures(on=True),
]

In [None]:
# Cut-off passed to DataAugmentation below (that step is constructed with on=False in this cell).
threshold = 150  # ids with size <150 will be augmented

# Training step list, in the order handed to ProcessingPipeline.
# Only TimeSeriesFilter is constructed with on=True; every other step has on=False.
train_steps = [
    TimeSeriesFilter(on=True, max_median_diff_days=23),
    BasicFeatures(on=False),
    OldDisturbancePruner(on=False),
    CalculateIndices(on=False),
    DetectDisturbedTrees(on=False),
    AdjustLabels(on=False),
    DataAugmentation(on=False, threshold=threshold),
    TimeSeriesAggregate(on=False, freq=2, method="mean"),
    InterpolateNaNs(on=False, method="quadratic"),
    Smooth(on=False),
    Interpolation(on=False),
    CalculateIndices(on=False), # Listed a second time because of the augmentation step
    TemporalFeatures(on=False),  
]

# Build the pipeline on the training CSV with the step list above.
pipeline = ProcessingPipeline(path=PATH, steps=train_steps)

df_processed = pipeline.run()
# Last expression of the cell: summary statistics of the pipeline output.
df_processed.describe()

In [6]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.preprocessing import LabelEncoder
from pathlib import Path

from pipelines.preprocessing.run_preprocessing_pipeline import run_preprocessing_pipeline
from pipelines.processing.processing_pipeline import ProcessingPipeline
from pipelines.processing.features.spectral_indices import CalculateIndices
from pipelines.processing.features.basic_features import BasicFeatures
from pipelines.processing.features.temporal_features import TemporalFeatures
from pipelines.processing.processing_steps.interpolation import Interpolation

from models.baseline_model.baseline_model_utils import drop_unwanted_columns, split_into_X_y, evaluate_model
from models.baseline_model.calculate_keyfigures import StatisticalFeatures
from general_utils.constants import spectral_bands, indices

# Concatenation of the project constants `spectral_bands` and `indices`; passed to
# calculate_keyfigures_per_id further down as the columns argument.
bands_and_indices = spectral_bands + indices 

# Absolute form of the directory the kernel was started in.
BASE_DIR = Path().resolve()
# Folder holding the raw split CSVs, expressed relative to BASE_DIR.
SPLITS_PATH = BASE_DIR.joinpath("../../../data/raw/splits")

# One CSV per split.
PATH_TRAIN = SPLITS_PATH.joinpath("trainset.csv")
PATH_TEST = SPLITS_PATH.joinpath("testset.csv")
PATH_VAL = SPLITS_PATH.joinpath("valset.csv")

# Define processing pipeline steps.
# `test_steps` is used for both the test and the validation split below; only
# BasicFeatures and CalculateIndices are constructed with on=True.
test_steps = [
    BasicFeatures(on=True),
    TimeSeriesAggregate(on=False, freq=2, method="mean"), 
    InterpolateNaNs(on=False, method="linear"), 
    Interpolation(on=False),  
    CalculateIndices(on=True),
    TemporalFeatures(on=False),
]


# Cut-off passed to DataAugmentation (constructed with on=False in this list).
threshold = 150
# Training step list, in the order handed to ProcessingPipeline.
# NOTE(review): the exploratory cell above uses max_median_diff_days=23 while this
# list uses 25 — confirm which value is intended.
train_steps = [
    TimeSeriesFilter(on=True, max_median_diff_days=25),
    BasicFeatures(on=True),
    OldDisturbancePruner(on=False),
    CalculateIndices(on=True),
    DetectDisturbedTrees(on=True),
    AdjustLabels(on=False),
    DataAugmentation(on=False, threshold=threshold),
    TimeSeriesAggregate(on=False, freq=2, method="mean"),
    InterpolateNaNs(on=False, method="quadratic"),
    Smooth(on=False),
    Interpolation(on=False),
    # NOTE(review): DataAugmentation is off here, yet this second CalculateIndices is on — confirm it is still needed.
    CalculateIndices(on=True), # Second time because of augmentation
    TemporalFeatures(on=False),  
]


# Run the processing pipeline once per split.
# The training split uses `train_steps`; the test and validation splits both use `test_steps`.
print("Running processing pipeline for training data...")
pipeline_train = ProcessingPipeline(path=PATH_TRAIN, steps=train_steps)
df_train = pipeline_train.run()

print("Running processing pipeline for test data...")
pipeline_test = ProcessingPipeline(path=PATH_TEST, steps=test_steps)
df_test = pipeline_test.run()

# The validation pipeline gets its own name so that `pipeline_test` keeps referring to
# the test-split pipeline after this cell has run (it used to be silently rebound here).
print("Running processing pipeline for val data...")
pipeline_val = ProcessingPipeline(path=PATH_VAL, steps=test_steps)
df_val = pipeline_val.run()


# Phase 1: strip the columns the baseline model does not use, split by split.
df_train = df_train.pipe(drop_unwanted_columns)
df_test = df_test.pipe(drop_unwanted_columns)
df_val = df_val.pipe(drop_unwanted_columns)

# Phase 2: compute per-id key figures over the band/index columns for each split,
# using one shared StatisticalFeatures instance.
sf = StatisticalFeatures()
df_train = df_train.pipe(sf.calculate_keyfigures_per_id, bands_and_indices)
df_test = df_test.pipe(sf.calculate_keyfigures_per_id, bands_and_indices)
df_val = df_val.pipe(sf.calculate_keyfigures_per_id, bands_and_indices)

# Fit the label encoder on the training species and store the integer codes.
le = LabelEncoder()
df_train["species_encoded"] = le.fit_transform(df_train["species"])

X_train, y_train, X_test = split_into_X_y(df_train, df_test)

# Baseline hyper-parameters, gathered in one mapping before the classifier is built.
# num_class is taken from the number of classes the encoder saw during fitting.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 10,
    "random_state": 42,
    "eval_metric": "mlogloss",
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
}
xgb_baseline_model = xgb.XGBClassifier(**xgb_params)

print("Training model...")
xgb_baseline_model.fit(X_train, y_train)

# Evaluate on the test split first.
evaluate_model(xgb_baseline_model, X_test, df_test, le)

# Then on the validation split, reusing the already-fitted encoder.
# NOTE(review): `species_encoded` is added to df_val but not to df_test before
# split_into_X_y — confirm the helper yields the same feature columns for both.
df_val["species_encoded"] = le.transform(df_val["species"])
_, _, X_val = split_into_X_y(df_train, df_val)
evaluate_model(xgb_baseline_model, X_val, df_val, le)

Running processing pipeline for training data...
(2752785, 15)
Confusion Matrix:
 [[2960   40]
 [  48  713]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      3000
           1       0.95      0.94      0.94       761

    accuracy                           0.98      3761
   macro avg       0.97      0.96      0.96      3761
weighted avg       0.98      0.98      0.98      3761

144 ids have been removed due to predicted disturbance
Running processing pipeline for test data...
(784534, 15)
Running processing pipeline for val data...
(389953, 15)
Training model...
Predicting on test data...

Confusion Matrix:
[[ 830  152    3    1    4    0    4]
 [  59 1541   20   19   15    4    4]
 [   0   15 1167    0    9    1    1]
 [   0   19    0  272    1    1    2]
 [  17   30   55    1  576    0    5]
 [   0    6    1    4    2   79    1]
 [   0    6    2    1    8    1  703]]

Classification Report:
              

In [7]:
# Number of distinct ids left in the processed training frame.
# The recorded output of this run is 19584; an earlier note here said 19623
# (presumably from a run with different step settings — TODO confirm).
df_train["id"].nunique()

19584

In [8]:
# Number of distinct ids in the unprocessed training split (recorded output: 19748).
train_raw["id"].nunique()

19748