# Specify disturbed label

Split the generic `disturbed` label into `Scots_pine_disturbed` and `Norway_spruce_disturbed`.


## Packages and Data

In [14]:
import os
import sys

# Put the parent directory on the import path (presumably where the `utils`
# package imported in the next cell lives -- verify against the repo layout).
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

In [15]:
import pandas as pd
import numpy as np
from utils.constants import spectral_bands, indices
import matplotlib.pyplot as plt
from utils.data_loader import DataLoader
from utils.calculate_indices import CalculateIndices
from utils.preprocessing import Preprocessing

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Columns that get aggregated per id and month further down: the two
# collections imported from utils.constants, concatenated in this order.
bands_and_indices = spectral_bands + indices

In [16]:
# Load the training table, then run the loader's date- and feature-extraction
# steps on it. `df_base` keeps the untouched result of load_transform.
dataloader = DataLoader()
df_base = dataloader.load_transform("../../data/Trainings_Datensatz.csv")
df = dataloader.feature_extraction(dataloader.date_feature_extraction(df_base))

# Linear interpolation step from utils.preprocessing
# (NOTE(review): confirm which columns interpolate_b4 actually fills).
df = Preprocessing.interpolate_b4(df, method="linear")

In [17]:
# Extend df via CalculateIndices.add_all_indices (presumably one column per
# entry of `indices` -- verify in utils.calculate_indices).
# Note: this rebinds `df`, so re-running the cell operates on the extended frame.
calculateindices = CalculateIndices()
df = calculateindices.add_all_indices(df)

## Prepare Data

One row per ID, containing key figures.

For every band and index, one column per statistic and month, named `<band>_<stat>_<Month>` with `<stat>` in mean / std / min / max (e.g. `b2_mean_April`).

-> With this kind of data structure, a simple model is possible.

In [18]:
def calculate_keyfigures_per_id(df, bands_and_indices):
    """
    Build one feature row per 'id' from monthly summary statistics.

    Observations are grouped by 'id' and by the month name of 'time' (the year
    is ignored, so the same month of different years is pooled). For every
    column in ``bands_and_indices`` the mean, std, min and max are computed and
    spread into wide columns named ``<feature>_<stat>_<Month>``,
    e.g. ``b2_mean_April``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with the columns 'id', 'time' (datetime), 'species'
        and every column listed in ``bands_and_indices``. The input frame is
        not modified.
    bands_and_indices : list of str
        Names of the numeric columns to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per id with the columns 'id', all ``<feature>_<stat>_<Month>``
        features, and 'species' (the first non-null species of that id).
        A feature is NaN when the id has no observation in that month, and
        std is NaN when a month holds only a single observation.
    """
    df = df.copy()
    df["month"] = df["time"].dt.month_name()

    monthly_stats = df.groupby(["id", "month"])[bands_and_indices].agg(
        ["mean", "std", "min", "max"]
    )

    # Move only the month level into the columns. This keeps the month label
    # out of the value cells, so the result stays numeric and neither a
    # "month_*" clean-up nor a to_numeric pass (which would also coerce
    # non-numeric ids to NaN) is required afterwards.
    wide = monthly_stats.unstack("month")
    wide.columns = ["_".join(parts) for parts in wide.columns]
    df_train = wide.reset_index()

    # Attach one label per id; a left merge keeps exactly the aggregated ids.
    labels = df.groupby("id")["species"].first().reset_index()
    return df_train.merge(labels, on="id", how="left")

In [19]:
# Feature table for all ids (every species, including 'disturbed'):
# one row per id, monthly key figures as columns, 'species' as label.
df_train = calculate_keyfigures_per_id(df, bands_and_indices)
df_train

Unnamed: 0,id,b2_mean_April,b2_mean_August,b2_mean_December,b2_mean_February,b2_mean_January,b2_mean_July,b2_mean_June,b2_mean_March,b2_mean_May,...,dvi_max_February,dvi_max_January,dvi_max_July,dvi_max_June,dvi_max_March,dvi_max_May,dvi_max_November,dvi_max_October,dvi_max_September,species
0,1,185.809524,185.300000,55.000000,169.272727,84.333333,187.200000,291.083333,141.272727,181.357143,...,1248.0,971.0,2612.0,2908.0,1584.0,2185.0,1137.0,1561.0,1956.0,Norway_spruce_mixed
1,2,272.727273,278.791667,107.833333,284.000000,196.142857,247.850000,248.210526,237.047619,285.088235,...,1649.0,1489.0,2268.0,2157.0,1592.0,2260.0,1605.0,1680.0,1888.0,Scots_pine
2,3,472.411765,351.210526,550.166667,257.500000,561.000000,382.642857,420.340909,362.900000,468.294118,...,1170.0,1112.0,4744.0,4638.0,1336.0,3189.0,1753.0,4367.0,4861.0,soil
3,4,258.150000,378.269231,92.500000,321.083333,263.625000,297.035714,284.444444,304.558824,248.846154,...,1700.0,1157.0,2326.0,2144.0,1876.0,1851.0,1527.0,2074.0,2010.5,Scots_pine
4,5,657.550000,790.826087,392.000000,497.937500,493.000000,459.129630,377.928571,633.184211,673.692308,...,2123.0,558.0,5731.0,5200.0,2904.0,4550.0,3575.0,4518.0,4530.0,soil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28207,28208,519.023810,435.000000,454.750000,527.153846,380.000000,481.540000,288.181818,688.315789,401.444444,...,1879.0,617.0,3994.0,4494.0,2056.0,4120.0,1880.0,3398.0,4347.0,soil
28208,28209,222.250000,309.687500,333.250000,184.500000,,255.178571,201.882353,215.789474,221.176471,...,1622.0,,1758.0,1848.0,1491.0,1847.0,1598.0,1699.0,1958.0,disturbed
28209,28210,526.454545,466.230769,329.333333,454.833333,,480.500000,427.375000,512.700000,412.714286,...,2420.0,,3558.0,4760.0,1988.0,4271.0,2034.0,3699.0,3332.0,soil
28210,28211,180.166667,171.380952,79.750000,126.727273,,187.652174,224.181818,180.315789,238.846154,...,1808.0,,2915.0,2682.0,1851.0,2254.0,1723.0,1926.0,2145.0,Norway_spruce


## 1. Specify disturbed label

### Analysis

In [20]:
# How many ids are labelled "disturbed" but still have "healthy" observations,
# i.e. observations from a year before their disturbance year?
# -> For ids without any healthy data the original species is hard to infer.
# Idea: label the disturbed ids that do have healthy data first, then use their
# post-disturbance data to help with the ids that have no healthy data.

df_disturbed = df.loc[df["species"].eq("disturbed")]

# Rows observed strictly before the year of disturbance.
df_disturbed_before_disturbed = df_disturbed.loc[
    lambda d: d["time"].dt.year.lt(d["disturbance_year"])
]

# Rows of ids that have no pre-disturbance observation at all.
df_only_disturbed_data = df_disturbed.loc[
    lambda d: ~d["id"].isin(df_disturbed_before_disturbed["id"])
]

print(df_disturbed["id"].nunique())
print(df_disturbed_before_disturbed["id"].nunique())
# The difference (3444 - 2814 = 630) is the number of ids whose disturbance
# year is <= the first year of data, i.e. no healthy data is available.

3444
2814


In [21]:
# Training data for the species classifier: only ids labelled as one of the two
# healthy species. 'id' is dropped because it is an identifier, not a feature.
df_spruce_pine = (
    df_train[df_train["species"].isin(["Norway_spruce", "Scots_pine"])]
    .drop(columns="id")
    # Encode the target as a genuine integer column. Assigning the mapped
    # values through `.loc[:, "species"] = ...` writes into the existing
    # object column under pandas >= 2.0, leaving the labels as object dtype.
    # The isin filter above guarantees every value is mapped, so astype(int)
    # cannot hit a NaN.
    .assign(
        species=lambda d: d["species"]
        .map({"Norway_spruce": 0, "Scots_pine": 1})
        .astype(int)
    )
)

# Ids labelled 'disturbed' (species to be inferred); keeps the 'id' column.
df_ill = df_train[df_train["species"] == "disturbed"]

In [22]:
# Correlation of every feature column with the 0/1 species label, ranked by
# absolute value (strongest linear relationship first).
numeric_cols = df_spruce_pine.columns.drop("species")
species_target = df_spruce_pine["species"]

correlation = df_spruce_pine[numeric_cols].corrwith(species_target)
correlation_sorted = correlation.abs().sort_values(ascending=False)

correlation_sorted

b2_min_March        0.746063
b6_mean_October     0.742708
b3_min_March        0.733436
b6_mean_February    0.730664
b2_min_August       0.726976
                      ...   
b2_std_May          0.002991
b8_std_January      0.002883
b8a_std_January     0.002300
dvi_std_March       0.001486
mcari_min_July      0.000632
Length: 960, dtype: float64

### Model

In [23]:
X = df_spruce_pine.drop(columns="species")
# Cast the 0/1 labels once, before splitting, so that y_train and y_test share
# the same integer dtype (previously only y_test was cast after the split).
y = df_spruce_pine["species"].astype(int)

# Stratified 70/30 split with a fixed seed so the evaluation is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# Probability of class 1 (Scots_pine in the label encoding used for 'species').
y_prob = xgb_model.predict_proba(X_test)[:, 1]

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[1444    9]
 [   6 1833]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1453
           1       1.00      1.00      1.00      1839

    accuracy                           1.00      3292
   macro avg       1.00      1.00      1.00      3292
weighted avg       1.00      1.00      1.00      3292



### Apply model on disturbed ids

In [24]:
# Build the same per-id key figures, but only from observations made before
# the disturbance, i.e. while the tree was presumably still healthy.
df_disturbed_prepared = calculate_keyfigures_per_id(
    df_disturbed_before_disturbed, bands_and_indices
)

ids = df_disturbed_prepared["id"]

# Align the features to the training schema instead of merely dropping
# 'species' and 'id': the month columns produced above depend on which months
# occur in the pre-disturbance subset. reindex enforces the same columns in the
# same order as during training; a month that never occurs becomes NaN, which
# XGBoost treats as a missing value.
X_disturbed = df_disturbed_prepared.reindex(columns=X_train.columns)

# Apply model
y_pred_disturbed_class = xgb_model.predict(X_disturbed)

df_disturbed_labels = pd.DataFrame({"id": ids, "species": y_pred_disturbed_class})

# Same 0/1 encoding as used for training, with the '_disturbed' suffix added.
label_map = {0: "Norway_spruce_disturbed", 1: "Scots_pine_disturbed"}

df_disturbed_labels["species"] = df_disturbed_labels["species"].map(label_map)
df_disturbed_labels

Unnamed: 0,id,species
0,6,Norway_spruce_disturbed
1,54,Scots_pine_disturbed
2,63,Norway_spruce_disturbed
3,65,Scots_pine_disturbed
4,78,Norway_spruce_disturbed
...,...,...
2809,28139,Norway_spruce_disturbed
2810,28189,Norway_spruce_disturbed
2811,28196,Norway_spruce_disturbed
2812,28205,Scots_pine_disturbed
