# Data Reduction Disturbed Trees

In [7]:
import pandas as pd
from data_reduction_utils.dist_functions import (
    evaluate_time_windows,
    prepare_dist_df,
    prepare_df,
    plot_distribution,
)
from pipelines.processing.data_reduction.old_disturbance_pruner import (
    OldDisturbancePruner,
)
from general_utils.constants import spectral_bands

In [2]:
# Load the raw training set; "time" is parsed as datetime so the .dt accessor works below.
df_base = pd.read_csv("../../../data/raw/raw_trainset.csv", parse_dates=["time"])

# Run the old-disturbance pruning step, then discard every row labelled "soil".
dist_bruner = OldDisturbancePruner()
df = dist_bruner.run(df_base)
df = df.loc[df["species"] != "soil"]

# Independent frame restricted to rows with a non-zero disturbance year, extended with:
#   year              - calendar year of the observation timestamp
#   relative_year     - observation year minus disturbance year
#   species_disturbed - 1 where species == "disturbed", otherwise 0
dist_df = df.loc[df["disturbance_year"] != 0].assign(
    year=lambda frame: frame["time"].dt.year,
    relative_year=lambda frame: frame["year"] - frame["disturbance_year"],
    species_disturbed=lambda frame: frame["species"].eq("disturbed").astype(int),
)

In [3]:
# Plot dist_df, i.e. the rows with a non-zero disturbance_year plus the derived
# year / relative_year / species_disturbed columns.
# NOTE(review): what exactly plot_distribution draws is defined in
# data_reduction_utils.dist_functions and is not visible here — confirm.
plot_distribution(dist_df)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GroupShuffleSplit
import plotly.express as px

# Evaluate how well a small random forest separates "disturbed" rows from all
# other rows when only observations within +/- N years of the disturbance year
# are used, for N = 0..4.
# NOTE(review): this rebinds `df` from the raw CSV, so the pruning and "soil"
# filtering applied to `df` in the earlier cell are not in effect here —
# confirm that this is intended.
df = pd.read_csv("../../../data/raw/raw_trainset.csv", parse_dates=["time"])
df["species_disturbed"] = df["species"].eq("disturbed").astype(int)
df["year"] = df["time"].dt.year
df["relative_year"] = df["year"] - df["disturbance_year"]

# One grouped 80/20 split per window; grouping by "id" keeps all rows sharing an
# id on the same side of the split. The integer seed makes every split() call
# reproducible, so a single splitter instance can serve all windows.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

results = []
for half_width in range(5):
    # Keep rows whose relative_year lies in [-half_width, half_width] (inclusive).
    window_df = df[df["relative_year"].between(-half_width, half_width)]
    features = window_df[spectral_bands]
    target = window_df["species_disturbed"]

    train_rows, test_rows = next(
        splitter.split(features, target, groups=window_df["id"])
    )

    # Fresh, seeded 10-tree forest per window; class weights balance the labels.
    forest = RandomForestClassifier(
        n_estimators=10, class_weight="balanced", random_state=42
    )
    forest.fit(features.iloc[train_rows], target.iloc[train_rows])

    score = balanced_accuracy_score(
        target.iloc[test_rows], forest.predict(features.iloc[test_rows])
    )
    results.append(
        {
            "years_around_event": f"{-half_width}..{half_width}",
            "balanced_accuracy": score,
        }
    )

# Line chart of balanced accuracy against the window label.
results_df = pd.DataFrame(results)
axis_labels = {
    "years_around_event": "Time Window (Years)",
    "balanced_accuracy": "Balanced Accuracy",
}
fig = px.line(
    results_df,
    x="years_around_event",
    y="balanced_accuracy",
    markers=True,
    title="Balanced Accuracy for Different Time Windows Around Disturbance",
    labels=axis_labels,
)
fig.show()
