# Tree Classification

In [1]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

# Put the notebook's parent directory on sys.path (only once) so that the
# local `utils` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
from utils.data_loader import DataLoader
from utils.calculate_indices import CalculateIndices
from utils.basic_analysis import BasicDataAnalysis
from utils.visualization_histogram import HistogramDataVisualization
from utils.time_series_aggregate import TimeSeriesAggregate
from utils.visualization_spectral import SpectralBandPlotter


from utils.data_loader import DataLoader
from utils.basic_analysis import BasicDataAnalysis

from utils.correlation_analysis import CorrelationAnalysis

from utils.time_series_aggregate import TimeSeriesAggregate


from utils.visualization_spectral import SpectralBandPlotter
from utils.visualization_histogram import HistogramDataVisualization
from utils.visualization_function import (
    plot_intervals_timestamps,
    plot_top_correlations,
    plot_autocorrelation
)

In [2]:
# Load the raw training set through the project's DataLoader helper.
raw_trainset_path = "../../data/raw_trainset.csv"
dataloader = DataLoader()
df_base = dataloader.load_transform(raw_trainset_path)

## Basic Data Analysis

In [3]:
# Wrap the loaded frame in the project's BasicDataAnalysis helper; the same
# `basicanalysis` object is reused by the following cells.
basicanalysis = BasicDataAnalysis(df_base)
# Display the per-column dtypes as the cell output.
basicanalysis.get_dtypes()

time                datetime64[ns]
id                           int64
disturbance_year             int64
doy                          int64
b2                           int64
b3                           int64
b4                           int64
b5                           int64
b6                           int64
b7                           int64
b8                           int64
b8a                          int64
b11                          int64
b12                          int64
species                     object
dtype: object

In [4]:
# Report the size of the raw dataset (column count first, then row count).
num_cols = basicanalysis.get_num_cols()
num_rows = basicanalysis.get_num_rows()
print(f"Raw Dataset cols:{num_cols}")
print(f"Raw Dataset rows:{num_rows}")

Raw Dataset cols:15
Raw Dataset rows:4074354


In [5]:
# Display summary statistics (count, mean, min, quartiles, max, std) per column.
# NOTE(review): `get_desricption` is misspelled but presumably matches the method
# name defined in utils.basic_analysis — rename there first before fixing it here.
basicanalysis.get_desricption()

Unnamed: 0,time,id,disturbance_year,doy,b2,b3,b4,b5,b6,b7,b8,b8a,b11,b12
count,4074354,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0
mean,2020-02-05 05:16:36.360551936,14086.75,252.2988,176.3385,272.6073,430.9495,346.5769,676.4691,1683.092,2040.237,2166.979,2290.828,1158.527,601.5298
min,2017-01-01 00:00:00,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,2018-09-24 00:00:00,7038.0,0.0,106.0,156.0,281.0,175.0,460.0,1268.0,1529.0,1629.0,1741.0,730.0,336.0
50%,2020-03-15 00:00:00,14063.0,0.0,176.0,223.0,371.0,257.0,588.0,1547.0,1856.0,1971.0,2092.0,1011.0,481.0
75%,2021-06-16 00:00:00,21133.0,0.0,245.0,322.0,503.0,399.0,784.0,1924.0,2327.0,2464.0,2610.0,1441.0,709.0
max,2022-11-02 00:00:00,28212.0,2020.0,366.0,20087.0,17534.0,17524.0,17940.0,19827.0,21439.0,21811.0,24694.0,18554.0,17587.0
std,,8145.028,667.5001,85.21531,220.2539,260.2215,308.6128,348.2985,645.7464,802.4451,853.2377,859.7645,609.2414,419.6677


In [6]:
# Display the number of missing values per column.
basicanalysis.get_missing_counts()

time                0
id                  0
disturbance_year    0
doy                 0
b2                  0
b3                  0
b4                  0
b5                  0
b6                  0
b7                  0
b8                  0
b8a                 0
b11                 0
b12                 0
species             0
dtype: int64

## Feature Engineering

In [None]:
# Basic Feature Engineering
# (Kept as a comment: bare prose in a code cell raises a SyntaxError and breaks
# "Restart Kernel -> Run All".)

### Datetime Feature Engineering

In [None]:
# Derive date-based features from the raw frame; `df` is the working frame for
# the rest of the notebook.
# NOTE(review): presumably adds the "year", "month_num" and "date_diff" columns
# used further down — confirm in utils.data_loader.
df = dataloader.date_feature_extraction(df_base)

### Advanced Feature Engineering

In [None]:
# Extend the working frame with the columns produced by CalculateIndices.
# NOTE(review): presumably spectral indices computed from the band columns —
# confirm in utils.calculate_indices.
calcindices = CalculateIndices()
# `df` is rebound to the result of a call on itself, so re-running only this cell
# applies add_all_indices to an already-augmented frame.
df = calcindices.add_all_indices(df)

In [None]:
# Display the feature-engineered working frame as the cell output.
df

## Exploratory Data Analysis

### Visualization of the Base DataFrame

In [None]:
# Build the histogram helper on the working frame; `hist` is reused by the
# following cells.
hist = HistogramDataVisualization(df)
# Plot unique ids for the "year" column.
# NOTE(review): presumably the number of distinct ids per year — confirm in
# utils.visualization_histogram.
hist.plot_unique_ids("year")

In [None]:
# Same unique-id plot as for "year", keyed on the "month_num" column.
hist.plot_unique_ids("month_num")

In [None]:
# Same unique-id plot as for "year", keyed on the "species" column.
hist.plot_unique_ids("species")

In [None]:
# Plot the median id distribution provided by the histogram helper.
# NOTE(review): exact statistic is defined in utils.visualization_histogram — TODO confirm.
hist.plot_median_id_distribution()

## Disturbed

In [None]:
# Create a boolean column: a disturbance_year of 0 is treated as "not disturbed",
# any other value as disturbed. The vectorized comparison yields the same boolean
# column as a per-row lambda, without a Python-level call for each of the rows.
df["disturbed"] = df["disturbance_year"] != 0

In [None]:
# List the distinct disturbance_year values together with how many there are.
unique_values = df["disturbance_year"].unique()
n_unique = len(unique_values)
print(
    f"Nunique: {n_unique}",
    f"Unique Values:\n{unique_values}",
    sep="\n",
)

In [None]:
# Count rows per value of the boolean "disturbed" flag and compare the groups
# in a bar chart. Requires the "disturbed" column to exist on `df`.
dist_disturbance_df = df["disturbed"].value_counts().reset_index()
# Normalize the column names regardless of what reset_index() produced.
dist_disturbance_df.columns = ["disturbed", "count"]


plt.figure(figsize=(8, 5))
# The flag is cast to str so the bars are labelled "True" / "False".
plt.bar(dist_disturbance_df["disturbed"].astype(str), dist_disturbance_df["count"])
plt.xlabel("Disturbed")
plt.ylabel("Count")
# Title typo fixed: "distrubed" -> "disturbed".
plt.title("Comparison of disturbed values")
plt.grid()
plt.show()


In [None]:
# Stacked bar chart of row counts per disturbance year, split by species.
# Rows whose disturbance_year is 0 are dropped first.
filtered = df.loc[df["disturbance_year"].ne(0)]
crosstab = pd.crosstab(filtered["disturbance_year"], filtered["species"])

# Work on the Axes returned by pandas instead of the implicit pyplot state.
species_ax = crosstab.plot(kind="bar", stacked=True, figsize=(10, 6))
species_ax.set_xlabel("Disturbance Year")
species_ax.set_ylabel("Anzahl")
species_ax.set_title("Distribution of Disturbance Year by Species")
species_ax.legend(title="Species")
species_ax.figure.tight_layout()
species_ax.grid()
plt.show()

In [None]:
# Display the disturbed rows ordered by disturbance year (ascending is the default).
filtered.sort_values("disturbance_year")

In [None]:
# Rebuild the disturbed-only subset, then keep the rows of one id (11759);
# `id_df` is reused by later cells.
is_disturbed = df["disturbance_year"].ne(0)
filtered = df.loc[is_disturbed]
id_df = filtered.loc[filtered["id"].eq(11759)]
id_df

In [None]:
# Plot every column whose name starts with "b" for the selected id over time.
# NOTE(review): the prefix match covers the raw band columns (b2 ... b12) but would
# also pick up any other column starting with "b" — verify none were added upstream.
band_columns = [col for col in id_df.columns if col.startswith("b")]
fig = px.line(
    id_df,
    x="time",
    y=band_columns,
    # `markers` is a boolean flag in plotly express, not a marker symbol;
    # the previous value "o" only worked because it is truthy.
    markers=True,
)
fig.show()

## Correlation Analysis

In [None]:
# Build the correlation helper on the working frame; `correlation` is reused by
# the following cells.
correlation = CorrelationAnalysis(df)
corr_matrix = correlation.get_correlation_matrix()
# Display the correlation matrix as the cell output.
corr_matrix

In [None]:
# Fetch the top 15 correlations from the helper and plot them.
# NOTE(review): ranking criterion is defined in utils.correlation_analysis — TODO confirm.
top_corr_df = correlation.get_top_correlations(top_n=15)
plot_top_correlations(top_corr_df)

In [None]:
# Render the correlation matrix plot provided by the helper.
correlation.plot_correlation_matrix()

In [None]:
# Plot the correlation distribution with sample_size=100.
# NOTE(review): presumably samples 100 items before computing correlations; whether
# the sampling is seeded is not visible here — verify for reproducibility.
correlation.plot_correlation_distribution(sample_size= 100)

## Explore Spectral

In [None]:
# Build the spectral plotting helper on the working frame; `spectral` is reused
# by the following cells.
spectral = SpectralBandPlotter(df)
# Plot across all years with sample_size=500, including outliers (showfliers=True).
# NOTE(review): sampling seed is not visible here — verify for reproducibility.
spectral.plot_all_years(sample_size=500, showfliers=True)

In [None]:
# Per-year variant of the spectral plot, again with sample_size=500.
spectral.plot_per_year(sample_size=500)

In [None]:
# Plot the species/season distribution provided by the spectral helper.
# NOTE(review): how seasons are derived is defined in utils.visualization_spectral — TODO confirm.
spectral.plot_species_season_distribution()

## Time Series Analysis

In [None]:
# Histogram of the gaps between observations, with the median marked.
# Missing gaps are dropped before plotting.
data = df["date_diff"].dropna()
median_val = data.median()

# Explicit figure/axes handles instead of the implicit pyplot state machine.
gap_fig, gap_ax = plt.subplots(figsize=(10, 5))
gap_ax.hist(data, bins=20, color="black")

median_line_style = {"color": "red", "linestyle": "--", "linewidth": 2}
gap_ax.axvline(median_val, label=f"Median: {median_val:.1f}", **median_line_style)

gap_ax.set_title("Distribution of Time Gaps (in Days)")
gap_ax.set_xlabel("Days Between Observations")
gap_ax.set_ylabel("Count")
gap_ax.legend()
gap_ax.grid(True)
gap_fig.tight_layout()
plt.show()

In [None]:
# Aggregate the time series with freq="2W" and method="median" (presumably the
# median per two-week bin — confirm in utils.time_series_aggregate), then
# re-derive the date features on the aggregated frame.
# NOTE(review): the input is id_df (the single id selected earlier), not the
# full `df` — confirm this is intended.
ts_agg = TimeSeriesAggregate(id_df)
df_2w = ts_agg.aggregate_timeseries(freq="2W", method="median")
# NOTE(review): rebinds `dataloader` to a fresh DataLoader although one was created
# when loading the data; presumably redundant — verify the loader holds no state.
dataloader = DataLoader()
df_2w_features = dataloader.date_feature_extraction(df_2w)

In [None]:
# Rebind `spectral` to a plotter over the aggregated frame (this replaces the
# earlier plotter built on `df`) and plot the spectral development over the years.
# NOTE(review): `addition="aggregated"` is presumably a label/suffix for the plot — TODO confirm.
spectral = SpectralBandPlotter(df_2w_features)
spectral.plot_spectral_development_over_years(addition="aggregated")