# Tree Classification

In [None]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

# Resolve the parent directory to an absolute path and register it on
# sys.path exactly once, so the project-local `utils` imports below resolve.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from utils.data_loader import DataLoader
from utils.calculate_indices import CalculateIndices
from utils.basic_analysis import BasicDataAnalysis
from utils.visualization_histogram import HistogramDataVisualization
from utils.time_series_aggregate import TimeSeriesAggregate
from utils.visualization_spectral import SpectralBandPlotter


from utils.data_loader import DataLoader
from utils.basic_analysis import BasicDataAnalysis

from utils.correlation_analysis import CorrelationAnalysis

from utils.time_series_aggregate import TimeSeriesAggregate
from utils.visualization_time_series import plot_date_diff_distribution


from utils.visualization_spectral import SpectralBandPlotter
from utils.visualization_histogram import HistogramDataVisualization
from utils.visualization_function import (
    plot_intervals_timestamps,
    plot_top_correlations,
    plot_autocorrelation,
    plot_band_differences,
)

In [None]:
# Create the project data loader and load the raw training CSV through it.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook kernel's working directory — confirm when running from elsewhere.
dataloader = DataLoader()
df_base = dataloader.load_transform("../../data/raw_trainset.csv")

## Basic Data Analysis

In [None]:
# Wrap the loaded frame in the analysis helper; the bare trailing expression
# makes the notebook render whatever get_dtypes() returns.
basicanalysis = BasicDataAnalysis(df_base)
basicanalysis.get_dtypes()

In [None]:
# Report the size of the raw dataset: column count first, then row count,
# each on its own line.
print(
    f"Raw Dataset cols:{basicanalysis.get_num_cols()}",
    f"Raw Dataset rows:{basicanalysis.get_num_rows()}",
    sep="\n",
)

In [None]:
# Render the result of the helper's description method as the cell output.
# NOTE(review): "get_desricption" looks like a misspelling of
# "get_description", but it has to match the method name defined in
# utils.basic_analysis — rename both together, not just this call.
basicanalysis.get_desricption()

In [None]:
# Render the helper's missing-value counts as the cell output
# (presumably one count per column — confirm against BasicDataAnalysis).
basicanalysis.get_missing_counts()

## Feature Engineering

### Basic Feature Engineering

In [None]:
# Start the working frame `df` from the raw frame via the loader's basic
# feature extraction; every later cell builds on `df`.
df = dataloader.feature_extraction(df_base)

### Datetime Feature Engineering

In [None]:
# Replace `df` with the loader's date-feature-extraction result.
# NOTE(review): presumably this adds the "year" / "month_num" columns that
# later cells plot — confirm in DataLoader.
df = dataloader.date_feature_extraction(df)

### Advanced Feature Engineering

In [None]:
# Replace `df` with the result of the index calculator's add_all_indices.
# NOTE(review): presumably spectral indices such as the "ndvi" column used
# in the autocorrelation cell come from here — confirm.
calcindices = CalculateIndices()
df = calcindices.add_all_indices(df)

## Exploratory Data Analysis

### Visualization of the Base DataFrame

In [None]:
# Build the histogram helper on the feature-enriched frame; `hist` is reused
# by the following cells. First plot: unique ids grouped by "year".
hist = HistogramDataVisualization(df)
hist.plot_unique_ids("year")

In [None]:
# Same unique-id plot, this time keyed on the "month_num" column.
hist.plot_unique_ids("month_num")

In [None]:
# Same unique-id plot, this time keyed on the "species" column.
hist.plot_unique_ids("species")

In [None]:
# Plot the helper's median-id distribution.
# NOTE(review): exact statistic is defined in HistogramDataVisualization.
hist.plot_median_id_distribution()

## Correlation Analysis

In [None]:
# Build the correlation helper on `df`, keep its correlation matrix in
# `corr_matrix`, and render the matrix via the bare trailing expression.
correlation = CorrelationAnalysis(df)
corr_matrix = correlation.get_correlation_matrix()
corr_matrix

In [None]:
# Draw the correlation matrix using the helper created in the previous cell.
correlation.plot_correlation_matrix()

In [None]:
# Fetch the top 15 correlations from the helper and hand them to the
# dedicated plotting function.
top_corr_df = correlation.get_top_correlations(top_n=15)
plot_top_correlations(top_corr_df)

In [None]:
# Plot the correlation distribution with sample_size=100.
# NOTE(review): what exactly is sampled is defined in CorrelationAnalysis.
correlation.plot_correlation_distribution(sample_size=100)

## Disturbed

In [None]:
# List the distinct disturbance years and how many there are.
unique_values = df["disturbance_year"].unique()
print(
    f"Nunique: {len(unique_values)}",
    f"Unique Values:\n{unique_values}",
    sep="\n",
)

In [None]:
# Count how often each value of the "disturbed" column occurs and give the
# two resulting columns explicit names for the plot below.
dist_disturbance_df = df["disturbed"].value_counts().reset_index()
dist_disturbance_df.columns = ["disturbed", "count"]


plt.figure(figsize=(8, 5))
# Cast the values to str so each one is drawn as its own categorical bar.
plt.bar(dist_disturbance_df["disturbed"].astype(str), dist_disturbance_df["count"])
plt.xlabel("Disturbed")
plt.ylabel("Count")
# Title spelling fixed ("distrubed" -> "disturbed").
plt.title("Comparison of disturbed values")
plt.grid()
plt.show()


In [None]:
# Keep only rows whose disturbance year differs from 0, then cross-tabulate
# disturbance year (rows) against species (columns).
filtered = df.loc[df["disturbance_year"].ne(0)]
crosstab = pd.crosstab(
    index=filtered["disturbance_year"],
    columns=filtered["species"],
)

# One stacked bar per disturbance year, segmented by species.
crosstab.plot.bar(stacked=True, figsize=(10, 6))

plt.title("Distribution of Disturbance Year by Species")
plt.xlabel("Disturbance Year")
# NOTE(review): this label is German ("Anzahl" = count) while every other
# label in the notebook is English.
plt.ylabel("Anzahl")
plt.legend(title="Species")
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# Render the filtered rows ordered by ascending disturbance year.
# sort_values returns a new frame, so `filtered` itself stays unsorted.
# Relies on `filtered` from the previous cell.
filtered.sort_values(by="disturbance_year", ascending=True)

In [None]:
# Recompute the non-zero-disturbance-year subset so this cell can run on its
# own, then narrow it to the single id 11759 used as the worked example in
# the following cells.
filtered = df.loc[df["disturbance_year"].ne(0)]
id_df = filtered.loc[filtered["id"].eq(11759)]
id_df

In [None]:
# Collect every column whose name starts with "b" as a line to plot.
# NOTE(review): presumably these are the spectral band columns; any other
# column starting with "b" would be plotted too — confirm against the schema.
band_columns = [col for col in id_df.columns if col.startswith("b")]
# `markers` is a boolean switch in plotly express, so pass True rather than
# the matplotlib-style marker string "o" (which only worked by being truthy).
fig = px.line(id_df, x="time", y=band_columns, markers=True)
fig.show()

## Spectral Band Exploration

In [None]:
# Build the spectral plotter on the full feature frame and draw the
# all-years plot with sample_size=500 and showfliers=True.
# NOTE(review): showfliers suggests box plots with outliers shown — confirm
# in SpectralBandPlotter.
spectral = SpectralBandPlotter(df)
spectral.plot_all_years(sample_size=500, showfliers=True)

In [None]:
# Per-year variant of the spectral plot, again with sample_size=500.
spectral.plot_per_year(sample_size=500)

In [None]:
# Plot the plotter's species/season distribution.
# NOTE(review): how seasons are derived is defined in SpectralBandPlotter.
spectral.plot_species_season_distribution()

## Time Series Analysis

In [None]:
# Plot the date-difference distribution for the full frame.
# NOTE(review): presumably the gaps between consecutive observations —
# confirm in utils.visualization_time_series.
plot_date_diff_distribution(df)

In [None]:
# Aggregate the time series with freq="2W" and method="median", then
# re-derive the date features on the aggregated frame.
# NOTE(review): the input is `id_df` (the single id selected earlier), not
# the full `df` — confirm this is intended.
ts_agg = TimeSeriesAggregate(id_df)
df_2w = ts_agg.aggregate_timeseries(freq="2W", method="median")
# NOTE(review): this rebinds the global `dataloader` to a fresh instance;
# the one created when loading the CSV could likely be reused — confirm
# DataLoader keeps no state that matters here.
dataloader = DataLoader()
df_2w_features = dataloader.date_feature_extraction(df_2w)

In [None]:
# Rebind `spectral` to a plotter over the aggregated frame (replacing the
# one built on `df`) and plot the spectral development over the years.
# NOTE(review): addition="aggregated" is presumably a title/label suffix.
spectral = SpectralBandPlotter(df_2w_features)
spectral.plot_spectral_development_over_years(addition="aggregated")

In [None]:
# Plot the timestamp intervals of the aggregated frame.
# NOTE(review): addition="aggregated" is presumably a title/label suffix.
plot_intervals_timestamps(df_2w_features, addition="aggregated")

### Autocorrelation

In [None]:
# Autocorrelation plot for the "ndvi" column of the aggregated frame.
plot_autocorrelation(df_2w_features, "ndvi")

In [None]:
# Plot the band differences for the aggregated frame.
# NOTE(review): which bands are differenced is defined in
# utils.visualization_function.
plot_band_differences(df_2w_features)