# Tree Classification

In [1]:
import os
import sys
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from utils.data_loader import DataLoader
from utils.calculate_indices import CalculateIndices
from utils.basic_analysis import BasicDataAnalysis
from utils.visualization_histogram import HistogramDataVisualization
from utils.time_series_aggregate import TimeSeriesAggregate
from utils.visualization_spectral import SpectralBandPlotter
from utils.correlation_analysis import CorrelationAnalysis
from utils.visualization_time_series import plot_date_diff_distribution
from utils.sits_outlier_cleaner import SITSOutlierCleaner
from utils.visualization_function import (
    plot_intervals_timestamps,
    plot_top_correlations,
    plot_autocorrelation,
    plot_band_differences,
)
from utils.visualization_anomaly_detection import (
    plot_with_outliers_subplot,
    plot_outlier_detection_grid,
)
from utils.constants import spectral_bands, indices

In [3]:
def get_sample(df, id_col="id", time_col="time", n_ids=40):
    df_sorted = df.sort_values(by=[id_col, time_col])
    unique_ids = df_sorted[id_col].drop_duplicates().head(n_ids)
    return df_sorted[df_sorted[id_col].isin(unique_ids)]

In [4]:
dataloader = DataLoader()
df_base = dataloader.load_transform("../../data/raw/raw_trainset.csv")

In [7]:
df_base.shape

(3927272, 14)

In [6]:
df_base =df_base.drop_duplicates()

In [8]:
df_base[df_base["id"]==16404].sort_values(by= "time")

Unnamed: 0,time,id,disturbance_year,doy,b2,b3,b4,b5,b6,b7,b8,b8a,b11,b12
2289233,2017-01-28,16404,0.0,28.0,234.0,288.0,154.0,405.0,1102.0,1392.0,1184.0,1421.0,453.0,207.0
2289234,2017-02-13,16404,0.0,44.0,257.5,314.0,250.0,492.5,1236.0,1354.5,1514.5,1581.0,611.0,301.5
2289235,2017-03-26,16404,0.0,85.0,190.0,290.0,210.0,475.0,1163.0,1437.0,1482.0,1526.0,833.0,387.0
2289236,2017-05-25,16404,0.0,145.0,158.0,277.0,177.0,478.0,1565.0,1823.0,1849.0,1985.0,911.0,420.0
2289237,2017-08-05,16404,0.0,217.0,182.0,347.0,199.0,526.0,1969.0,2353.0,2565.0,2440.0,989.0,393.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2289410,2022-09-22,16404,0.0,265.0,498.0,630.0,472.0,849.0,1959.0,2317.0,2564.0,2604.0,1262.0,847.0
2289411,2022-09-30,16404,0.0,273.0,183.0,313.0,195.0,484.0,1648.0,1977.0,2000.0,2171.0,765.0,338.0
2289412,2022-10-24,16404,0.0,297.0,334.0,438.0,343.0,565.0,1520.0,1801.0,1970.0,1995.0,674.0,346.0
2289413,2022-10-31,16404,0.0,304.0,381.0,523.0,453.0,715.0,1484.0,1846.0,2090.0,2275.0,814.0,429.0


In [14]:
df[df["id"]==16404].sort_values(by= "time")

Unnamed: 0,time,id,disturbance_year,doy,b2,b3,b4,b5,b6,b7,b8,b8a,b11,b12,is_disturbed,month_num,year,season,date_diff
2289233,2017-01-28,16404,0.0,28.0,234.0,288.0,154.0,405.0,1102.0,1392.0,1184.0,1421.0,453.0,207.0,False,1,2017,Winter,
2289234,2017-02-13,16404,0.0,44.0,257.5,314.0,250.0,492.5,1236.0,1354.5,1514.5,1581.0,611.0,301.5,False,2,2017,Winter,16.0
2289235,2017-03-26,16404,0.0,85.0,190.0,290.0,210.0,475.0,1163.0,1437.0,1482.0,1526.0,833.0,387.0,False,3,2017,Spring,41.0
2289236,2017-05-25,16404,0.0,145.0,158.0,277.0,177.0,478.0,1565.0,1823.0,1849.0,1985.0,911.0,420.0,False,5,2017,Spring,60.0
2289237,2017-08-05,16404,0.0,217.0,182.0,347.0,199.0,526.0,1969.0,2353.0,2565.0,2440.0,989.0,393.0,False,8,2017,Summer,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2289410,2022-09-22,16404,0.0,265.0,498.0,630.0,472.0,849.0,1959.0,2317.0,2564.0,2604.0,1262.0,847.0,False,9,2022,Autumn,14.0
2289411,2022-09-30,16404,0.0,273.0,183.0,313.0,195.0,484.0,1648.0,1977.0,2000.0,2171.0,765.0,338.0,False,9,2022,Autumn,8.0
2289412,2022-10-24,16404,0.0,297.0,334.0,438.0,343.0,565.0,1520.0,1801.0,1970.0,1995.0,674.0,346.0,False,10,2022,Autumn,24.0
2289413,2022-10-31,16404,0.0,304.0,381.0,523.0,453.0,715.0,1484.0,1846.0,2090.0,2275.0,814.0,429.0,False,10,2022,Autumn,7.0


## Basic Data Analysis

In [9]:
basicanalysis = BasicDataAnalysis(df_base)
basicanalysis.get_dtypes()

time                datetime64[ns]
id                           int64
disturbance_year           float64
doy                        float64
b2                         float64
b3                         float64
b4                         float64
b5                         float64
b6                         float64
b7                         float64
b8                         float64
b8a                        float64
b11                        float64
b12                        float64
dtype: object

In [10]:
print(f"Raw Dataset cols:{basicanalysis.get_num_cols()}")
print(f"Raw Dataset rows:{basicanalysis.get_num_rows()}")

Raw Dataset cols:14
Raw Dataset rows:3927272


In [11]:
basicanalysis.get_desricption()

AttributeError: 'BasicDataAnalysis' object has no attribute 'get_desricption'

In [None]:
basicanalysis.get_missing_counts()

## Feature Engineering

### Basic Feature Engineering

In [12]:
df = dataloader.feature_extraction(df_base)

### Datetime Feature Engineering

In [13]:
df = dataloader.date_feature_extraction(df)

### Advanced Feature Engineering

In [None]:
calcindices = CalculateIndices()
df = calcindices.add_all_indices(df)

## Exploratory Data Analysis

### Visualization Base-Dataframe 

In [None]:
hist = HistogramDataVisualization(df)
hist.plot_unique_ids("year")

In [None]:
hist.plot_unique_ids("month_num")

In [None]:
hist.plot_unique_ids("species")

In [None]:
hist.plot_median_id_distribution()

## Correlation Analysis

In [None]:
correlation = CorrelationAnalysis(df)
corr_matrix = correlation.get_correlation_matrix()
corr_matrix

In [None]:
correlation.plot_correlation_matrix()

In [None]:
top_corr_df = correlation.get_top_correlations(top_n=15)
plot_top_correlations(top_corr_df)

In [None]:
# correlation.plot_correlation_distribution(sample_size=100)

## Disturbed

In [None]:
unique_values = df["disturbance_year"].unique()
print(f"Nunique: {len(unique_values)}")
print(f"Unique Values:\n{unique_values}")

In [None]:
dist_disturbance_df = df["is_disturbed"].value_counts().reset_index()
dist_disturbance_df.columns = ["is_disturbed", "count"]


plt.figure(figsize=(8, 5))
plt.bar(dist_disturbance_df["is_disturbed"].astype(str), dist_disturbance_df["count"])
plt.xlabel("Disturbed")
plt.ylabel("Count")
plt.title("Comparison of distrubed values")
plt.grid()
plt.show()

In [None]:
filtered = df[df["disturbance_year"] != 0]
crosstab = pd.crosstab(filtered["disturbance_year"], filtered["species"])
crosstab.plot(kind="bar", stacked=True, figsize=(10, 6))

plt.xlabel("Disturbance Year")
plt.ylabel("Anzahl")
plt.title("Distribution of Disturbance Year by Species")
plt.legend(title="Species")
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
test = df[df["is_disturbed"]].copy()
test["disturbance_year_diff"] = test["disturbance_year"] - test["year"]
test

In [None]:
values = test["disturbance_year_diff"].dropna()

fig = plt.figure(figsize=(10, 6))
grid = fig.add_gridspec(2, 1, height_ratios=[1, 4], hspace=0.05)
ax_box = fig.add_subplot(grid[0, 0])
ax_box.boxplot(values, vert=False, patch_artist=True)
ax_box.set(xticks=[], xlabel="")
ax_box.set_yticks([])
ax_box.set_title("Distribution of Disturbance Year Differences")
ax_hist = fig.add_subplot(grid[1, 0])
ax_hist.hist(values, bins=30, alpha=0.7, edgecolor="black")
ax_hist.set_xlabel("disturbance_year_diff")
ax_hist.set_ylabel("Frequency")
ax_hist.grid()
plt.show()

In [None]:
filtered.sort_values(by="disturbance_year", ascending=True)

In [None]:
filtered = df[df["disturbance_year"] != 0]
id_df = filtered[filtered["id"] == 11759]
id_df

In [None]:
band_columns = [col for col in id_df.columns if col.startswith("b")]
fig = px.line(id_df, x="time", y=band_columns, markers=":")
fig.show()

## Explore Spectral

In [None]:
spectral = SpectralBandPlotter(df)
spectral.plot_all_years(sample_size=500, showfliers=True)

In [None]:
spectral.plot_per_year(sample_size=500)

In [None]:
spectral.plot_species_season_distribution()

## Time Series Analysis

In [None]:
plot_date_diff_distribution(df)

In [None]:
ts_agg = TimeSeriesAggregate(id_df)
df_2w = ts_agg.aggregate_timeseries(freq="2W", method="median")
dataloader = DataLoader()
df_2w_features = dataloader.date_feature_extraction(df_2w)

In [None]:
spectral = SpectralBandPlotter(df_2w_features)
spectral.plot_spectral_development_over_years(addition="aggregated")

In [None]:
plot_intervals_timestamps(df_2w_features, addition="aggregated")

### Autocorrelation

In [None]:
plot_autocorrelation(df_2w_features, "ndvi")

In [None]:
plot_band_differences(df_2w_features)

# Anomaly Detection

In [None]:
df_sample = get_sample(df, n_ids=40)

cleaner = SITSOutlierCleaner()
cleaner.fit_transform(df_sample, band_columns=spectral_bands)
df_with_any_flag = cleaner.add_any_outlier_flag()
id_df = df_with_any_flag[df_with_any_flag["id"] == 24]
df_interpolated = cleaner.get_interpolated_only()

In [None]:
plot_with_outliers_subplot(id_df, spectral_bands)

In [None]:
plot_outlier_detection_grid(id_df, bands=spectral_bands)

In [None]:
df_interpolated