In [1]:
import os
import sys
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from utils.data_loader import DataLoader
from utils.calculate_indices import CalculateIndices
from utils.basic_analysis import BasicDataAnalysis
from utils.visualization_histogram import HistogramDataVisualization
from utils.time_series_aggregate import TimeSeriesAggregate
from utils.visualization_spectral import SpectralBandPlotter
from utils.correlation_analysis import CorrelationAnalysis
from utils.visualization_time_series import plot_date_diff_distribution
from utils.sits_outlier_cleaner import SITSOutlierCleaner

from utils.visualization_anomaly_detection import (
    plot_with_outliers_subplot,
    plot_outlier_detection_grid,
)
from utils.constants import spectral_bands

# Tree Classification

In [3]:
def get_sample(df, id_col="id", time_col="time", n_ids=40):
    df_sorted = df.sort_values(by=[id_col, time_col])
    unique_ids = df_sorted[id_col].drop_duplicates().head(n_ids)
    return df_sorted[df_sorted[id_col].isin(unique_ids)]

In [4]:
dataloader = DataLoader()
df_base = dataloader.load_transform("../../data/raw/raw_trainset.csv")

In [5]:
df = dataloader.feature_extraction(df_base)

In [6]:
df = get_sample(df= df,n_ids=60)
df['species'] = df['species'].astype(str)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# === 2. Sortieren und Feature-Auswahl ===
df = df.sort_values(["id", "time"])
feature_cols = ["b2", "b3", "b4", "b5", "b6", "b7", "b8", "b8a", "b11", "b12"]
label_col = "species"

# === 3. Gruppieren nach Standort (id) ===
sequences = []
labels = []

for id_, group in df.groupby("id"):
    if len(group) < 5:
        continue  # Überspringe zu kurze Zeitreihen
    X = group[feature_cols].values
    y = group[label_col].iloc[0]  # Annahme: eine Art pro id
    sequences.append(X)
    labels.append(y)

# === 4. Padding (vereinheitliche Länge) ===
max_seq_len = 10  # oder mehr, je nach deinen Daten
X_seq = pad_sequences(
    sequences, maxlen=max_seq_len, dtype="float32", padding="post", truncating="post"
)

# === 5. Labels zu numerisch → One-Hot ===
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)
y_onehot = to_categorical(y_encoded)  # shape = (n_samples, 7)

# === 6. Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_onehot, test_size=0.2, random_state=42
)

# === 7. LSTM Modell ===
model = Sequential()
model.add(Masking(mask_value=0.0, input_shape=(max_seq_len, len(feature_cols))))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(32, activation="relu"))
model.add(Dense(y_onehot.shape[1], activation="softmax"))  # 7 Klassen

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# === 8. Modell trainieren ===
model.fit(X_train, y_train, epochs=50, batch_size=8, validation_data=(X_test, y_test))

# === 9. Evaluieren ===
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2%}")


  super().__init__(**kwargs)


Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 89ms/step - accuracy: 0.2292 - loss: 2.0444 - val_accuracy: 0.3333 - val_loss: 1.8057
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2292 - loss: 1.8767 - val_accuracy: 0.3333 - val_loss: 1.7639
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2292 - loss: 1.8200 - val_accuracy: 0.3333 - val_loss: 1.7205
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2292 - loss: 1.7678 - val_accuracy: 0.3333 - val_loss: 1.6384
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2292 - loss: 1.7261 - val_accuracy: 0.1667 - val_loss: 1.6631
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1250 - loss: 1.7193 - val_accuracy: 0.2500 - val_loss: 1.6464
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

class SITS_PCA:
    def __init__(self, n_components=None, scale=True):
        """
        Parameters
        ----------
        n_components : int, float, or None
            Anzahl PCA-Komponenten oder Anteil erklärter Varianz (z. B. 0.95)
        scale : bool
            Ob die Features standardisiert werden sollen
        """
        self.n_components = n_components
        self.scale = scale
        self.scaler = None
        self.pca = None
        self.transformed_ = None
        self.explained_variance_ratio_ = None
        self.components_ = None
        self.feature_names_ = None
        self.labels_ = None

    def fit(self, df, label_col=None, band_prefix='b'):
        """
        Führt PCA auf den spektralen Bändern eines SITS-DataFrames aus.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame mit Spalten wie b2,b3,...,b12 und evtl. Metadaten.
        label_col : str, optional
            Spaltenname der Klassenlabels (z. B. 'species').
        band_prefix : str
            Präfix der Bandsäulen (Standard: 'b')
        """
        band_cols = [c for c in df.columns if c.startswith(band_prefix)]
        if not band_cols:
            raise ValueError(f"Keine Bänder mit Präfix '{band_prefix}' gefunden.")

        X = df[band_cols].values
        self.feature_names_ = band_cols

        if self.scale:
            self.scaler = StandardScaler()
            X = self.scaler.fit_transform(X)

        self.pca = PCA(n_components=self.n_components)
        self.transformed_ = self.pca.fit_transform(X)
        self.explained_variance_ratio_ = self.pca.explained_variance_ratio_
        self.components_ = self.pca.components_

        if label_col and label_col in df.columns:
            self.labels_ = df[label_col].values

        return self

    def plot_variance(self):
        """Interaktiver Plot der kumulativen erklärten Varianz."""
        if self.explained_variance_ratio_ is None:
            raise RuntimeError("Bitte zuerst fit() ausführen.")
        cumvar = np.cumsum(self.explained_variance_ratio_)
        fig = px.line(
            x=np.arange(1, len(cumvar) + 1),
            y=cumvar,
            markers=True,
            labels={"x": "Anzahl Komponenten", "y": "Kumulative erklärte Varianz"},
            title="PCA – erklärte Varianz"
        )
        fig.update_layout(yaxis_range=[0, 1])
        fig.show()

    def plot_2d(self):
        """Interaktiver 2D-Plot der ersten beiden PCA-Komponenten."""
        if self.transformed_ is None:
            raise RuntimeError("Bitte zuerst fit() ausführen.")
        df_plot = pd.DataFrame(self.transformed_[:, :2], columns=['PC1', 'PC2'])
        if self.labels_ is not None:
            df_plot['Label'] = self.labels_
        else:
            df_plot['Label'] = 'Sample'
        fig = px.scatter(
            df_plot,
            x='PC1',
            y='PC2',
            color='Label',
            title='SITS PCA 2D-Projektion',
            opacity=0.8
        )
        fig.show()

    def plot_3d(self):
        """Interaktiver 3D-Plot der ersten drei PCA-Komponenten."""
        if self.transformed_ is None:
            raise RuntimeError("Bitte zuerst fit() ausführen.")
        if self.transformed_.shape[1] < 3:
            raise ValueError("Mindestens 3 PCA-Komponenten nötig für 3D-Plot.")
        df_plot = pd.DataFrame(self.transformed_[:, :3], columns=['PC1', 'PC2', 'PC3'])
        if self.labels_ is not None:
            df_plot['Label'] = self.labels_
        else:
            df_plot['Label'] = 'Sample'
        fig = px.scatter_3d(
            df_plot,
            x='PC1',
            y='PC2',
            z='PC3',
            color='Label',
            title='SITS PCA 3D-Projektion',
            opacity=0.8,
        )
        fig.show()
    def show_components(self):
        """Zeigt die Gewichte (Loadings) der PCA-Komponenten."""
        if self.components_ is None or self.feature_names_ is None:
            raise RuntimeError("Bitte zuerst fit() ausführen.")
        loadings = pd.DataFrame(
            self.components_,
            columns=self.feature_names_,
            index=[f'PC{i+1}' for i in range(self.components_.shape[0])]
        )
        display(loadings.T.style.background_gradient(cmap='coolwarm'))
        return loadings


In [None]:
pca = SITS_PCA(n_components=0.95)

pca.fit(df, label_col='species')

pca.plot_variance()
pca.plot_2d()

In [None]:
pca.show_components()

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import umap
import plotly.express as px

class SITS_DimensionalityReduction:
    def __init__(self, method='PCA', n_components=2, scale=True, random_state=42, **kwargs):
        """
        Parameters
        ----------
        method : str
            Reduktionsmethode: 'PCA', 'tSNE' oder 'UMAP'
        n_components : int
            Anzahl der Dimensionen der Projektion
        scale : bool
            Ob die Features standardisiert werden sollen
        random_state : int
            Zufallsseed für Reproduzierbarkeit
        kwargs : dict
            Zusätzliche Parameter für t-SNE oder UMAP
        """
        self.method = method
        self.n_components = n_components
        self.scale = scale
        self.random_state = random_state
        self.kwargs = kwargs
        self.model = None
        self.transformed_ = None
        self.labels_ = None
        self.feature_names_ = None

    def fit(self, df, label_col=None, band_prefix='b'):
        band_cols = [c for c in df.columns if c.startswith(band_prefix)]
        if not band_cols:
            raise ValueError(f"Keine Bänder mit Präfix '{band_prefix}' gefunden.")
        
        X = df[band_cols].values
        self.feature_names_ = band_cols

        if self.scale:
            scaler = StandardScaler()
            X = scaler.fit_transform(X)

        if label_col and label_col in df.columns:
            self.labels_ = df[label_col].values

        if self.method.upper() == 'PCA':
            self.model = PCA(n_components=self.n_components, random_state=self.random_state)
            self.transformed_ = self.model.fit_transform(X)
        elif self.method.upper() == 'TSNE':
            self.model = TSNE(n_components=self.n_components, random_state=self.random_state, **self.kwargs)
            self.transformed_ = self.model.fit_transform(X)
        elif self.method.upper() == 'UMAP':
            self.model = umap.UMAP(n_components=self.n_components, random_state=self.random_state, **self.kwargs)
            self.transformed_ = self.model.fit_transform(X)
        else:
            raise ValueError("Methode muss 'PCA', 'tSNE' oder 'UMAP' sein.")

        return self

    def plot_2d(self):
        if self.transformed_ is None:
            raise RuntimeError("Bitte zuerst fit() ausführen.")
        if self.transformed_.shape[1] < 2:
            raise ValueError("Mindestens 2 Komponenten nötig für 2D-Plot.")
        
        df_plot = pd.DataFrame(self.transformed_[:, :2], columns=['Dim1', 'Dim2'])
        df_plot['Label'] = self.labels_ if self.labels_ is not None else 'Sample'
        fig = px.scatter(df_plot, x='Dim1', y='Dim2', color='Label', title=f'{self.method} 2D-Projektion', opacity=0.8)
        fig.show()

    def plot_3d(self):
        if self.transformed_ is None:
            raise RuntimeError("Bitte zuerst fit() ausführen.")
        if self.transformed_.shape[1] < 3:
            raise ValueError("Mindestens 3 Komponenten nötig für 3D-Plot.")
        
        df_plot = pd.DataFrame(self.transformed_[:, :3], columns=['Dim1', 'Dim2', 'Dim3'])
        df_plot['Label'] = self.labels_ if self.labels_ is not None else 'Sample'
        fig = px.scatter_3d(df_plot, x='Dim1', y='Dim2', z='Dim3', color='Label', title=f'{self.method} 3D-Projektion', opacity=0.8)
        fig.show()


In [None]:
# PCA Beispiel
dr = SITS_DimensionalityReduction(method='PCA', n_components=3)
dr.fit(df, label_col='species')
dr.plot_2d()

# t-SNE Beispiel
dr_tsne = SITS_DimensionalityReduction(method='tSNE', n_components=2, perplexity=30)
dr_tsne.fit(df, label_col='species')
dr_tsne.plot_2d()

# UMAP Beispiel
dr_umap = SITS_DimensionalityReduction(method='UMAP', n_components=2, n_neighbors=15, min_dist=0.1)
dr_umap.fit(df, label_col='species')
dr_umap.plot_2d()


In [None]:
a

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------
# 1. Dummy-Daten (ersetze später durch echte SITS-Daten)
# ---------------------------------------------------------
n_samples = 5000       # Anzahl Beobachtungen
timesteps = 12         # z. B. 12 Monate
n_features = 10        # z. B. 10 spektrale Bänder
n_classes = 5          # z. B. 5 Landnutzungsklassen

X = np.random.rand(n_samples, timesteps, n_features)
y = np.random.randint(0, n_classes, size=(n_samples,))

# Optional: One-hot-Encoding
y_cat = to_categorical(y, num_classes=n_classes)

# ---------------------------------------------------------
# 2. Train/Test Split
# ---------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)

# ---------------------------------------------------------
# 3. (Optional) Skalierung über Features
# ---------------------------------------------------------
# Skaliere über Features pro Zeitstempel
scaler = StandardScaler()
X_train_scaled = X_train.reshape(-1, n_features)
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_train_scaled = X_train_scaled.reshape(-1, timesteps, n_features)

X_test_scaled = X_test.reshape(-1, n_features)
X_test_scaled = scaler.transform(X_test_scaled)
X_test_scaled = X_test_scaled.reshape(-1, timesteps, n_features)

# ---------------------------------------------------------
# 4. Modellaufbau
# ---------------------------------------------------------
model = Sequential([
    LSTM(64, input_shape=(timesteps, n_features), return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(n_classes, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ---------------------------------------------------------
# 5. Training
# ---------------------------------------------------------
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=20,
    batch_size=64
)

# ---------------------------------------------------------
# 6. Evaluation
# ---------------------------------------------------------
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_acc:.3f}")


In [None]:
pip install tensorflow

# alt

In [None]:
df_base.shape

In [None]:
df_base = df_base.drop_duplicates()

In [None]:
df_base[df_base["id"] == 16404].sort_values(by="time")

In [None]:
df[df["id"] == 16404].sort_values(by="time")

## Basic Data Analysis

In [None]:
basicanalysis = BasicDataAnalysis(df_base)
basicanalysis.get_dtypes()

In [None]:
print(f"Raw Dataset cols:{basicanalysis.get_num_cols()}")
print(f"Raw Dataset rows:{basicanalysis.get_num_rows()}")

In [None]:
basicanalysis.get_desricption()

In [None]:
basicanalysis.get_missing_counts()

## Feature Engineering

### Basic Feature Engineering

In [None]:
df = dataloader.feature_extraction(df_base)

### Datetime Feature Engineering

In [None]:
df = dataloader.date_feature_extraction(df)

### Advanced Feature Engineering

In [None]:
calcindices = CalculateIndices()
df = calcindices.add_all_indices(df)

## Exploratory Data Analysis

### Visualization Base-Dataframe 

In [None]:
hist = HistogramDataVisualization(df)
hist.plot_unique_ids("year")

In [None]:
hist.plot_unique_ids("month_num")

In [None]:
hist.plot_unique_ids("species")

In [None]:
hist.plot_median_id_distribution()

## Correlation Analysis

In [None]:
correlation = CorrelationAnalysis(df)
corr_matrix = correlation.get_correlation_matrix()
corr_matrix

In [None]:
correlation.plot_correlation_matrix()

In [None]:
top_corr_df = correlation.get_top_correlations(top_n=15)
plot_top_correlations(top_corr_df)

In [None]:
# correlation.plot_correlation_distribution(sample_size=100)

## Disturbed

In [None]:
unique_values = df["disturbance_year"].unique()
print(f"Nunique: {len(unique_values)}")
print(f"Unique Values:\n{unique_values}")

In [None]:
dist_disturbance_df = df["is_disturbed"].value_counts().reset_index()
dist_disturbance_df.columns = ["is_disturbed", "count"]


plt.figure(figsize=(8, 5))
plt.bar(dist_disturbance_df["is_disturbed"].astype(str), dist_disturbance_df["count"])
plt.xlabel("Disturbed")
plt.ylabel("Count")
plt.title("Comparison of distrubed values")
plt.grid()
plt.show()

In [None]:
filtered = df[df["disturbance_year"] != 0]
crosstab = pd.crosstab(filtered["disturbance_year"], filtered["species"])
crosstab.plot(kind="bar", stacked=True, figsize=(10, 6))

plt.xlabel("Disturbance Year")
plt.ylabel("Anzahl")
plt.title("Distribution of Disturbance Year by Species")
plt.legend(title="Species")
plt.tight_layout()
plt.grid()
plt.show()

In [None]:
test = df[df["is_disturbed"]].copy()
test["disturbance_year_diff"] = test["disturbance_year"] - test["year"]
test

In [None]:
values = test["disturbance_year_diff"].dropna()

fig = plt.figure(figsize=(10, 6))
grid = fig.add_gridspec(2, 1, height_ratios=[1, 4], hspace=0.05)
ax_box = fig.add_subplot(grid[0, 0])
ax_box.boxplot(values, vert=False, patch_artist=True)
ax_box.set(xticks=[], xlabel="")
ax_box.set_yticks([])
ax_box.set_title("Distribution of Disturbance Year Differences")
ax_hist = fig.add_subplot(grid[1, 0])
ax_hist.hist(values, bins=30, alpha=0.7, edgecolor="black")
ax_hist.set_xlabel("disturbance_year_diff")
ax_hist.set_ylabel("Frequency")
ax_hist.grid()
plt.show()

In [None]:
filtered.sort_values(by="disturbance_year", ascending=True)

In [None]:
filtered = df[df["disturbance_year"] != 0]
id_df = filtered[filtered["id"] == 11759]
id_df

In [None]:
band_columns = [col for col in id_df.columns if col.startswith("b")]
fig = px.line(id_df, x="time", y=band_columns, markers=":")
fig.show()

## Explore Spectral

In [None]:
spectral = SpectralBandPlotter(df)
spectral.plot_all_years(sample_size=500, showfliers=True)

In [None]:
spectral.plot_per_year(sample_size=500)

In [None]:
spectral.plot_species_season_distribution()

## Time Series Analysis

In [None]:
plot_date_diff_distribution(df)

In [None]:
ts_agg = TimeSeriesAggregate(id_df)
df_2w = ts_agg.aggregate_timeseries(freq="2W", method="median")
dataloader = DataLoader()
df_2w_features = dataloader.date_feature_extraction(df_2w)

In [None]:
spectral = SpectralBandPlotter(df_2w_features)
spectral.plot_spectral_development_over_years(addition="aggregated")

In [None]:
plot_intervals_timestamps(df_2w_features, addition="aggregated")

### Autocorrelation

In [None]:
plot_autocorrelation(df_2w_features, "ndvi")

In [None]:
plot_band_differences(df_2w_features)

# Anomaly Detection

In [None]:
df_sample = get_sample(df, n_ids=40)

cleaner = SITSOutlierCleaner()
cleaner.fit_transform(df_sample, band_columns=spectral_bands)
df_with_any_flag = cleaner.add_any_outlier_flag()
id_df = df_with_any_flag[df_with_any_flag["id"] == 24]
df_interpolated = cleaner.get_interpolated_only()

In [None]:
plot_with_outliers_subplot(id_df, spectral_bands)

In [None]:
plot_outlier_detection_grid(id_df, bands=spectral_bands)

In [None]:
df_interpolated