## Fragen

1. Analyse Allgemein
    - Lässt sich ein Zusammenhang zwischen dem Wasserpegel von den Beobachtungsdaten und den Modelldaten erkennen?
    - Wie ist die Korrelation zwischen dem Wasserpegel des Modells und dem Wasserpegel der Beobachtung?

2. Analyse Sturmfluten
    - Wie verhalten sich Wind und Windrichtung bei den unterschiedlichen Sturmfluten?
    - Wie verhalten sich Wassergeschwindigkeit und -richtung bei den unterschiedlichen Sturmfluten?
    - Lassen sich Korrelationen zwischen den Features und dem Wasserpegel (sla) erkennen? 

## Import Libraries

In [None]:
# import all necessary libraries
import os
import warnings
from pathlib import Path

import cartopy.feature as cfeature
import geodatasets
import geopandas as gpd
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapely.geometry
import xarray as xr
from joblib import Parallel, delayed
from mpl_toolkits.basemap import Basemap
from scipy.interpolate import griddata
from statsmodels.graphics.tsaplots import plot_acf
from tqdm import tqdm
from utils.eda_helper_functions import (
    check_missing_times,
    group_data_hourly,
    load_insitu_data,
    load_ocean_data,
    load_weather_data,
    plot_water_level_anomalies,
    process_df,
    process_flensburg_data,
    show_df,
)

# Silence pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes — confirm that is acceptable.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None


# Global variables
# Grid-size identifiers of the ocean and weather data; they are interpolated into
# the data directory names ("points<N>") and passed to the load_* helpers.
OCEAN_POINTS = 30
WEATHER_POINTS = 10

# Coordinates used as the Flensburg target point on the maps (decimal degrees)
LAT_FLENSBURG = 54.796001
LON_FLENSBURG = 9.436999

# Metadata for the ocean model features.
# Keys are the feature (column) names; every entry holds the "unit", a short
# "description" and a longer "explanation". The same content is shown as a
# markdown table in the "Ocean Data" section of this notebook.
OCEAN_DICT = {
    "bottomT": {
        "unit": "°C",
        "description": "Sea water potential temperature at sea floor",
        "explanation": "Temperature of seawater at the ocean floor, accounting for pressure effects."
    },
    "mlotst": {
        "unit": "m",
        "description": "Ocean mixed layer thickness defined by sigma theta",
        "explanation": "Depth of the ocean's surface layer where temperature and salinity are relatively uniform."
    },
    "siconc": {
        "unit": "-",
        "description": "Sea ice area fraction",
        "explanation": "Fractional coverage of sea ice in a given area (0 = no ice, 1 = full coverage)."
    },
    "sithick": {
        "unit": "m",
        "description": "Sea ice thickness",
        "explanation": "Thickness of sea ice from surface to bottom."
    },
    "sla": {
        "unit": "m",
        "description": "Sea surface height above sea level",
        "explanation": "Deviation of the ocean surface from the mean sea level, can indicate currents or tides."
    },
    "so": {
        "unit": "$1 / 10^3$",
        "description": "Sea water salinity",
        "explanation": "Salinity of seawater (measured dimensionless, typically expressed in parts per thousand or PSU)."
    },
    "sob": {
        "unit": "$1 / 10^3$",
        "description": "Sea water salinity at sea floor",
        "explanation": "Salinity of seawater at the ocean floor, normalized (0.001 units)."
    },
    "thetao": {
        "unit": "°C",
        "description": "Sea water potential temperature",
        "explanation": "Potential temperature of seawater, referenced to sea surface pressure."
    },
    "uo": {
        "unit": "m/s",
        "description": "Eastward sea water velocity",
        "explanation": "Velocity component of seawater flow towards the east."
    },
    "vo": {
        "unit": "m/s",
        "description": "Northward sea water velocity",
        "explanation": "Velocity component of seawater flow towards the north."
    },
    "wo": {
        "unit": "m/s",
        "description": "Upward sea water velocity",
        "explanation": "Vertical velocity of seawater, positive upward."
    }
}

# Metadata for the weather features.
# Keys are the feature (column) names; every entry holds the "unit", a short
# "description" and a longer "explanation". The same content is shown as a
# markdown table in the "Weather Data" section of this notebook.
WEATHER_DICT = {
    "temperature_2m": {
        "unit": "°C",
        "description": "Temperature (2 m)",
        "explanation": "Air temperature at 2 meters above ground."
    },
    "relative_humidity_2m": {
        "unit": "%",
        "description": "Relative Humidity (2 m)",
        "explanation": "Percentage of humidity at 2 meters height."
    },
    "dew_point_2m": {
        "unit": "°C",
        "description": "Dewpoint (2 m)",
        "explanation": "Temperature at which air moisture condenses (dew point) at 2 meters height."
    },
    "apparent_temperature": {
        "unit": "°C",
        "description": "Apparent Temperature",
        "explanation": "Perceived temperature considering wind and humidity."
    },
    "precipitation_probability": {
        "unit": "%",
        "description": "Precipitation Probability",
        "explanation": "Probability of precipitation."
    },
    "precipitation": {
        "unit": "mm",
        "description": "Precipitation (rain + showers + snow)",
        "explanation": "Total precipitation amount (rain, showers, snow)."
    },
    "rain": {
        "unit": "mm",
        "description": "Rain",
        "explanation": "Precipitation amount due to rain."
    },
    "showers": {
        "unit": "mm",
        "description": "Showers",
        "explanation": "Precipitation amount due to showers."
    },
    "snowfall": {
        "unit": "cm",
        "description": "Snowfall",
        "explanation": "Precipitation amount due to snow."
    },
    "snow_depth": {
        "unit": "cm",
        "description": "Snow Depth",
        "explanation": "Total snow depth on the ground."
    },
    "weather_code": {
        "unit": "-",
        "description": "Weather code",
        "explanation": "Classification of weather conditions by a code (e.g., sunny, cloudy)."
    },
    "pressure_msl": {
        "unit": "hPa",
        "description": "Sealevel Pressure",
        "explanation": "Atmospheric pressure reduced to sea level."
    },
    "surface_pressure": {
        "unit": "hPa",
        "description": "Surface Pressure",
        "explanation": "Actual atmospheric pressure at the surface."
    },
    "cloud_cover": {
        "unit": "%",
        "description": "Cloud cover Total",
        "explanation": "Total cloud coverage."
    },
    "cloud_cover_low": {
        "unit": "%",
        "description": "Cloud cover Low",
        "explanation": "Cloud coverage by low-level clouds."
    },
    "cloud_cover_mid": {
        "unit": "%",
        "description": "Cloud cover Mid",
        "explanation": "Cloud coverage by mid-level clouds."
    },
    "cloud_cover_high": {
        "unit": "%",
        "description": "Cloud cover High",
        "explanation": "Cloud coverage by high-level clouds."
    },
    "visibility": {
        "unit": "m",
        "description": "Visibility",
        "explanation": "Visibility distance."
    },
    "evapotranspiration": {
        "unit": "mm",
        "description": "Evapotranspiration",
        "explanation": "Water loss through evaporation and plant transpiration."
    },
    "et0_fao_evapotranspiration": {
        "unit": "mm",
        "description": "Reference Evapotranspiration (ET₀)",
        "explanation": "Standardized reference evapotranspiration according to FAO."
    },
    "vapour_pressure_deficit": {
        "unit": "hPa",
        "description": "Vapour Pressure Deficit",
        "explanation": "Difference between saturation and actual vapor pressure."
    },
    "wind_speed_10m": {
        "unit": "km/h",
        "description": "Wind Speed (10 m)",
        "explanation": "Wind speed at 10 meters above ground."
    },
    "wind_speed_80m": {
        "unit": "km/h",
        "description": "Wind Speed (80 m)",
        "explanation": "Wind speed at 80 meters above ground."
    },
    "wind_speed_120m": {
        "unit": "km/h",
        "description": "Wind Speed (120 m)",
        "explanation": "Wind speed at 120 meters above ground."
    },
    "wind_speed_180m": {
        "unit": "km/h",
        "description": "Wind Speed (180 m)",
        "explanation": "Wind speed at 180 meters above ground."
    },
    "wind_direction_10m": {
        "unit": "°",
        "description": "Wind Direction (10 m)",
        "explanation": "Wind direction in degrees at 10 meters height (0° = North)."
    },
    "wind_direction_80m": {
        "unit": "°",
        "description": "Wind Direction (80 m)",
        "explanation": "Wind direction in degrees at 80 meters height."
    },
    "wind_direction_120m": {
        "unit": "°",
        "description": "Wind Direction (120 m)",
        "explanation": "Wind direction in degrees at 120 meters height."
    },
    "wind_direction_180m": {
        "unit": "°",
        "description": "Wind Direction (180 m)",
        "explanation": "Wind direction in degrees at 180 meters height."
    },
    "wind_gusts_10m": {
        "unit": "km/h",
        "description": "Wind Gusts (10 m)",
        "explanation": "Maximum gust wind speed at 10 meters height."
    },
    "temperature_80m": {
        "unit": "°C",
        "description": "Temperature (80 m)",
        "explanation": "Air temperature at 80 meters above ground."
    },
    "temperature_120m": {
        "unit": "°C",
        "description": "Temperature (120 m)",
        "explanation": "Air temperature at 120 meters above ground."
    },
    "temperature_180m": {
        "unit": "°C",
        "description": "Temperature (180 m)",
        "explanation": "Air temperature at 180 meters above ground."
    }
}



# Build the data directories from the grid-size constants and echo them for a quick check.
# Both paths live under ../data/numerical_data and differ only in the "points<N>" suffix.
ocean_data_path = Path(f"../data/numerical_data/points{OCEAN_POINTS}")
print(ocean_data_path)
weather_data_path = Path(f"../data/numerical_data/points{WEATHER_POINTS}")
print(weather_data_path)

# Load The Data

## Ocean Data

| Feature | Unit | Description | Explanation |
|:---|:---|:---|:---|
| bottomT | °C | Sea water potential temperature at sea floor | Temperature of seawater at the ocean floor, accounting for pressure effects. |
| mlotst | m | Ocean mixed layer thickness defined by sigma theta | Depth of the ocean's surface layer where temperature and salinity are relatively uniform. |
| siconc | - | Sea ice area fraction | Fractional coverage of sea ice in a given area (0 = no ice, 1 = full coverage). |
| sithick | m | Sea ice thickness | Thickness of sea ice from surface to bottom. |
| sla | m | Sea surface height above sea level | Deviation of the ocean surface from the mean sea level, can indicate currents or tides. |
| so | $1 / 10^3$ | Sea water salinity | Salinity of seawater (measured dimensionless, typically expressed in parts per thousand or PSU). |
| sob | $1 / 10^3$| Sea water salinity at sea floor | Salinity of seawater at the ocean floor, normalized (0.001 units). |
| thetao | °C | Sea water potential temperature | Potential temperature of seawater, referenced to sea surface pressure. |
| uo | m/s | Eastward sea water velocity | Velocity component of seawater flow towards the east. |
| vo | m/s | Northward sea water velocity | Velocity component of seawater flow towards the north. |
| wo | m/s | Upward sea water velocity | Vertical velocity of seawater, positive upward. |


In [None]:
# Load the ocean model data via the project helper (verbose output enabled)
df_ocean = load_ocean_data(ocean_data_path, OCEAN_POINTS, verbose=True)

In [None]:
# Post-process the ocean frame; "depth" is handed to the helper as a column to drop
df_ocean = process_df(df_ocean, drop_cols=["depth"], verbose=True)

## Weather Data

| Feature | Unit | Description | Explanation |
|:---|:---|:---|:---|
| temperature_2m | °C | Temperature (2 m) | Air temperature at 2 meters above ground. |
| relative_humidity_2m | % | Relative Humidity (2 m) | Percentage of humidity at 2 meters height. |
| dew_point_2m | °C | Dewpoint (2 m) | Temperature at which air moisture condenses (dew point) at 2 meters height. |
| apparent_temperature | °C | Apparent Temperature | Perceived temperature considering wind and humidity. |
| precipitation_probability | % | Precipitation Probability | Probability of precipitation. |
| precipitation | mm | Precipitation (rain + showers + snow) | Total precipitation amount (rain, showers, snow). |
| rain | mm | Rain | Precipitation amount due to rain. |
| showers | mm | Showers | Precipitation amount due to showers. |
| snowfall | cm | Snowfall | Precipitation amount due to snow. |
| snow_depth | cm | Snow Depth | Total snow depth on the ground. |
| weather_code | - | Weather code | Classification of weather conditions by a code (e.g., sunny, cloudy). |
| pressure_msl | hPa | Sealevel Pressure | Atmospheric pressure reduced to sea level. |
| surface_pressure | hPa | Surface Pressure | Actual atmospheric pressure at the surface. |
| cloud_cover | % | Cloud cover Total | Total cloud coverage. |
| cloud_cover_low | % | Cloud cover Low | Cloud coverage by low-level clouds. |
| cloud_cover_mid | % | Cloud cover Mid | Cloud coverage by mid-level clouds. |
| cloud_cover_high | % | Cloud cover High | Cloud coverage by high-level clouds. |
| visibility | m | Visibility | Visibility distance. |
| evapotranspiration | mm | Evapotranspiration | Water loss through evaporation and plant transpiration. |
| et0_fao_evapotranspiration | mm | Reference Evapotranspiration (ET₀) | Standardized reference evapotranspiration according to FAO. |
| vapour_pressure_deficit | hPa | Vapour Pressure Deficit | Difference between saturation and actual vapor pressure. |
| wind_speed_10m | km/h | Wind Speed (10 m) | Wind speed at 10 meters above ground. |
| wind_speed_80m | km/h | Wind Speed (80 m) | Wind speed at 80 meters above ground. |
| wind_speed_120m | km/h | Wind Speed (120 m) | Wind speed at 120 meters above ground. |
| wind_speed_180m | km/h | Wind Speed (180 m) | Wind speed at 180 meters above ground. |
| wind_direction_10m | ° | Wind Direction (10 m) | Wind direction in degrees at 10 meters height (0° = North). |
| wind_direction_80m | ° | Wind Direction (80 m) | Wind direction in degrees at 80 meters height. |
| wind_direction_120m | ° | Wind Direction (120 m) | Wind direction in degrees at 120 meters height. |
| wind_direction_180m | ° | Wind Direction (180 m) | Wind direction in degrees at 180 meters height. |
| wind_gusts_10m | km/h | Wind Gusts (10 m) | Maximum gust wind speed at 10 meters height. |
| temperature_80m | °C | Temperature (80 m) | Air temperature at 80 meters above ground. |
| temperature_120m | °C | Temperature (120 m) | Air temperature at 120 meters above ground. |
| temperature_180m | °C | Temperature (180 m) | Air temperature at 180 meters above ground. |


In [None]:
# Load the weather data via the project helper (verbose output enabled)
df_weather = load_weather_data(weather_data_path, WEATHER_POINTS, verbose=True)

In [None]:
# Post-process the weather frame with the helper's defaults (no columns dropped explicitly)
df_weather = process_df(df_weather, verbose=True)

## In Situ Data

| Feature   | Unit    | Description                     | Explanation                                                                 |
|:----------|:--------|:---------------------------------|:---------------------------------------------------------------------------|
| time      | -       | Timestamp                        | Date and time of the observation (UTC).                                    |
| depth     | m       | Measurement depth                | Depth below sea surface where the measurement was taken.                   |
| time_qc   | -       | Time quality control flag        | Quality control indicator for the timestamp (e.g., 1 = good).              |
| deph      | m       | Nominal depth                    | Nominal (intended) depth of the measurement, could differ from actual depth.|
| latitude  | degrees | Latitude                         | Geographic coordinate specifying north-south position.                     |
| longitude | degrees | Longitude                        | Geographic coordinate specifying east-west position.                       |
| slev      | m       | Sea level                        | Measured sea surface height relative to a reference Datum |
| slev_qc   | -       | Sea level quality control flag   | Quality control indicator for sea level measurement (e.g., 1 = good).       |


In [None]:
from utils.eda_helper_functions import load_insitu_data

# Load the in-situ observation data via the project helper (verbose output enabled)
df_insitu = load_insitu_data(verbose=True)

In [None]:
from utils.eda_helper_functions import process_flensburg_data

# Process the Flensburg observations, passing the time span covered by the
# ocean data (first to last timestamp in df_ocean) as start/end time
df_insitu = process_flensburg_data(df_insitu, 
                                      start_time=df_ocean['time'].min(),
                                      end_time=df_ocean['time'].max(),
                                      verbose=True)

In [None]:
from utils.eda_helper_functions import group_data_hourly

# Aggregate the observations with the hourly grouping helper, then post-process
# the result; "deph" is handed to process_df as a column to drop
df_insitu = group_data_hourly(df_insitu)
df_insitu = process_df(df_insitu, drop_cols=["deph"], verbose=True)


# EDA

### Plot Flensburg Observation Waterlevel Data

In [None]:
from utils.eda_helper_functions import plot_water_level_anomalies

# Plot the full in-situ record; the helper returns the figure and axes it drew on
fig, ax = plot_water_level_anomalies(df_insitu)
plt.show()

import datetime

# Reference times of the storm surge events to inspect (datetimes without tzinfo)
sturm_surge_list = [
    datetime.datetime(2023, 2, 25, 17, 0),
    datetime.datetime(2023, 4, 1, 12, 0),
    datetime.datetime(2023, 10, 7, 20, 0),
    datetime.datetime(2023, 10, 20, 0, 0),
    datetime.datetime(2024, 1, 3, 9, 0),
    datetime.datetime(2024, 2, 9, 18, 0),
    datetime.datetime(2024, 12, 9, 16, 0),
]

# Plot the observed water level in a window of two days before and after each event
for time in sturm_surge_list:
    start_time = time - datetime.timedelta(days=2)
    end_time = time + datetime.timedelta(days=2)
    # between() includes both window edges, like the explicit >= / <= comparison
    df_insitu_sturm = df_insitu[df_insitu["time"].between(start_time, end_time)]
    plot_water_level_anomalies(df_insitu_sturm, start_date=start_time, end_date=end_time)
    

## Display Ocean and Weather Data

In [None]:
from utils.eda_helper_functions import plot_coordinates

# Show the locations of the ocean, weather and in-situ data (nothing is written to disk)
plot_coordinates(df_ocean, df_weather, df_insitu, save_png=False)

## Distributions of the Features

In [None]:
from utils.eda_helper_functions import plot_histogram

def plot_feature_distribution(
    df: pd.DataFrame,
    features: list,
    bins: int = 50,
    save_png: bool = False,
    save_name: str = "features",
):
    """
    Plot one histogram per feature on a shared grid with three columns.

    Args:
        df (pd.DataFrame): DataFrame containing the features.
        features (list): Feature (column) names to plot; any iterable of names
            works, e.g. ``df.columns``.
        bins (int): Number of bins for each histogram.
        save_png (bool): Whether to save the figure as a PNG file.
        save_name (str): Suffix of the output file,
            ``../figures/distribution_<save_name>.png``. Only used when
            ``save_png`` is True.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        ``features`` is empty.
    """
    features = list(features)
    if not features:
        # A grid with zero rows would produce a zero-height figure.
        return None

    n_cols = 3
    n_rows = (len(features) + n_cols - 1) // n_cols  # ceiling division

    fig = plt.figure(figsize=(8 * n_cols, 5 * n_rows))
    gs = gridspec.GridSpec(n_rows, n_cols, figure=fig)

    fig.suptitle("Feature Distribution", fontsize=20, y=0.98)

    for idx, feature in tqdm(enumerate(features), total=len(features), desc="Plotting features"):
        ax = fig.add_subplot(gs[idx])
        plot_histogram(df, column=feature, bins=bins, ax=ax, show_stats=True)

    # subplots_adjust instead of tight_layout: the spacing is tuned by hand so
    # the suptitle keeps its room at the top.
    fig.subplots_adjust(top=0.92, hspace=0.5, wspace=0.3)
    if save_png:
        # The file name must not depend on the loop variable: the figure holds
        # all features, not only the last one plotted.
        output_dir = Path("../figures")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"distribution_{save_name}.png", dpi=300, bbox_inches='tight')
    plt.show()




In [None]:
# Print the column overview and summary statistics of the in-situ frame.
# Note: DataFrame.info() prints by itself and returns None, so the first print also emits "None".
print(df_insitu.info())
print(df_insitu.describe())

In [None]:
# Histograms for every column of the ocean frame
plot_feature_distribution(df_ocean, df_ocean.columns, bins=50, save_png=False)

In [None]:
# Histograms for every column of the weather frame
plot_feature_distribution(df_weather, df_weather.columns, bins=50, save_png=False)



In [None]:
# Histograms for every column of the in-situ frame
plot_feature_distribution(df_insitu, df_insitu.columns, bins=50, save_png=False)

In [None]:
# NOTE(review): `dd` is not defined anywhere in the visible notebook, so this cell
# raises NameError — presumably a deliberate stop marker for "Run All"; confirm.
dd

## Cluster df_ocean into K = 3

In [None]:
# Reload all three data sets from scratch (same steps as in the "Load The Data"
# section, but with verbose output switched off) so this section starts from
# freshly loaded frames.
df_ocean = load_ocean_data(ocean_data_path, OCEAN_POINTS, verbose=False)
df_ocean = process_df(df_ocean, drop_cols=["depth"], verbose=False)

df_weather = load_weather_data(weather_data_path, WEATHER_POINTS, verbose=False)
df_weather = process_df(df_weather, verbose=False)

# The in-situ processing receives the time span covered by the ocean data
df_insitu = load_insitu_data(verbose=False)
df_insitu = process_flensburg_data(df_insitu, 
                                      start_time=df_ocean['time'].min(),
                                      end_time=df_ocean['time'].max(),
                                      verbose=False)

df_insitu = group_data_hourly(df_insitu)
df_insitu = process_df(df_insitu, drop_cols=["deph"], verbose=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

# === 1. Select the features (time is left out because KMeans cannot handle it)
features = ['latitude', 'longitude', 'sla', ]
X = df_ocean[features].dropna()

# === 2. Standardise the features
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# === 3. KMeans clustering (k=3)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# === 4. Attach the cluster labels to the rows that took part in the clustering
df_clustered = df_ocean.loc[X.index].assign(cluster=clusters)

# === 5. Centre of the data, used as the map centre
mean_lat = df_clustered['latitude'].mean()
mean_lon = df_clustered['longitude'].mean()

# === 6. Draw the map with Basemap
plt.figure(figsize=(10, 8))
m = Basemap(
    projection='lcc',
    resolution='i',
    lat_0=mean_lat,
    lon_0=mean_lon,
    width=1.2e6,
    height=1.2e6,
)

m.drawcoastlines()
m.drawcountries()
m.drawmapboundary(fill_color='lightblue')
m.fillcontinents(color='beige', lake_color='lightblue')

# One colour per cluster
colors = ['red', 'green', 'blue']

# === 7. Plot the points of every cluster in its own colour
for cluster_id, color in enumerate(colors):
    cluster_data = df_clustered[df_clustered['cluster'] == cluster_id]
    # Basemap converts (longitude, latitude) into map coordinates
    x, y = m(cluster_data['longitude'].values, cluster_data['latitude'].values)
    m.scatter(x, y, s=10, c=color, label=f'Cluster {cluster_id}', alpha=0.6)

plt.legend(loc='upper left')
plt.title('KMeans Clustering (k=3) der Ozeandaten')
plt.show()


## Comparison of SLEV and SLA



In [None]:
# Reload all three data sets from scratch (same steps as in the "Load The Data"
# section, but with verbose output switched off) so this section starts from
# freshly loaded frames.
df_ocean = load_ocean_data(ocean_data_path, OCEAN_POINTS, verbose=False)
df_ocean = process_df(df_ocean, drop_cols=["depth"], verbose=False)

df_weather = load_weather_data(weather_data_path, WEATHER_POINTS, verbose=False)
df_weather = process_df(df_weather, verbose=False)

# The in-situ processing receives the time span covered by the ocean data
df_insitu = load_insitu_data(verbose=False)
df_insitu = process_flensburg_data(df_insitu, 
                                      start_time=df_ocean['time'].min(),
                                      end_time=df_ocean['time'].max(),
                                      verbose=False)

df_insitu = group_data_hourly(df_insitu)
df_insitu = process_df(df_insitu, drop_cols=["deph"], verbose=False)

In [None]:
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import pandas as pd


# Find the row of a DataFrame whose coordinates are closest to a target location
def find_closest_location(df: pd.DataFrame, target_lat: float, target_lon: float) -> pd.Series:
    """
    Find the closest location in the DataFrame to the target latitude and longitude.

    The distance is the plain Euclidean distance in degrees between the
    coordinate pairs (no great-circle or cos(latitude) correction), which is
    adequate for ranking points on a small regional grid.

    The input DataFrame is not modified.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data with 'latitude' and 'longitude'.
        target_lat (float): Target latitude.
        target_lon (float): Target longitude.

    Returns:
        pd.Series: A copy of the closest row, extended by a 'distance' entry
        holding its distance to the target (in degrees).

    Raises:
        ValueError: If `df` is empty or every computed distance is NaN.
    """
    if df.empty:
        raise ValueError("Cannot find the closest location in an empty DataFrame.")

    distance = np.sqrt((df["latitude"] - target_lat) ** 2 + (df["longitude"] - target_lon) ** 2)

    # Select by position: a label lookup would return several rows when index
    # labels repeat. nanargmin skips rows with missing coordinates.
    position = int(np.nanargmin(distance.to_numpy(dtype=float)))

    closest = df.iloc[position].copy()
    closest["distance"] = distance.iloc[position]
    return closest


In [None]:
import matplotlib.pyplot as plt
import pandas as pd


def plot_closest_location(df_ocean: pd.DataFrame, target_lat: float, target_lon: float) -> None:
    """
    Draw all ocean data points, the requested target point and the ocean point
    closest to it on a Basemap map.

    Parameters:
        df_ocean (pd.DataFrame): Ocean data with 'latitude' and 'longitude' columns.
        target_lat (float): Latitude of the target point.
        target_lon (float): Longitude of the target point.
    """
    nearest = find_closest_location(df_ocean, target_lat, target_lon)

    plt.figure(figsize=(12, 10))

    # Lambert conformal map centred on the mean position of the ocean data
    m = Basemap(
        projection='lcc',
        resolution='i',
        lat_0=df_ocean["latitude"].mean(),
        lon_0=df_ocean["longitude"].mean(),
        width=1.2e6,
        height=1.2e6,
    )

    # Background: coastlines, borders and land/water fill
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    m.drawmapboundary(fill_color="lightblue")
    m.fillcontinents(color="lightgray", lake_color="lightblue")

    # Each point set is converted from (longitude, latitude) to map
    # coordinates and drawn: all ocean points first, then target and closest point.
    all_x, all_y = m(df_ocean["longitude"].values, df_ocean["latitude"].values)
    m.scatter(all_x, all_y, color="blue", label="Ocean Data", s=10)

    target_x, target_y = m(target_lon, target_lat)
    m.scatter(target_x, target_y, color="#0072B2", marker="*", label="Target Point", s=200)

    nearest_x, nearest_y = m(nearest.longitude, nearest.latitude)
    m.scatter(nearest_x, nearest_y, color="#E69F00", marker="o", label="Closest Point", s=50)

    plt.title("Closest Location in Ocean Data", fontsize=14)
    plt.legend(loc="upper right")

    # Graticule in one-degree steps: parallels labelled on the left, meridians at the bottom
    one_degree_steps = np.arange(-360, 360, 1)
    m.drawparallels(one_degree_steps, labels=[1, 0, 0, 0], fontsize=10)
    m.drawmeridians(one_degree_steps, labels=[0, 0, 0, 1], fontsize=10)
    plt.show()

# Target coordinates for the closest-point lookup; reused by later cells.
# NOTE(review): these differ from LAT_FLENSBURG / LON_FLENSBURG defined in the
# setup cell — confirm that comparing against this point is intended.
target_lat = 54.5
target_lon = 10.0
plot_closest_location(df_ocean, target_lat, target_lon)

In [None]:
def comparison_slev_sla(df_ocean: pd.DataFrame, df_insitu: pd.DataFrame, target_lat: float = 54.5, target_lon: float = 10.0) -> None:
    """
    Compare the observed sea level (SLEV) with the modelled sea level anomaly
    (SLA) at the ocean grid point closest to the target location.

    Two figures are produced: the two time series (raw and as a 24-row rolling
    mean) with Pearson correlation and RMSE in the title, and the
    autocorrelation of both series for 100 lags.

    Parameters:
        df_ocean (pd.DataFrame): Ocean data with 'time', 'latitude', 'longitude' and 'sla'.
        df_insitu (pd.DataFrame): Observation data with 'time' and 'slev'.
        target_lat (float): Latitude of the target location.
        target_lon (float): Longitude of the target location.
    """
    closest_location = find_closest_location(df_ocean, target_lat, target_lon)

    # Keep only the time series of the closest grid point
    at_closest_point = (
        (df_ocean["latitude"] == closest_location["latitude"])
        & (df_ocean["longitude"] == closest_location["longitude"])
    )
    df_ocean_target = df_ocean[at_closest_point].reset_index(drop=True)

    # Pearson correlation and RMSE of SLEV and SLA.
    # The series are paired on their timestamps: pairing on the row index would
    # match rows by position and silently misalign them as soon as one series
    # has a gap or a different start.
    df_corr = pd.merge(
        df_ocean_target[["time", "sla"]],
        df_insitu[["time", "slev"]],
        on="time",
        how="inner",
    ).dropna(subset=["sla", "slev"])
    corr = df_corr["slev"].corr(df_corr["sla"])
    rmse = np.sqrt(np.mean((df_corr["slev"] - df_corr["sla"]) ** 2))

    # Plotting SLEV and SLA
    alpha = 0.8
    window_size = 24  # rows; one day if the data is hourly

    rolling_mean_slev = df_insitu["slev"].rolling(window=window_size).mean()
    rolling_mean_sla = df_ocean_target["sla"].rolling(window=window_size).mean()
    plt.figure(figsize=(12, 6))
    plt.plot(df_insitu["time"], df_insitu["slev"], label="water level Flensburg (SLEV)", color="#0072B2", alpha=0.1)
    # Raw SLA of the closest point only, matching the legend label (not all grid points)
    plt.plot(df_ocean_target["time"], df_ocean_target["sla"], label="water level - closest point (SLA)", color="#E69F00", alpha=0.1)
    plt.plot(df_insitu["time"], rolling_mean_slev, label="water level Flensburg (SLEV) Rolling Mean", color="#0072B2", linestyle="-", alpha=alpha)
    plt.plot(df_ocean_target["time"], rolling_mean_sla, label="water level - closest point (SLA) Rolling Mean", color="#E69F00", linestyle="-", alpha=alpha)

    plt.title(f"Comparison of SLEV and SLA \n Correlation: {corr:.2f}, RMSE: {rmse:.2f}", fontsize=14)
    plt.xlabel("time")
    plt.ylabel("water level [m]")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Autocorrelation of SLEV and SLA for different lags.
    # SLEV is restricted to the period covered by the model series; missing
    # values are dropped because they would turn the whole ACF into NaN.
    in_model_period = df_insitu["time"].between(
        df_ocean_target["time"].min(), df_ocean_target["time"].max()
    )
    slev_series = df_insitu.loc[in_model_period, "slev"].dropna()
    sla_series = df_ocean_target["sla"].dropna()

    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    plot_acf(slev_series, lags=100, ax=ax[0])
    ax[0].set_title("Autocorrelation of SLEV")
    plot_acf(sla_series, lags=100, ax=ax[1])
    ax[1].set_title("Autocorrelation of SLA")
    plt.tight_layout()
    plt.show()


# Run the comparison for the target coordinates defined in the cell above
comparison_slev_sla(df_ocean=df_ocean, df_insitu=df_insitu, target_lat=target_lat, target_lon=target_lon)

In [None]:
# NOTE(review): `dd` is not defined anywhere in the visible notebook, so this cell
# raises NameError — presumably a deliberate stop marker for "Run All"; confirm.
dd

In [None]:
from statsmodels.tsa.stattools import acf, pacf

# ACF and PACF of the observed sea level after differencing with lag 2
n_lags = 100
diff_series = df_insitu['slev'].diff(2)[2:]  # the first two values are NaN after diff(2)
lag_acf = acf(diff_series, nlags=n_lags)
lag_pacf = pacf(diff_series, nlags=n_lags, method='ols')

plt.figure(figsize=(20, 10))
for position, values, title in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(position)
    plt.plot(values)
    # Zero line plus the +/- 1.96 / sqrt(N) band
    for level in (0, -1.96/np.sqrt(len(diff_series)), 1.96/np.sqrt(len(diff_series))):
        plt.axhline(y=level, linestyle='--', color='green')
    plt.title(title)


## Wie verhalten sich Wind und Windrichtung bei den unterschiedlichen Sturmfluten?

In [None]:
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapely.geometry
import xarray as xr
from joblib import Parallel, delayed
from scipy.interpolate import griddata
from tqdm import tqdm


# Land check based on Cartopy's Natural Earth land polygons
def is_on_land(lon, lat):
    """
    Return True if the point (lon, lat) lies inside any polygon of the
    Natural Earth "physical / land" feature at 10m scale, otherwise False.
    """
    land_feature = cfeature.NaturalEarthFeature("physical", "land", "10m")
    point = shapely.geometry.Point(lon, lat)
    return any(polygon.contains(point) for polygon in land_feature.geometries())


# Build the land mask for a regular lon/lat grid
def create_land_mask(lon_grid, lat_grid):
    """
    Evaluate every (lon, lat) combination of the grid in parallel and return a
    boolean array of shape (len(lat_grid), len(lon_grid)).

    Note the polarity: an entry is True where the point is NOT on land.
    """
    def _not_on_land(point):
        return not is_on_land(*point)

    # Row-major order: latitude varies slowest, matching the reshape below
    grid_points = [(lon, lat) for lat in lat_grid for lon in lon_grid]
    flags = Parallel(n_jobs=-1)(
        delayed(_not_on_land)(point) for point in tqdm(grid_points)
    )
    n_rows, n_cols = len(lat_grid), len(lon_grid)
    return np.array(flags).reshape(n_rows, n_cols)




In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt


def lineplot_storm_surge(df, timepoints, ax=None):
    """
    Plot the sea level (SLEV) around a set of storm surge times and mark every
    time with a labelled vertical line.

    Parameters:
        df (pd.DataFrame): Data with a 'time' and a 'slev' column.
        timepoints (iterable): Datetime-like storm surge times. The plotted
            period spans from 62 hours before the earliest to 62 hours after
            the latest time.
        ax (matplotlib.axes.Axes, optional): Axes to draw on; a new figure is
            created when omitted.

    Returns:
        matplotlib.axes.Axes: The axes containing the plot.

    Raises:
        ValueError: If `timepoints` is empty.
    """
    timepoints = list(timepoints)
    if not timepoints:
        raise ValueError("timepoints must contain at least one storm surge time.")

    if ax is None:
        _, ax = plt.subplots(figsize=(18, 6))

    # Restrict the data to the period around the storm surges. The mask is
    # built from `df` itself so the function also works for frames other than
    # the global in-situ data.
    margin = pd.Timedelta(hours=62)
    window_start = min(timepoints) - margin
    window_end = max(timepoints) + margin
    in_window = (df["time"] >= window_start) & (df["time"] <= window_end)
    df_plot = df.loc[in_window].reset_index(drop=True)

    # Plot SLEV
    x = df_plot["time"]
    y = df_plot["slev"]
    ax.plot(x, y, label="SLEV", color="royalblue", linewidth=2)

    # Vertical marker and rotated timestamp label for every storm surge
    for t in timepoints:
        ax.axvline(x=t, ymin=0, ymax=1, color="crimson", linestyle="--", alpha=0.6)
        ax.text(
            t,
            y.mean(),
            t.strftime("%Y-%m-%d %H:%M"),
            color="crimson",
            fontsize=9,
            ha="center",
            rotation=90,
            rotation_mode="anchor"
        )

    # Format the x axis: a tick every four hours, labels rotated on THIS axes
    # (plt.xticks would act on whichever axes happens to be current)
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
    ax.xaxis.set_major_locator(mdates.HourLocator(interval=4))
    ax.tick_params(axis="x", labelrotation=90)

    # Labels & title
    ax.set_title("Sea Level Elevation (SLEV) with Storm Surges", fontsize=14, pad=15)
    ax.set_ylabel("Water Level [m]", fontsize=12)

    # Grid and legend; layout and showing the figure are left to the caller
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.legend(loc="upper left")
    return ax




In [None]:
closest_location = find_closest_location(df_ocean, target_lat, target_lon)
# Filter the data for the closest location
df_ocean_target = df_ocean[
    (df_ocean["latitude"] == closest_location["latitude"])
    & (df_ocean["longitude"] == closest_location["longitude"])
].reset_index(drop=True)

# Calculating Pearson correlation and RMSE of SLEV and SLA
df_corr_ocean = df_ocean_target.copy()
df_corr_insitu = df_insitu.copy()

# Index both frames by their timestamps. Converting the existing integer row
# index with pd.to_datetime would interpret row positions as nanoseconds since
# the epoch, so the merge below would pair rows by position instead of by time.
# The index name is cleared so it does not clash with the 'time' column, which is kept.
df_corr_ocean.index = pd.DatetimeIndex(df_corr_ocean["time"]).rename(None)
df_corr_insitu.index = pd.DatetimeIndex(df_corr_insitu["time"]).rename(None)
df_corr = pd.merge(df_corr_ocean[['sla']], df_corr_insitu[['slev']], left_index=True, right_index=True, how='inner')
corr = df_corr['slev'].corr(df_corr['sla'])
# `rmse` is reused by later cells
rmse = np.sqrt(np.mean((df_corr['slev'] - df_corr['sla']) ** 2))

In [None]:
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


def grid_to_xarray(key, grid, lon_grid, lat_grid):
    """Wrap a 2-D grid in an ``xarray.Dataset`` with latitude/longitude coordinates.

    Args:
        key: Name of the single data variable in the resulting dataset.
        grid: 2-D array indexed as ``(latitude, longitude)``.
        lon_grid: 1-D longitude coordinate values.
        lat_grid: 1-D latitude coordinate values.

    Returns:
        The dataset holding ``grid`` under the variable name ``key``.
    """
    data_vars = {f"{key}": (("latitude", "longitude"), grid)}
    coords = {"latitude": lat_grid, "longitude": lon_grid}
    return xr.Dataset(data_vars, coords=coords)


def _quiver_stride(grid_size):
    """Return the subsampling step for the current arrows on a grid of ``grid_size`` points."""
    # (largest grid size, stride) pairs
    steps = ((100, 1), (200, 4), (300, 6), (400, 8), (500, 12), (600, 16), (700, 20), (800, 24))
    for limit, stride in steps:
        if grid_size <= limit:
            return stride
    # Grids above 800 points used to leave the stride undefined (UnboundLocalError);
    # continue the +4 per 100 points trend of the table instead.
    return int(24 + 4 * -(-(grid_size - 800) // 100))


def plot_for_timepoint(timepoint, ax=None, grid_size_ocean=50, wind_grid_size=20, vmin=-1.0, vmax=1.5, plot_water_velocity_data=True, plot_wind_data=True):
    """Draw the sea-level map for one timestamp, optionally with current and wind arrows.

    Reads the notebook globals ``df_ocean``, ``df_weather`` and ``rmse`` and relies on
    ``create_land_mask`` and ``grid_to_xarray`` defined elsewhere in the notebook.

    Args:
        timepoint: Value compared against the ``time`` column of both tables.
        ax: Matplotlib axes to draw on; a new 12x10 inch figure is created when omitted.
        grid_size_ocean: Number of grid points per axis for the interpolated ocean fields.
        wind_grid_size: Number of grid points per axis for the interpolated wind field.
        vmin: Lower colour limit of the sea-level heatmap.
        vmax: Upper colour limit of the sea-level heatmap.
        plot_water_velocity_data: Draw current arrows and report the maximum current
            speed inside the target box.
        plot_wind_data: Draw wind arrows and report the maximum wind speed inside the
            target box.

    Returns:
        The mesh returned by ``Basemap.pcolormesh`` (usable for a shared colourbar).

    Raises:
        ValueError: If the ocean table (or, with ``plot_wind_data``, the weather table)
            has no rows for ``timepoint``.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(12, 10))

    df_weather_time = df_weather[df_weather["time"] == timepoint]
    df_ocean_time = df_ocean[df_ocean["time"] == timepoint]
    if df_ocean_time.empty:
        raise ValueError(f"No ocean data available for timepoint {timepoint}")
    if plot_wind_data and df_weather_time.empty:
        raise ValueError(f"No weather data available for timepoint {timepoint}")

    ocean_points = (df_ocean_time["longitude"], df_ocean_time["latitude"])

    # Shift the modelled sea level by the global RMSE (same offset as before), but on a
    # new Series instead of writing into a filtered slice of df_ocean.
    # NOTE(review): adding the RMSE as a constant offset is kept as found - confirm intent.
    sla_shifted = df_ocean_time["sla"] + rmse

    lon_grid = np.linspace(df_ocean_time["longitude"].min(), df_ocean_time["longitude"].max(), grid_size_ocean)
    lat_grid = np.linspace(df_ocean_time["latitude"].min(), df_ocean_time["latitude"].max(), grid_size_ocean)
    lon_mesh, lat_mesh = np.meshgrid(lon_grid, lat_grid)

    sla_grid = griddata(ocean_points, sla_shifted, (lon_mesh, lat_mesh), method="linear")

    m = Basemap(
        projection="cyl",
        resolution="i",
        llcrnrlon=lon_grid.min(),
        urcrnrlon=lon_grid.max(),
        llcrnrlat=lat_grid.min(),
        urcrnrlat=lat_grid.max(),
        ax=ax,
    )
    m.fillcontinents(color="grey", lake_color="white", alpha=0.5)
    m.drawcoastlines()
    m.drawcountries()

    # Target point (argument order: longitude, latitude)
    x_target, y_target = m(LON_FLENSBURG, LAT_FLENSBURG)
    m.scatter(x_target, y_target, color="green", marker="*", label="Target Point", s=200)

    # Rectangle around the target point; also the region for the reported maxima
    sub_box = {
        "lat_min": 54.4,
        "lat_max": 55.5,
        "lon_min": 9.2,
        "lon_max": 10.5,
    }
    x_box = [sub_box["lon_min"], sub_box["lon_max"], sub_box["lon_max"], sub_box["lon_min"], sub_box["lon_min"]]
    y_box = [sub_box["lat_min"], sub_box["lat_min"], sub_box["lat_max"], sub_box["lat_max"], sub_box["lat_min"]]
    x_box, y_box = m(x_box, y_box)
    ax.plot(x_box, y_box, color="green", linestyle="--", linewidth=2, label="Region of Interest")

    # Blank every grid cell outside the mask (presumably land - see create_land_mask)
    mask = create_land_mask(lon_grid, lat_grid)
    sla_grid[~mask] = np.nan

    x_mesh, y_mesh = m(lon_mesh, lat_mesh)
    heatmap = m.pcolormesh(x_mesh, y_mesh, sla_grid, cmap="magma", shading="auto", vmin=vmin, vmax=vmax)

    # Lines appended to the title; only layers that are actually plotted contribute,
    # so disabling a layer no longer ends in a NameError.
    title_lines = [f"Time: {pd.to_datetime(timepoint).strftime('%Y-%m-%d %H:%M')}"]
    water_line = None

    # Water velocity: eastward (uo) and northward (vo) components
    if plot_water_velocity_data:
        water_uo = griddata(ocean_points, df_ocean_time["uo"], (lon_mesh, lat_mesh), method="linear")
        water_vo = griddata(ocean_points, df_ocean_time["vo"], (lon_mesh, lat_mesh), method="linear")
        water_uo[~mask] = np.nan
        water_vo[~mask] = np.nan

        # Thin out the arrows on dense grids
        stride = _quiver_stride(grid_size_ocean)
        m.quiver(
            x_mesh[::stride, ::stride],
            y_mesh[::stride, ::stride],
            water_uo[::stride, ::stride],
            water_vo[::stride, ::stride],
            scale=20,
            color="grey",
            width=0.002,
            alpha=0.99,
            label="Current",
        )

        # Each component is stored under its own name (the keys used to be swapped)
        ds_ocean_uo_vo = xr.merge(
            [
                grid_to_xarray("uo", water_uo, lon_grid, lat_grid),
                grid_to_xarray("vo", water_vo, lon_grid, lat_grid),
            ]
        )
        ds_ocean_uo_vo["velocity"] = np.sqrt(ds_ocean_uo_vo["uo"] ** 2 + ds_ocean_uo_vo["vo"] ** 2)

        # Maximum current speed inside the target box
        ds_ocean_uo_vo_bbox = ds_ocean_uo_vo.sel(
            latitude=slice(sub_box["lat_min"], sub_box["lat_max"]),
            longitude=slice(sub_box["lon_min"], sub_box["lon_max"]),
        )
        max_ocean_velocity = float(ds_ocean_uo_vo_bbox["velocity"].max())
        water_line = f" Max Water Velocity at Target Box {max_ocean_velocity:.3f} m/s"

    # Wind data
    if plot_wind_data:
        wind_points = (df_weather_time["longitude"], df_weather_time["latitude"])
        lon_grid_wind = np.linspace(df_weather_time["longitude"].min(), df_weather_time["longitude"].max(), wind_grid_size)
        lat_grid_wind = np.linspace(df_weather_time["latitude"].min(), df_weather_time["latitude"].max(), wind_grid_size)
        lon_mesh_wind, lat_mesh_wind = np.meshgrid(lon_grid_wind, lat_grid_wind)
        wind_mesh = (lon_mesh_wind, lat_mesh_wind)

        wind_speed_grid = griddata(wind_points, df_weather_time["wind_speed_10m"], wind_mesh, method="linear")

        # Meteorological convention: the direction is where the wind comes FROM, measured
        # clockwise from north, so u = -speed*sin(dir) and v = -speed*cos(dir).
        # The components are computed at the data points and interpolated afterwards;
        # interpolating the angle itself breaks at the 0/360 degree wrap-around.
        wind_dir_rad = np.deg2rad(df_weather_time["wind_direction_10m"])
        u_points = -df_weather_time["wind_speed_10m"] * np.sin(wind_dir_rad)
        v_points = -df_weather_time["wind_speed_10m"] * np.cos(wind_dir_rad)
        u = griddata(wind_points, u_points, wind_mesh, method="linear")
        v = griddata(wind_points, v_points, wind_mesh, method="linear")

        x_wind, y_wind = m(lon_mesh_wind, lat_mesh_wind)
        m.quiver(x_wind, y_wind, u, v, scale=1500, color="black")

        # Maximum wind speed inside the target box
        ds_wind_speed = grid_to_xarray("wind_speed", wind_speed_grid, lon_grid_wind, lat_grid_wind)
        ds_wind_speed_bbox = ds_wind_speed.sel(
            latitude=slice(sub_box["lat_min"], sub_box["lat_max"]),
            longitude=slice(sub_box["lon_min"], sub_box["lon_max"]),
        )
        max_wind_speed = float(ds_wind_speed_bbox["wind_speed"].max())
        title_lines.append(f" Max Wind Speed at Target Box {max_wind_speed:.3f} km/h ")

    if water_line is not None:
        title_lines.append(water_line)

    ax.set_title("\n".join(title_lines), fontsize=14, pad=15)
    m.drawparallels(np.arange(0, 360, 2), labels=[1, 0, 0, 0])
    m.drawmeridians(np.arange(0, 350, 2), labels=[0, 0, 0, 1])

    return heatmap


def analyse_storm_surges(title: str = "Storm Surges Analysis", timepoints: list = None, grid_size_ocean=50, wind_grid_size=20, plot_water_velocity_data=True, plot_wind_data=True, save=False):
    """Plot the Flensburg water-level time series above up to four storm-surge maps.

    Uses the notebook globals ``df_ocean`` and ``df_insitu`` as well as
    ``lineplot_storm_surge`` and ``plot_for_timepoint``.

    Args:
        title: Figure title; also the prefix of the file name when ``save`` is set.
        timepoints: One to four timestamps. Entries 0 and 1 go to the lower row
            (left, right), entries 2 and 3 to the upper row.
        grid_size_ocean: Ocean grid size forwarded to ``plot_for_timepoint``.
        wind_grid_size: Wind grid size forwarded to ``plot_for_timepoint``.
        plot_water_velocity_data: Draw ocean-current arrows on the maps.
        plot_wind_data: Draw wind arrows on the maps.
        save: Write the figure to ``{title}_{grid_size_ocean}ozean_{wind_grid_size}wind.png``.

    Raises:
        ValueError: If ``timepoints`` is missing, empty or has more than four entries.
    """
    if timepoints is None or len(timepoints) == 0:
        raise ValueError("timepoints must contain at least one timestamp")
    if len(timepoints) > 4:
        raise ValueError("the 2x2 map layout supports at most four timepoints")

    fig = plt.figure(figsize=(20, 19))

    # 3 rows x 2 columns: time series on top, 2x2 maps below
    gs = gridspec.GridSpec(3, 2, height_ratios=[0.4, 0.8, 1], hspace=0.2, wspace=0.15)

    timepoint_min = min(timepoints)
    timepoint_max = max(timepoints)

    # Restrict the data to the surge window, padded by 62 hours on both sides
    padding = pd.Timedelta(hours=62)
    df_plot = df_ocean.loc[
        (df_ocean["time"] >= timepoint_min - padding)
        & (df_ocean["time"] <= timepoint_max + padding)
    ].reset_index(drop=True)

    # Shared colour range of all maps
    min_sla = df_plot["sla"].min()
    max_sla = df_plot["sla"].max()
    print(f"Min SLA: {min_sla}, Max SLA: {max_sla}")

    # Time series across both columns of the top row
    ax_line = fig.add_subplot(gs[0, :])
    lineplot_storm_surge(df_insitu, timepoints, ax=ax_line)
    ax_line.set_title("Time Graph of Water Level Elevation in Flensburg")
    ax_line.set_position([0.1, 0.75, 0.8, 0.2])

    # 2x2 maps below the time series
    min_sla_round = np.round(min_sla, 1)
    max_sla_round = np.round(max_sla, 1)
    heatmaps = []
    for i, timepoint in enumerate(timepoints):
        ax = fig.add_subplot(gs[i // 2 + 1, i % 2])
        # [left, bottom, width, height] - the first two entries end up in the lower row
        ax.set_position([0.1 + (i % 2) * 0.4, 0.1 + (i // 2) * 0.23, 0.35, 0.35])
        heatmaps.append(
            plot_for_timepoint(
                timepoint,
                ax,
                grid_size_ocean,
                wind_grid_size,
                min_sla_round,
                max_sla_round,
                plot_water_velocity_data=plot_water_velocity_data,
                plot_wind_data=plot_wind_data,
            )
        )

    # Shared colourbar to the right of the maps
    cbar_ax = fig.add_axes([0.87, 0.17, 0.02, 0.38])  # [left, bottom, width, height]
    cbar = fig.colorbar(heatmaps[0], cax=cbar_ax, orientation="vertical")
    cbar.set_ticks(np.linspace(min_sla_round, max_sla_round, num=5))
    cbar.set_label("Water Level (m)")

    legend_elements = [
        Line2D([0], [0], color='green', lw=2, label='Target Box', linestyle='--'),
        Line2D([0], [0], color='green', marker='*', markersize=15, linestyle='', label='Flensburg'),
    ]
    if plot_water_velocity_data:
        legend_elements.append(Line2D([0], [0], color='grey', lw=4, marker=r'$\rightarrow$', label='Ocean Current', linestyle=''))
    if plot_wind_data:
        legend_elements.append(Line2D([0], [0], color='black', lw=4, marker=r'$\rightarrow$', label='Wind Direction', linestyle=''))

    # Legend outside the maps, next to the colourbar
    fig.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(0.85, 0.59), frameon=True, title="Vector Legend")

    # The title has to be set before saving, otherwise the saved image lacks it
    fig.suptitle(title, fontsize=16, y=0.99)

    if save:
        fig_name = f'{title}_{grid_size_ocean}ozean_{wind_grid_size}wind.png'
        fig.savefig(fig_name, bbox_inches='tight', dpi=300)

    plt.show()


In [None]:
#plot_for_timepoint(timepoint="2023-01-01 00:00:00", ax=None, grid_size_ocean=50, wind_grid_size=20, vmin=-1.0, vmax=1.5, plot_water_velocity_data=True, plot_wind_data=True)

In [None]:
# Grid sizes (points per axis) handed to analyse_storm_surges() in the cells below
grid_size_ocean = 200
wind_grid_size = 25

In [None]:
# Storm surge analysis from 2023-10-19 to 2023-10-21.
# The list order decides the panel position (see the trailing comments).
timepoints = [
    pd.Timestamp("2023-10-20 21:00"),  # bottom left
    pd.Timestamp("2023-10-21 16:00"),  # bottom right
    pd.Timestamp("2023-10-19 04:00"),  # top left
    pd.Timestamp("2023-10-20 13:00"),  # top right
]

# The former module-level call lineplot_storm_surge(df_insitu, timepoints, ax=ax_line)
# was removed: ax_line only exists inside analyse_storm_surges(), which draws the
# time-series panel itself, so the call raised a NameError here.

analyse_storm_surges(
    title="Storm Surges Analysis from 2023-10-19 to 2023-10-21",
    timepoints=timepoints,
    grid_size_ocean=grid_size_ocean,
    wind_grid_size=wind_grid_size,
    plot_water_velocity_data=True,
    plot_wind_data=True,
    save=True,
)

In [None]:
# Storm surge from 2024-01-02 to 2024-01-05; the list order fixes the panel layout.
timepoints = [
    pd.Timestamp(stamp)
    for stamp in (
        "2024-01-03 22:00",  # bottom left
        "2024-01-05 05:00",  # bottom right
        "2024-01-02 18:00",  # top left
        "2024-01-03 01:00",  # top right
    )
]

analyse_storm_surges(
    "Storm Surges Analysis from 2024-01-02 to 2024-01-05",
    timepoints,
    grid_size_ocean,
    wind_grid_size,
    plot_water_velocity_data=True,
    plot_wind_data=True,
    save=True,
)

In [None]:
# Storm surge from 2024-02-08 to 2024-02-12; the leading number is the chronological
# order, the position refers to the panel in the figure.
timepoints = [
    pd.Timestamp(stamp)
    for stamp in (
        "2024-02-11 13:00",  # 3, bottom left
        "2024-02-12 04:00",  # 4, bottom right
        "2024-02-08 18:00",  # 1, top left
        "2024-02-09 15:00",  # 2, top right
    )
]
# Earliest and latest of the selected timestamps
timepoint_min, timepoint_max = min(timepoints), max(timepoints)

analyse_storm_surges(
    "Storm Surges Analysis from 2024-02-08 to 2024-02-12",
    timepoints,
    grid_size_ocean,
    wind_grid_size,
    plot_water_velocity_data=True,
    plot_wind_data=True,
    save=True,
)

## Wie verhalten sich Wassergeschwindigkeit und -richtung bei den unterschiedlichen Sturmfluten?

In [None]:
# # Analyse von 2023-10-19 bis 2023-10-21
# timepoints = [
#     pd.Timestamp("2023-10-20 21:00"), # links unten
#     pd.Timestamp("2023-10-21 16:00"), # rechts unten
#     pd.Timestamp("2023-10-19 04:00"), # links oben
#     pd.Timestamp("2023-10-20 13:00"), # rechts oben
# ]

# analyse_storm_surges(
#     title="Storm Surges Analysis from 2023-10-19 to 2023-10-21", 
#     timepoints=timepoints, 
#     grid_size_ocean=grid_size_ocean, 
#     wind_grid_size=wind_grid_size,
#     plot_water_velocity_data=True,
#     plot_wind_data=False,
#     save=True
#     )

In [None]:
# # Analyse von 2024-01-02 bis 2024-01-05
# timepoints = [
#     pd.Timestamp("2024-01-03 22:00"), # links unten
#     pd.Timestamp("2024-01-05 05:00"), # rechts unten
#     pd.Timestamp("2024-01-02 18:00"), # links oben
#     pd.Timestamp("2024-01-03 01:00"), # rechts oben
# ]
# # 
# analyse_storm_surges(
#     title="Storm Surges Analysis from 2024-01-02 to 2024-01-05", 
#     timepoints=timepoints, 
#     grid_size_ocean=grid_size_ocean, 
#     wind_grid_size=wind_grid_size,
#     plot_water_velocity_data=True,
#     plot_wind_data=False,
#     save=True
#     )

In [None]:
# # Analyse von 2024-02-08 bis 2024-02-12
# timepoints = [
#     pd.Timestamp("2024-02-11 13:00"), # 3 links unten
#     pd.Timestamp("2024-02-12 04:00"), # 4 rechts unten
#     pd.Timestamp("2024-02-08 18:00"), # 1 links oben
#     pd.Timestamp("2024-02-09 15:00"), # 2 rechts oben
# ]

# analyse_storm_surges(
#     title="Storm Surges Analysis from 2024-02-08 to 2024-02-12", 
#     timepoints=timepoints, 
#     grid_size_ocean=grid_size_ocean, 
#     wind_grid_size=wind_grid_size,
#     plot_water_velocity_data=True,
#     plot_wind_data=False,
#     save=True
#     )

## Korrelation zwischen SLEV und den Features auf der Karte

In [None]:
# Ocean model data, without the "depth" column
df_ocean = process_df(
    load_ocean_data(ocean_data_path, OCEAN_POINTS, verbose=False),
    drop_cols=["depth"],
    verbose=False,
)

# Weather data
df_weather = process_df(
    load_weather_data(weather_data_path, WEATHER_POINTS, verbose=False),
    verbose=False,
)

# In-situ gauge data, limited to the time span covered by the ocean data
df_insitu = process_flensburg_data(
    load_insitu_data(verbose=False),
    start_time=df_ocean['time'].min(),
    end_time=df_ocean['time'].max(),
    verbose=False,
)
# Hourly grouping, then the common post-processing without the "deph" column
df_insitu = process_df(group_data_hourly(df_insitu), drop_cols=["deph"], verbose=False)


In [None]:


resolution = 0.25  # grid spacing in degrees


def interpolate_xarray(ds, resolution=0.25, make_fine_grid=True, interpolate_nan=True):
    """Fill NaN gaps in a dataset and optionally resample it onto a regular lat/lon grid.

    Args:
        ds (xarray.Dataset): Dataset with ``time``, ``latitude`` and ``longitude`` dims.
        resolution (float): Spacing of the new latitude/longitude axes.
        make_fine_grid (bool): Resample onto axes built with ``np.arange`` from the
            coordinate minimum up to, but not including, the coordinate maximum.
        interpolate_nan (bool): Linearly fill NaNs along time, then latitude, then
            longitude before any resampling.

    Returns:
        xarray.Dataset: The processed dataset.
    """
    if interpolate_nan:
        # The order of the dimensions matters: earlier fills feed later ones
        for dim in ("time", "latitude", "longitude"):
            ds = ds.interpolate_na(dim=dim, method="linear")

    if not make_fine_grid:
        return ds

    lat_axis = np.arange(ds.latitude.min(), ds.latitude.max(), resolution)
    lon_axis = np.arange(ds.longitude.min(), ds.longitude.max(), resolution)
    return ds.interp(latitude=lat_axis, longitude=lon_axis, method="linear")

# Turn both DataFrames into xarray datasets indexed by (time, latitude, longitude)
# and resample each onto a regular grid with spacing `resolution`.
ds_ocean = df_ocean.set_index(["time", "latitude", "longitude"]).to_xarray()
ds_ocean_interp = interpolate_xarray(ds_ocean, resolution=resolution, make_fine_grid=True, interpolate_nan=True)

ds_weather = df_weather.set_index(["time", "latitude", "longitude"]).to_xarray()
ds_weather_interp = interpolate_xarray(ds_weather, resolution=resolution, make_fine_grid=True, interpolate_nan=True)

# Align the time axes: keep only timestamps present in both datasets
common_time = np.intersect1d(ds_ocean_interp.time.values, ds_weather_interp.time.values) # common time points
ds_ocean_interp = ds_ocean_interp.sel(time=common_time)
ds_weather_interp = ds_weather_interp.sel(time=common_time)

# Align the latitude/longitude axes: put the weather data on the ocean grid
ds_weather_interp = ds_weather_interp.interp(
    latitude=ds_ocean_interp.latitude,
    longitude=ds_ocean_interp.longitude
)

# Second pass without regridding: only fills NaNs that remain after the alignment
ds_weather_interp = interpolate_xarray(ds_weather_interp, resolution=resolution, make_fine_grid=False, interpolate_nan=True)
ds_ocean_interp = interpolate_xarray(ds_ocean_interp, resolution=resolution, make_fine_grid=False, interpolate_nan=True)




def create_ocean_mask(ds, land):
    """Build a boolean (latitude, longitude) mask that is True where a grid point is not on land.

    Args:
        ds: xarray object providing 1-D ``latitude`` and ``longitude`` coordinates.
        land: GeoDataFrame with the land polygons; its CRS is assigned to the grid points.

    Returns:
        xarray.DataArray: Boolean mask with dims ``("latitude", "longitude")``.
    """
    lon, lat = np.meshgrid(ds.longitude.values, ds.latitude.values)

    # Vectorised point construction instead of one shapely Point per Python iteration
    points_gdf = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(lon.ravel(), lat.ravel()),
        crs=land.crs,
    )
    joined = gpd.sjoin(points_gdf, land, predicate="within", how="left")

    # A point inside several overlapping polygons appears once per match in the left
    # join; collapse to one flag per point so the reshape below cannot fail.
    on_land = joined["index_right"].notna().groupby(level=0).any()

    mask_land = on_land.to_numpy().reshape(lat.shape)
    mask_ocean = ~mask_land

    return xr.DataArray(
        mask_ocean,
        coords={"latitude": ds.latitude, "longitude": ds.longitude},
        dims=["latitude", "longitude"],
    )

# Land polygons from the "naturalearth.land" dataset
land = gpd.read_file(geodatasets.get_path("naturalearth.land"))
# Mask is True for grid points that are not on land (see create_ocean_mask)
ocean_mask = create_ocean_mask(ds_weather_interp, land)
# .where() replaces values at masked-out (land) points with NaN
ds_weather_ocean_only = ds_weather_interp.where(ocean_mask)
ds_ocean_ocean_only = ds_ocean_interp.where(ocean_mask)

# Combined dataset: ocean variables masked over land, weather variables unmasked.
# NOTE(review): ds_weather_ocean_only is not used in this merge - confirm that keeping
# the weather variables over land is intended.
ds_ocean_weather_interp = xr.merge([ds_ocean_ocean_only, ds_weather_interp])


In [None]:
# Show the merged ocean + weather dataset as the cell output
ds_ocean_weather_interp

In [None]:


def _masked_pearson(series, reference, max_nan_fraction=0.3, min_samples=10):
    """Return Pearson's r of two equally long 1-D arrays, ignoring NaN pairs (NaN if too sparse)."""
    if series.size == 0:
        return np.nan
    missing = np.isnan(series)
    # Too many gaps in the gridded series: no meaningful correlation
    if missing.mean() > max_nan_fraction:
        return np.nan
    valid = ~missing & ~np.isnan(reference)
    if valid.sum() < min_samples:
        return np.nan
    return np.corrcoef(series[valid], reference[valid])[0, 1]


def _draw_land(basemap, ax):
    """Draw coastlines, country borders, the map boundary and filled continents on ``ax``."""
    basemap.drawcoastlines(ax=ax)
    basemap.drawcountries(ax=ax)
    basemap.drawmapboundary(fill_color='white', ax=ax)
    basemap.fillcontinents(color='lightgrey', lake_color='white', ax=ax)


def calculate_correlation_temporal_spatial(ds, df_ref, variable='sla', start_date=None, end_date=None, ax=None, title=None):
    """Map the Pearson correlation between ``ds[variable]`` and ``df_ref['slev']`` per grid point.

    For every (latitude, longitude) cell, the time series of ``variable`` is correlated
    with the reference series on the timestamps both share. Cells with more than 30 %
    missing values or fewer than 10 valid pairs stay NaN.

    Args:
        ds (xarray.Dataset): Gridded data with ``time``, ``latitude``, ``longitude`` dims.
        df_ref (pandas.DataFrame): Reference data with the columns ``time``, ``slev``,
            ``longitude`` and ``latitude``.
        variable (str): Name of the variable in ``ds`` to correlate.
        start_date: Start of the time window; defaults to the first time in ``ds``.
        end_date: End of the time window; defaults to the last time in ``ds``.
        ax: Axes to draw on. Without it a new figure is created and shown.
        title (str): Axes title; defaults to the date range of the window.

    Returns:
        The mesh returned by ``pcolormesh`` for the correlation heatmap.
    """
    # For these (ocean model) variables the land is drawn on top of the heatmap; for
    # every other variable the heatmap is drawn over the land.
    ocean_variables = {'bottomT', 'mlotst', 'siconc', 'sithick',
                       'sla', 'so', 'sob', 'thetao', 'uo', 'vo', 'wo'}

    # Start and end may be given independently; a missing bound falls back to the
    # extent of the dataset. Without any bound nothing is filtered.
    window_requested = start_date is not None or end_date is not None
    if start_date is None:
        start_date = ds['time'].min().values
    if end_date is None:
        end_date = ds['time'].max().values

    if window_requested:
        in_window = (df_ref['time'] >= start_date) & (df_ref['time'] <= end_date)
        df_ref_filtered = df_ref[in_window]
        ds_filtered = ds.sel(time=slice(start_date, end_date))
    else:
        df_ref_filtered = df_ref
        ds_filtered = ds

    # Reference series and the timestamps shared with the gridded data
    slev_times = pd.to_datetime(df_ref_filtered['time'])
    slev_values = df_ref_filtered['slev'].to_numpy()
    ocean_times = pd.to_datetime(ds_filtered['time'].values)
    _, idx_slev, idx_ocean = np.intersect1d(slev_times, ocean_times, return_indices=True)
    slev_common = slev_values[idx_slev]

    # Read the field once (time first) instead of calling isel() for every grid point
    field = ds_filtered[variable].transpose('time', 'latitude', 'longitude', ...).values[idx_ocean]

    n_lat = len(ds_filtered.latitude)
    n_lon = len(ds_filtered.longitude)
    correlations = np.full((n_lat, n_lon), np.nan)
    for i, j in np.ndindex(n_lat, n_lon):
        correlations[i, j] = _masked_pearson(field[:, i, j], slev_common)

    # Plot the correlation map
    created_fig = ax is None
    if created_fig:
        _, ax = plt.subplots(figsize=(14, 12))

    lon_values = ds_filtered.longitude.values
    lat_values = ds_filtered.latitude.values
    m = Basemap(projection='cyl', resolution='i',
                llcrnrlon=float(lon_values.min()), urcrnrlon=float(lon_values.max()),
                llcrnrlat=float(lat_values.min()), urcrnrlat=float(lat_values.max()), ax=ax)

    is_ocean_variable = variable in ocean_variables
    if not is_ocean_variable:
        _draw_land(m, ax)

    # Always created, so variables outside the ocean list no longer leave it unbound
    heatmap = ax.pcolormesh(lon_values, lat_values, correlations, cmap='coolwarm', vmin=-1, vmax=1)
    ax.figure.colorbar(heatmap, ax=ax, label='Pearson correlation coefficient', orientation='horizontal', pad=0.05)

    if is_ocean_variable:
        _draw_land(m, ax)

    m.drawparallels(np.arange(-360., 360, 1.), labels=[1, 0, 0, 0])
    m.drawmeridians(np.arange(-360., 360, 1.), labels=[0, 0, 0, 1])

    # One marker per distinct station position instead of one per data row
    station = df_ref_filtered[['longitude', 'latitude']].drop_duplicates()
    ax.scatter(station['longitude'], station['latitude'], c='green', s=200, label='Flensburg', marker='*')

    if title is None:
        start_date_str = pd.to_datetime(start_date).strftime('%Y-%m-%d %H:%M')
        end_date_str = pd.to_datetime(end_date).strftime('%Y-%m-%d %H:%M')
        title = f'{start_date_str} to {end_date_str}'

    # Address the axes explicitly instead of relying on pyplot's current axes
    ax.set_title(title, fontsize=10, pad=15)
    ax.legend(loc='upper right')
    ax.figure.tight_layout()

    if created_fig:
        plt.show()

    return heatmap





In [None]:
# Candidate variables for the correlation maps; the commented lines are alternatives
variables = ['sla', 'uo', 'vo', 'wind_speed_10m', 'wind_direction_10m', 'surface_pressure']
#variables = ds_ocean_weather_interp.data_vars.keys()
#variables = ['wind_speed_10m']

# NOTE(review): the `break` ends the loop after the first variable, so only
# variables[0] is plotted - presumably a quick check; confirm before relying on it.
for variable in variables:
    print(f"Calculating correlation for {variable}")
    calculate_correlation_temporal_spatial(ds_ocean_weather_interp, df_insitu, variable=variable, 
                                           #start_date="2023-10-19", end_date="2023-10-21"
                                           )
    break


In [None]:


# Full time span of the merged dataset, formatted as "YYYY-MM-DD HH:MM" strings
start_date, end_date = (
    pd.to_datetime(bound.values).strftime('%Y-%m-%d %H:%M')
    for bound in (
        ds_ocean_weather_interp['time'].min(),
        ds_ocean_weather_interp['time'].max(),
    )
)

# Variables to correlate with the Flensburg water level
variables = ['sla', 'uo', 'vo', 'wind_speed_10m', 'wind_direction_10m', 'surface_pressure']
variables

def plot_correlations(variables, start_date, end_date):
    """Plot one correlation map per variable against the Flensburg SLEV in a 3-column grid.

    Uses the notebook globals ``ds_ocean_weather_interp`` and ``df_insitu``.

    Args:
        variables: Iterable of variable names in ``ds_ocean_weather_interp``.
        start_date: Start of the analysed time window.
        end_date: End of the analysed time window.

    Raises:
        ValueError: If ``variables`` is empty.
    """
    # Accept any iterable, e.g. the keys view of Dataset.data_vars
    variables = list(variables)
    if not variables:
        raise ValueError("variables must contain at least one variable name")

    n_cols = 3
    n_rows = (len(variables) + n_cols - 1) // n_cols  # ceiling division

    fig = plt.figure(figsize=(8 * n_cols, 5 * n_rows))
    gs = gridspec.GridSpec(n_rows, n_cols, figure=fig)

    fig.suptitle(f"Correlation between Ocean Variables and Flensburg SLEV from {start_date} to {end_date}", fontsize=20, y=0.98)

    for idx, variable in enumerate(variables):
        print(f"Calculating correlation for {variable}")
        ax = fig.add_subplot(gs[idx])
        calculate_correlation_temporal_spatial(
            ds_ocean_weather_interp,
            df_insitu,
            variable=variable,
            ax=ax,
            start_date=start_date,
            end_date=end_date,
            # The date range is already in the figure title; label each panel with its
            # variable, otherwise all panels carry the same title.
            title=str(variable),
        )

    # subplots_adjust instead of tight_layout to keep room for the figure title
    fig.subplots_adjust(top=0.92, hspace=0.1, wspace=0.1)
    plt.show()





def compare_correlations(timepoints:list, variable:str):
    """Plot correlation maps of one variable for a +/- 3 day window around each timepoint.

    Uses the notebook globals ``ds_ocean_weather_interp``, ``df_insitu``,
    ``WEATHER_DICT`` and ``OCEAN_DICT``.

    Args:
        timepoints: Timestamps at the centre of each analysed window.
        variable: Name of the variable in ``ds_ocean_weather_interp``.

    Raises:
        ValueError: If ``timepoints`` is empty.
    """
    if len(timepoints) == 0:
        raise ValueError("timepoints must contain at least one timestamp")

    n_cols = 3
    n_rows = (len(timepoints) + n_cols - 1) // n_cols  # ceiling division
    fig = plt.figure(figsize=(8 * n_cols, 5 * n_rows))
    gs = gridspec.GridSpec(n_rows, n_cols, figure=fig)

    # Look up the readable name. This has to be ONE if/elif/else chain: with two
    # separate `if` statements the final `else` overwrote the weather description
    # with the raw variable name. The ocean dictionary keeps precedence, as before.
    if variable in OCEAN_DICT:
        var_name = OCEAN_DICT[variable]['description'].lower()
        var_explanation = OCEAN_DICT[variable]['explanation']
    elif variable in WEATHER_DICT:
        var_name = WEATHER_DICT[variable]['description'].lower()
        var_explanation = WEATHER_DICT[variable]['explanation']
    else:
        var_name = variable
        var_explanation = ""
    fig.suptitle(f"Correlation between Flensburg SLEV and {var_name} \n{var_explanation}", fontsize=20, y=0.98)

    for idx, timepoint in tqdm(enumerate(timepoints), total=len(timepoints)):
        window = pd.Timedelta(days=3)
        ax = fig.add_subplot(gs[idx])
        calculate_correlation_temporal_spatial(
            ds_ocean_weather_interp,
            df_insitu,
            variable=variable,
            ax=ax,
            start_date=timepoint - window,
            end_date=timepoint + window,
        )

    # subplots_adjust instead of tight_layout to keep room for the figure title
    fig.subplots_adjust(top=0.92, hspace=0.1, wspace=0.1)
    plt.show()



In [None]:
import datetime
# Timestamps present in both the ocean and the weather table, in ascending order
timepoints_all = sorted(set(df_ocean["time"]).intersection(df_weather["time"]))

# Storm-surge timestamps as (year, month, day, hour, minute)
sturm_surge_list = [
    datetime.datetime(*stamp)
    for stamp in (
        (2023, 2, 25, 17, 0),
        (2023, 4, 1, 12, 0),
        (2023, 10, 7, 20, 0),
        (2023, 10, 20, 0, 0),
        (2024, 1, 3, 9, 0),
        (2024, 2, 9, 18, 0),
        (2024, 12, 9, 16, 0),
    )
]

# One comparison figure per variable of the merged dataset
variables = ds_ocean_weather_interp.data_vars.keys()
for variable in variables:
    print(f"Calculating correlation for {variable}")
    compare_correlations(sturm_surge_list, variable)



In [None]:
# Full time span of the merged dataset, formatted as "YYYY-MM-DD HH:MM" strings
start_date, end_date = (
    pd.to_datetime(bound.values).strftime('%Y-%m-%d %H:%M')
    for bound in (
        ds_ocean_weather_interp['time'].min(),
        ds_ocean_weather_interp['time'].max(),
    )
)

# Correlation maps for every variable over the whole period
variables = ds_ocean_weather_interp.data_vars.keys()
plot_correlations(variables, start_date, end_date)

In [None]:
import datetime

# Window from 2023-10-17 00:00 to 2023-10-23 00:00 as "YYYY-MM-DD HH:MM" strings
start_date, end_date = (
    datetime.datetime(2023, 10, day).strftime("%Y-%m-%d %H:%M") for day in (17, 23)
)

variables = ds_ocean_weather_interp.data_vars.keys()

# Water-level anomalies first, then the correlation maps for the same window
plot_water_level_anomalies(df_insitu, start_date=start_date, end_date=end_date)
plot_correlations(variables, start_date, end_date)

In [None]:
import datetime

sturm_surge_list = [datetime.datetime(2023, 2, 25, 17, 0),
                    datetime.datetime(2023, 4, 1, 12, 0),
                    datetime.datetime(2023, 10, 7, 20, 0),
                    datetime.datetime(2023, 10, 20, 0, 0),
                    datetime.datetime(2024, 2, 9, 18, 0),
                    datetime.datetime(2024, 12, 9, 16, 0),
                    ]

# One anomaly plot and one set of correlation maps per surge, +/- 1 day around the peak
for surge_time in sturm_surge_list:
    start_time = surge_time - datetime.timedelta(days=1)
    end_time = surge_time + datetime.timedelta(days=1)
    df_insitu_sturm = df_insitu[(df_insitu["time"] >= start_time) & (df_insitu["time"] <= end_time)]
    # Use the window of THIS surge; the globals start_date/end_date still hold the
    # range of an earlier cell. Formatted like the other plot_water_level_anomalies calls.
    plot_water_level_anomalies(
        df_insitu_sturm,
        start_date=start_time.strftime("%Y-%m-%d %H:%M"),
        end_date=end_time.strftime("%Y-%m-%d %H:%M"),
    )
    plot_correlations(
        variables,
        start_date=start_time,
        end_date=end_time
    )
    

In [None]:
## Correlation restricted to storm surges

# Keep only observations with a water level (slev) of at least 0.5
df_surge = df_insitu.copy()
df_surge = df_surge.loc[df_surge['slev'] >= 0.5]
# Time span between the first and the last of these observations
start_date = df_surge['time'].min()
end_date = df_surge['time'].max()
# Format both bounds as YYYY-MM-DD HH:MM strings
start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d %H:%M')
end_date = pd.to_datetime(end_date).strftime('%Y-%m-%d %H:%M')
variables = ds_ocean_weather_interp.data_vars.keys()
plot_water_level_anomalies(df_surge, start_date=start_date, end_date=end_date)
# NOTE(review): plot_correlations() only receives the date range and correlates against
# the global df_insitu, so the slev >= 0.5 filter affects the anomaly plot and the
# range, but not the rows entering the correlation - confirm this is intended.
plot_correlations(
    variables,
    start_date=start_date,
    end_date=end_date
)


In [None]:
# Inspect the "showers" column of the weather table
df_weather.showers

## Lassen sich Korrelationen zwischen den Features und dem Wasserpegel (sla) erkennen?

In [None]:
# Reload the ocean model data and drop the "depth" column
df_ocean = process_df(
    load_ocean_data(ocean_data_path, OCEAN_POINTS, verbose=False),
    drop_cols=["depth"],
    verbose=False,
)

# Reload the weather data
df_weather = process_df(
    load_weather_data(weather_data_path, WEATHER_POINTS, verbose=False),
    verbose=False,
)

# Reload the in-situ gauge data for the time span covered by the ocean data
df_insitu = process_flensburg_data(
    load_insitu_data(verbose=False),
    start_time=df_ocean['time'].min(),
    end_time=df_ocean['time'].max(),
    verbose=False,
)
# Hourly grouping, then the common post-processing without the "deph" column
df_insitu = process_df(group_data_hourly(df_insitu), drop_cols=["deph"], verbose=False)


In [None]:
import geodatasets
import geopandas as gpd
import numpy as np
import xarray as xr

resolution = 0.25 # 0.25 degrees

def interpolate_xarray(ds, resolution=0.25, make_fine_grid=True, interpolate_nan=True):
    """
    Fill gaps in an xarray dataset and optionally resample it onto a regular grid.

    Parameters:
        ds (xarray.Dataset): Input dataset with "time", "latitude" and
            "longitude" dimensions.
        resolution (float): Spacing of the new latitude/longitude axes.
        make_fine_grid (bool): If True, linearly interpolate onto axes built with
            ``np.arange(min, max, resolution)`` (the maximum itself is therefore
            normally not part of the new axis).
        interpolate_nan (bool): If True, linearly fill NaNs along time, then
            latitude, then longitude.

    Returns:
        xarray.Dataset: The processed dataset.
    """
    if interpolate_nan:
        # Same order as before: time first, then the two spatial axes.
        for axis in ("time", "latitude", "longitude"):
            ds = ds.interpolate_na(dim=axis, method="linear")

    if not make_fine_grid:
        return ds

    lat_axis = ds.latitude
    lon_axis = ds.longitude
    target_axes = {
        "latitude": np.arange(lat_axis.min(), lat_axis.max(), resolution),
        "longitude": np.arange(lon_axis.min(), lon_axis.max(), resolution),
    }
    return ds.interp(**target_axes, method="linear")

# Create xarray datasets from the DataFrames (indexed by time/latitude/longitude)
# and interpolate each of them onto a grid with spacing `resolution`.
ds_ocean = df_ocean.set_index(["time", "latitude", "longitude"]).to_xarray()
ds_ocean_interp = interpolate_xarray(ds_ocean, resolution=resolution, make_fine_grid=True, interpolate_nan=True)

ds_weather = df_weather.set_index(["time", "latitude", "longitude"]).to_xarray()
ds_weather_interp = interpolate_xarray(ds_weather, resolution=resolution, make_fine_grid=True, interpolate_nan=True)

# Align the time axes: keep only the time stamps present in both datasets
common_time = np.intersect1d(ds_ocean_interp.time.values, ds_weather_interp.time.values) # find common time points
ds_ocean_interp = ds_ocean_interp.sel(time=common_time)
ds_weather_interp = ds_weather_interp.sel(time=common_time)

# Align the lat and lon axes: put the weather data on the ocean grid
ds_weather_interp = ds_weather_interp.interp(
    latitude=ds_ocean_interp.latitude,
    longitude=ds_ocean_interp.longitude
)

# Second pass without regridding: only fill the NaNs left after the alignment.
ds_weather_interp = interpolate_xarray(ds_weather_interp, resolution=resolution, make_fine_grid=False, interpolate_nan=True)
ds_ocean_interp = interpolate_xarray(ds_ocean_interp, resolution=resolution, make_fine_grid=False, interpolate_nan=True)




def create_ocean_mask(ds, land):
    """
    Build a boolean latitude/longitude mask that is True for grid points not on land.

    Parameters:
        ds (xarray.Dataset): Dataset providing the "latitude" and "longitude"
            coordinates of the grid.
        land (geopandas.GeoDataFrame): Land polygons; its CRS is assigned to
            the grid points.

    Returns:
        xarray.DataArray: Boolean array with dims ("latitude", "longitude");
        True where the grid point lies within no land polygon.
    """
    # Default 'xy' indexing: both arrays have shape (n_lat, n_lon).
    lon, lat = np.meshgrid(ds.longitude.values, ds.latitude.values)

    points_gdf = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(lon.ravel(), lat.ravel()),
        crs=land.crs,
    )
    joined = gpd.sjoin(points_gdf, land, predicate="within", how="left")

    # A left join yields one row per (point, polygon) match, so a point inside
    # several polygons would appear more than once and break the reshape below.
    # Collapse to exactly one flag per point (the index is the point number).
    on_land = joined["index_right"].notna().groupby(level=0).any()

    mask_land = on_land.to_numpy().reshape(lat.shape)
    mask_ocean = ~mask_land

    ocean_mask_xr = xr.DataArray(
        mask_ocean,
        coords={"latitude": ds.latitude, "longitude": ds.longitude},
        dims=["latitude", "longitude"]
    )
    return ocean_mask_xr

# Load the Natural Earth land polygons and mask out all grid points on land.
land = gpd.read_file(geodatasets.get_path("naturalearth.land"))
ocean_mask = create_ocean_mask(ds_weather_interp, land)
ds_weather_ocean_only = ds_weather_interp.where(ocean_mask)
ds_ocean_ocean_only = ds_ocean_interp.where(ocean_mask)

# NOTE(review): the merge uses the UNMASKED weather data (ds_weather_interp);
# ds_weather_ocean_only is computed above but not used here - confirm this is intended.
ds_ocean_weather_interp = xr.merge([ds_ocean_ocean_only, ds_weather_interp])


---

### Calculating Bivariate Moran’s I 

**Erklärung:** 

**Abgrenzung zu Korrelation und zeitlicher Autokorrelation**
Bei der Berechnung der statistischen Korrelation werden zwei Variablen (x, y) bei zwei oder mehr Beobachtungen betrachtet; bei der räumlichen Autokorrelation hingegen eine Variable x an zwei oder mehr Orten.

Während die zeitliche Autokorrelation die Beziehungen der Ausprägungen einer Variablen mit sich selbst über die Zeit beschreibt, beschreibt die räumliche Autokorrelation die Ausprägungen einer Variablen mit sich selbst im Raum. 

**Berechnung**
*Positive räumliche Autokorrelation* liegt dann vor, wenn nahe beieinander liegende Orte einander mit höherer Wahrscheinlichkeit ähnlich sind als weiter voneinander entfernte Orte. Das heißt: Positive räumliche Autokorrelation liegt vor, wenn Orte dazu tendieren, im Hinblick auf eine Eigenschaft Cluster zu bilden. Positive räumliche Autokorrelation ist eine empirische Manifestation von [Toblers](https://de.wikipedia.org/wiki/Erstes_Gesetz_der_Geographie) „Erstem Gesetz der Geographie“.

*Negative räumliche Autokorrelation* liegt dann vor, wenn benachbarte Orte im Vergleich zu einer zufälligen Anordnung unterschiedliche Eigenschaftswerte aufweisen. Bei Phänomenen, die mit Lebewesen (Tieren, Pflanzen) verbunden sind, wird negative Autokorrelation häufig durch Wettbewerb und Verdrängung verursacht.

*Keine räumliche Autokorrelation* liegt vor, wenn die Orte im Hinblick auf eine Eigenschaft zufällig angeordnet sind, also keine ausgeprägten Cluster aufweisen.

Quelle: [Wikipedia – Räumliche Autokorrelation](https://de.wikipedia.org/wiki/Räumliche_Autokorrelation)

**Was ist räumliche Autokorrelation (spatial autocorrelation)?**

Räumliche Autokorrelation beschreibt, wie stark der Wert einer Variablen an einem Ort mit den Werten derselben Variablen an anderen Orten zusammenhängt.

**Vergleich mit klassischer Korrelation:**
Normale (statistische) Korrelation:
→ Fragt: "Wie hängen zwei verschiedene Variablen miteinander zusammen?"
Beispiel: Wenn der Luftdruck steigt, sinkt vielleicht die Regenwahrscheinlichkeit.
Räumliche Autokorrelation:
→ Fragt: "Wie hängt der Wert einer einzigen Variable an einem Ort mit den Werten derselben Variable an benachbarten Orten zusammen?"
Beispiel: Ist die Wasserhöhe an Punkt A ähnlich wie an den umliegenden Punkten?

**Und wie unterscheidet sich das von zeitlicher Autokorrelation?**
Zeitliche Autokorrelation:
→ Fragt: "Wie hängt ein Wert heute mit den Werten derselben Variablen in der Vergangenheit oder Zukunft zusammen?"
Beispiel: Die Temperatur heute hängt oft mit der Temperatur gestern zusammen.
Räumliche Autokorrelation:
→ Fragt: "Wie hängt der Wert an einem Ort mit Werten an anderen Orten zusammen?"

**Achtung!!**
Verwende den bivariaten Moran’s I, wenn du den räumlichen Zusammenhang zwischen zwei Variablen in geografischen Daten untersuchen möchtest. Dieser berücksichtigt, wie benachbarte Gebiete miteinander korrelieren und wie sich räumliche Muster bilden.
Beispiel: Du analysierst, ob hohe Wasserhöhen in einer Region mit hohen Wasserhöhen in den benachbarten Regionen zusammenhängen und ob der Luftdruck in benachbarten Regionen ebenfalls ähnliche Werte aufweist.
Verwende den Pearson’s R, wenn du den linearen Zusammenhang zwischen zwei Variablen messen möchtest, ohne die räumliche Abhängigkeit zu berücksichtigen. Dies ist besonders nützlich, wenn du den direkten Zusammenhang zwischen zwei Variablen untersuchen möchtest, ohne auf ihre geografische Lage zu achten.
Beispiel: Du möchtest wissen, ob es einen linearen Zusammenhang zwischen der Wasserhöhe und dem Luftdruck gibt, unabhängig von den geografischen Standorten der Daten.





In [None]:
import geopandas as gpd

#from splot.esda import moran_bv_plot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from esda.moran import Moran_BV
from libpysal.weights import KNN
from shapely.geometry import Point
from tqdm import tqdm  # Fortschrittsbalken für test_k


def calculating_morans_I(ds, var1: str, var2: str, time: str, k=3, plot=False, test_k=False):
    """
    Compute the bivariate Moran's I between two variables for one time step.

    Every grid point with valid values for both variables becomes one
    observation; neighbourhoods are the k nearest grid points.

    Parameters:
        ds (xarray.Dataset): Dataset with "time", "latitude" and "longitude"
            dimensions containing `var1` and `var2`.
        var1 (str): Name of the first variable (x).
        var2 (str): Name of the second variable (y, spatially lagged).
        time: Label of the time step, passed to ``ds.sel(time=...)``.
        k (int): Number of nearest neighbours for the spatial weights.
        plot (bool): If True, print the statistic and draw a Moran scatter plot.
        test_k (bool): If True, additionally evaluate k = 3, 6, ... (< 50) and
            print the resulting table; the return value is not affected.

    Returns:
        tuple: ``(I, p_sim)`` as reported by ``esda.moran.Moran_BV`` for `k`.

    Raises:
        ValueError: If no more than `k` valid grid points remain.
    """
    # 1. Select the time step
    ds = ds.sel(time=time)

    # 2. One point per grid cell. 'ij' indexing gives (n_lat, n_lon) arrays, so
    #    the data is transposed to the same layout before flattening instead of
    #    relying on whatever dimension order the dataset happens to have.
    lat, lon = np.meshgrid(ds.latitude.values, ds.longitude.values, indexing='ij')
    gdf = gpd.GeoDataFrame(
        {
            var1: ds[var1].transpose("latitude", "longitude").values.ravel(),
            var2: ds[var2].transpose("latitude", "longitude").values.ravel(),
        },
        # x = longitude, y = latitude (previously the two were swapped).
        geometry=gpd.points_from_xy(lon.ravel(), lat.ravel()),
    )

    # 3. Drop grid points where either variable is NaN
    gdf = gdf.dropna(subset=[var1, var2]).reset_index(drop=True)

    # 4. KNN spatial weights, row-standardised
    if len(gdf) <= k:
        raise ValueError(f"k={k} is too large for the dataset size of {len(gdf)}")
    w = KNN.from_dataframe(gdf, k=k)
    w.transform = "r"

    # 5. Bivariate Moran's I
    x = gdf[var1]
    y = gdf[var2]
    moran_bv = Moran_BV(x, y, w)

    # 6. Optional: sensitivity of the result to the choice of k
    if test_k:
        # All candidates are < len(gdf) by construction, so no extra size check.
        list_k = np.arange(3, min(50, len(gdf)), 3)
        results = []

        for k_ in tqdm(list_k, desc="Testing k values"):
            try:
                w_ = KNN.from_dataframe(gdf, k=k_)
                w_.transform = "r"
                m_bv = Moran_BV(x, y, w_)
                results.append({"k": k_, "Moran's I": m_bv.I, "p-value": m_bv.p_sim})
            except Exception as e:
                # Best effort: report the failing k and continue with the next one.
                print(f"Fehler bei k={k_}: {e}")

        df_results = pd.DataFrame(results)
        print(df_results)

    # 7. Optional: Moran scatter plot (standardised x vs. spatial lag of standardised y)
    if plot:
        print(f"Bivariate Moran’s I: {moran_bv.I:.4f}")
        print(f"P-Wert (Monte Carlo): {moran_bv.p_sim:.4f}")

        z_x = (x - x.mean()) / x.std()
        z_y = (y - y.mean()) / y.std()
        wz_y = w.sparse @ z_y.to_numpy()

        plt.figure(figsize=(8, 6))
        plt.scatter(z_x, wz_y, alpha=0.5)
        plt.axhline(0, color='red', linestyle='--')
        plt.axvline(0, color='red', linestyle='--')
        plt.title("Bivariate Moran's I Scatter Plot")
        plt.xlabel("Variable X (Standardized)")
        plt.ylabel("Spatial Lag of Y (Standardized)")
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.tight_layout()
        plt.show()

    return moran_bv.I, moran_bv.p_sim


# Usage: bivariate Moran's I between sea level anomaly and mean sea level
# pressure for a single time step, including the sweep over k (test_k=True).
time = "2024-10-20 19:00"
var1 = "sla"
var2 = "pressure_msl"

morans_I, morans_p = calculating_morans_I(ds_ocean_weather_interp, var1=var1, var2=var2, time=time, k=8, plot=False, test_k=True)





In [None]:
# Build the list of hourly time steps to evaluate.
# NOTE: the full data range read from df_ocean is immediately overridden by
# the fixed period 2023-10-01 to 2023-11-01 below.
start = df_ocean["time"].min()
end = df_ocean["time"].max()

start = "2023-10-01 00:00"
end = "2023-11-01 00:00"
timepoints = pd.date_range(start=start, end=end, freq="1h")



var1 = "sla"
var2 = "wind_speed_10m"

list_morans_I = []
list_morans_p = []
for time in timepoints:
    #print(f"Calculating Moran's I for time: {time}")
    # Calculate the bivariate Moran's I for each time step
    morans_I, morans_p = calculating_morans_I(ds_ocean_weather_interp, var1=var1, var2=var2, time=time, k=5, plot=False)
    list_morans_I.append(morans_I)
    list_morans_p.append(morans_p)

array_morans_I = np.array(list_morans_I)
array_morans_p = np.array(list_morans_p)

# Plot Moran's I as a time series (the p-values are collected but not plotted)
plt.figure(figsize=(12, 6))
plt.plot(timepoints, array_morans_I, label="Moran's I", color="blue")
plt.axhline(0, color="red", linestyle="--")
plt.title("Moran's I over Time")
plt.xlabel("Time")
plt.ylabel("Moran's I")
plt.xticks(rotation=45)
plt.grid()
plt.legend()
plt.tight_layout()
plt.show()


Ziel: Berechnet die Pearson-Korrelation zwischen zwei Xarray-Daten (x und y), wobei dim='time' spezifiziert, dass die Korrelation entlang der Zeitdimension durchgeführt wird.

xr.corr: Dies ist eine Funktion von Xarray, die den Pearson-Korrelationskoeffizienten zwischen zwei Variablen berechnet, indem sie ihre Werte entlang einer bestimmten Dimension (in diesem Fall time) vergleicht.

x und y sind die beiden Xarray-Datenarrays (z. B. Zeitserien von sla und pressure_msl).
dim='time' bedeutet, dass die Korrelation für jede räumliche Position über die Zeit hinweg berechnet wird. Zum Beispiel, wenn du Zeitserien für verschiedene geografische Punkte hast, wird der Korrelationswert für jeden Punkt berechnet.

In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt

# Select subset of data from start_time to end_time
# start_time = "2023-01-19"
# end_time = "2023-10-21"

# Select all in-situ records where the water level (slev) is larger than 1.0
df_stormsurge = df_insitu[df_insitu["slev"] > 1.0]
# Use the time stamps of these records to subset the gridded data
timepoints = df_stormsurge["time"].values

# NOTE(review): .sel needs every in-situ time stamp to exist on the dataset's
# time axis - confirm this holds.
ds_ocean_weather_interp_sub = ds_ocean_weather_interp.sel(time=timepoints)


#time_string = f"{start_time.strftime('%Y-%m-%d')} to {end_time.strftime('%Y-%m-%d')}"
# Text fragment appended to the plot titles below
time_string = f'Correlation of all timpoints if sturm surges'

#ds_ocean_weather_interp_sub = ds_ocean_weather_interp.sel(time=slice(start_time, end_time))

def pearson_r(x, y):
    """Return the Pearson correlation of `x` and `y` computed along the "time" dimension."""
    correlation = xr.corr(x, y, dim="time")
    return correlation

def plot_correlation(x, y, title="Correlation Map (Contour)"):
    """
    Draw the temporal Pearson correlation of two fields as a filled contour map.

    Parameters:
        x, y (xarray.DataArray): Fields that are correlated along "time".
        title (str): Title shown above the map.
    """
    correlation_map = pearson_r(x, y)

    projection = ccrs.PlateCarree()
    colorbar_options = {
        'label': 'Pearson Correlation Coefficient',
        'orientation': 'horizontal',
        'shrink': 0.8,
        'pad': 0.05,
    }

    figure = plt.figure(figsize=(10, 6))
    ax = figure.add_subplot(projection=projection)

    # Filled contours on a fixed [-1, 1] colour range with 50 levels
    correlation_map.plot.contourf(
        ax=ax,
        transform=projection,
        cmap='coolwarm',
        levels=50,
        vmin=-1,
        vmax=1,
        cbar_kwargs=colorbar_options,
    )

    # Map decoration: coastlines, borders, grey land and labelled gridlines
    ax.coastlines()
    ax.add_feature(cfeature.BORDERS)
    ax.add_feature(cfeature.LAND, facecolor='lightgrey', alpha=0.9)
    ax.gridlines(draw_labels=True, linewidth=0.5, color='gray', alpha=0.5, linestyle='--')
    ax.set_title(title)
    plt.show()

# Silence RuntimeWarnings for the rest of the session (presumably raised for
# grid points without valid data - confirm).
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Correlation map of sea level anomaly vs. mean sea level pressure for the storm-surge subset
plot_correlation(ds_ocean_weather_interp_sub['sla'], ds_ocean_weather_interp_sub['pressure_msl'], title=f"Correlation between SLA and Pressure from {time_string}")


In [None]:
def plot_correlation_with_currents(x, y, u, v, title="Correlation Map (Contour + Currents)",
                                   step=1, scale=10):
    """
    Plot the temporal correlation of two fields with current arrows on top.

    Parameters:
        x, y (xarray.DataArray): Fields that are correlated along "time".
        u, v (xarray.DataArray): 2-D (latitude, longitude) current components
            drawn as arrows (e.g. time means of uo and vo).
        title (str): Title of the plot.
        step (int): Draw only every `step`-th arrow in both directions
            (1 = every grid point, the previous fixed behaviour).
        scale (float): `scale` argument passed to ``quiver``; adjust to the
            magnitude of the data.

    Raises:
        ValueError: If `step` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    correlation_map = pearson_r(x, y)

    fig = plt.figure(figsize=(12, 6))
    ax = plt.axes(projection=ccrs.PlateCarree())

    # Filled contours of the correlation coefficient
    correlation_map.plot.contourf(
        ax=ax,
        transform=ccrs.PlateCarree(),
        cmap='coolwarm',
        levels=21,
        vmin=-1,
        vmax=1,
        cbar_kwargs={'label': 'Pearson Correlation Coefficient',
                     'orientation': 'horizontal', 'shrink': 0.8, 'pad': 0.05},
    )

    # Subsample the arrows; enforce (latitude, longitude) order so the value
    # arrays match the meshgrid layout below.
    thinning = slice(None, None, step)
    u_sub = u.transpose("latitude", "longitude").isel(latitude=thinning, longitude=thinning)
    v_sub = v.transpose("latitude", "longitude").isel(latitude=thinning, longitude=thinning)

    # 2-D coordinate grids with shape (n_lat, n_lon)
    lon2d, lat2d = np.meshgrid(u_sub.longitude.values, u_sub.latitude.values)

    ax.quiver(
        lon2d,
        lat2d,
        u_sub.values,
        v_sub.values,
        transform=ccrs.PlateCarree(),
        scale=scale,
        width=0.002,
        color='black'
    )

    # Additional map features
    ax.coastlines()
    ax.add_feature(cfeature.BORDERS)
    ax.add_feature(cfeature.LAND, facecolor='lightgrey', alpha=0.9)
    ax.gridlines(draw_labels=True, linewidth=0.5, color='gray', alpha=0.5, linestyle='--')
    ax.set_title(title)
    plt.show()

# Time-mean current components over the storm-surge subset, used for the arrows
u_mean = ds_ocean_weather_interp_sub['uo'].mean(dim='time')
v_mean = ds_ocean_weather_interp_sub['vo'].mean(dim='time')


# Correlation of SLA and pressure with the mean currents drawn on top
plot_correlation_with_currents(
    ds_ocean_weather_interp_sub['sla'],
    ds_ocean_weather_interp_sub['pressure_msl'],
    u_mean,
    v_mean,
    title=f"Correlation between SLA and Pressure with Currents from {time_string}"
)

In [None]:
import numpy as np
import xarray as xr


def select_nearest_valid_point(ds, variable_name, target_lat, target_lon):
    """
    Select the grid point closest to a target position that has valid data.

    A grid point counts as valid if `variable_name` is non-NaN for at least
    one time step.

    Parameters:
        ds (xr.Dataset): Dataset with latitude, longitude and time dimensions.
        variable_name (str): Variable used for the validity check
            (e.g. 'wind_speed_10m').
        target_lat (float): Target latitude in degrees.
        target_lon (float): Target longitude in degrees.

    Returns:
        xr.Dataset: `ds` selected at the nearest valid position.
        float: Latitude of that position.
        float: Longitude of that position.

    Raises:
        ValueError: If `variable_name` has no valid grid point at all.
    """
    # Valid-point mask, forced into (latitude, longitude) order so that it
    # lines up with the 'ij' meshgrid below.
    valid_mask = (
        ds[variable_name].notnull().any(dim='time').transpose("latitude", "longitude").values
    )
    if not valid_mask.any():
        raise ValueError(f"No valid (non-NaN) grid point found for '{variable_name}'")

    # 2-D coordinate grids
    lat2d, lon2d = np.meshgrid(ds.latitude.values, ds.longitude.values, indexing='ij')

    valid_lat_points = lat2d[valid_mask]
    valid_lon_points = lon2d[valid_mask]

    # Distance in "latitude degrees": a degree of longitude shrinks with
    # cos(latitude), so unscaled degrees would over-weight east-west offsets.
    lon_scale = np.cos(np.deg2rad(target_lat))
    distances = np.hypot(
        valid_lat_points - target_lat,
        (valid_lon_points - target_lon) * lon_scale,
    )

    # Index of the nearest valid point
    min_idx = np.argmin(distances)
    nearest_lat = valid_lat_points[min_idx]
    nearest_lon = valid_lon_points[min_idx]

    print(f"Nächstgelegener gültiger Punkt: lat={nearest_lat:.6f}, lon={nearest_lon:.6f}")

    return ds.sel(latitude=nearest_lat, longitude=nearest_lon), nearest_lat, nearest_lon



In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from mpl_toolkits.basemap import Basemap
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# === 1. In-situ Daten laden ===
# def load_insitu_data(filepath):
#     ds = xr.open_dataset(filepath)
#     df = ds.to_dataframe().reset_index()
#     return df.rename(columns={
#         "TIME": "time",
#         "SLEV": "slev",
#         "LATITUDE": "latitude",
#         "LONGITUDE": "longitude"
#     })

# NOTE(review): flensburg_data_processing and interpolate_missing_times are not
# among the helpers imported at the top of the notebook (process_flensburg_data
# is) - confirm these names exist.
df_insitu = load_insitu_data()
df_insitu = flensburg_data_processing(df_insitu)
df_insitu = interpolate_missing_times(df_insitu)

# Position of the gauge, taken from the first in-situ record
insitu_location = (df_insitu.latitude.iloc[0], df_insitu.longitude.iloc[0])

# Get the nearest valid grid point to the target position.
# NOTE(review): the target (54.5, 10.0) differs from LAT_FLENSBURG/LON_FLENSBURG
# defined at the top - confirm it is intentional.
target_lat = 54.5
target_lon = 10.0
flensburg_ds, nearest_lat, nearest_lon = select_nearest_valid_point(ds_ocean_weather_interp_sub, 'wind_speed_10m', target_lat, target_lon)



# === 2. Prepare the gridded data at that point ===
# Flatten the selected point into a DataFrame indexed by time
wetter_df = flensburg_ds.to_dataframe().reset_index().set_index("time")
wetter_location = (wetter_df.latitude.iloc[0], wetter_df.longitude.iloc[0])


# === 3. Karte mit Basemap zeichnen ===
def plot_locations(insitu_loc, weather_loc):
    """
    Show the in-situ station and the selected grid point on a Mercator map.

    Both arguments are (latitude, longitude) pairs. The map is centred
    between the two positions and extends 2 degrees in every direction.
    """
    mid_lat = (insitu_loc[0] + weather_loc[0]) / 2
    mid_lon = (insitu_loc[1] + weather_loc[1]) / 2
    half_span = 2

    plt.figure(figsize=(10, 8))
    basemap = Basemap(
        projection='merc',
        llcrnrlat=mid_lat - half_span,
        urcrnrlat=mid_lat + half_span,
        llcrnrlon=mid_lon - half_span,
        urcrnrlon=mid_lon + half_span,
        resolution='i',
    )

    # Base map: coastlines, borders, land fill and a 0.5 degree graticule
    basemap.drawcoastlines()
    basemap.drawcountries()
    basemap.drawmapboundary(fill_color='white')
    basemap.fillcontinents(color='lightgray', lake_color='white')
    basemap.drawparallels(np.arange(0., 90., 0.5), labels=[1,0,0,0])
    basemap.drawmeridians(np.arange(0., 180., 0.5), labels=[0,0,0,1])

    stations = [
        {"loc": insitu_loc, "fmt": 'bo', "label": 'In-situ Location',
         "caption": 'Flensburg In-Situ', "color": 'black'},
        {"loc": weather_loc, "fmt": 'ro', "label": 'Weather Location',
         "caption": 'Closest Point', "color": 'red'},
    ]

    # Project (lon, lat) to map coordinates once per station
    for station in stations:
        station["xy"] = basemap(station["loc"][1], station["loc"][0])

    for station in stations:
        map_x, map_y = station["xy"]
        basemap.plot(map_x, map_y, station["fmt"], markersize=8, label=station["label"])

    # Captions are shifted by a fixed offset in map projection units
    for station in stations:
        map_x, map_y = station["xy"]
        plt.text(map_x+10000, map_y+10000, station["caption"], fontsize=12, color=station["color"])

    plt.legend(loc='upper left')
    plt.title('In-situ vs. Wetterpunkt')
    plt.show()

plot_locations(insitu_location, wetter_location)

# === 4. Temporal synchronisation: hourly means for both sources ===
df_insitu = df_insitu.set_index("time").resample("h").mean(numeric_only=True)
wetter_df = wetter_df.resample("h").mean()

# === 5. Merge on the common hourly time stamps ===
merged_df = pd.merge(df_insitu, wetter_df, left_index=True, right_index=True, how="inner")
display(merged_df.head(2))
display(merged_df.tail(2))

# === 6. Exploratory analysis ===
features = ['sla', 'slev', 'wind_speed_10m', 'surface_pressure', 'precipitation', 'wind_direction_10m', 'vo']
#features = merged_df.columns.tolist()


sns.pairplot(merged_df[features], diag_kind='kde')
plt.show()

# === 7. Show correlations with the measured water level (slev) ===
correlation = merged_df.corr(numeric_only=True)
print("Korrelationsmatrix:")
display(correlation["slev"].sort_values(ascending=False))

# The 10 largest and the 10 smallest correlations with slev.
# The first entry is slev itself (correlation 1 with itself).
features = correlation["slev"].sort_values(ascending=False).nlargest(10).index.tolist()
features += correlation["slev"].sort_values(ascending=False).nsmallest(10).index.tolist()
print(features)

# === 8. Time-series visualisation, one subplot per selected feature ===
merged_df[features].plot(
    subplots=True, figsize=(10, 17), title=f"Zeitreihen from {time_string}")
plt.tight_layout()
plt.legend(loc='upper left')
plt.xlabel("Time")
plt.show()

# === 9. Lineare Regression ===
def run_regression(df, target_col, feature_cols):
    """
    Fit a linear regression on min-max scaled features and print the results.

    Rows with a missing value in the target or in any feature are left out,
    because LinearRegression cannot be fitted on NaNs.

    Parameters:
        df (pd.DataFrame): Table containing the target and feature columns.
        target_col (str): Name of the column to predict.
        feature_cols (list[str]): Names of the predictor columns.

    Raises:
        ValueError: If no complete row remains after dropping missing values.
    """
    feature_cols = list(feature_cols)
    X = df[feature_cols]
    y = df[target_col]

    # Keep only rows that are complete in the target and in every feature.
    complete = X.notna().all(axis=1) & y.notna()
    n_dropped = int((~complete).sum())
    if n_dropped:
        print(f"Dropping {n_dropped} rows with missing values before fitting.")
    X = X[complete]
    y = y[complete]
    if X.empty:
        raise ValueError("No complete rows left to fit the regression on.")

    # Scale every feature to [0, 1] so that the coefficients are comparable
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    model = LinearRegression()
    model.fit(X_scaled, y)

    print("\nRegressionskoeffizienten:")
    for feat, coef in zip(feature_cols, model.coef_):
        print(f"{feat}: {coef:.4f}")
    # R² on the training data itself (no hold-out set)
    print(f"R² Score: {model.score(X_scaled, y):.4f}")

# features[0] is the column with the largest correlation with slev, i.e. slev
# itself, so it is skipped as a predictor.
run_regression(merged_df, "slev", features[1:])


In [None]:
features

In [None]:
ds_merged_sub_sub

In [None]:
dd

In [None]:
def plot_scatter(ds_merged, x_col, y_col, c, title, xlabel, ylabel, dim=['latitude', 'longitude']):
    """
    Scatter the means of two variables against each other, coloured by a third.

    Each variable of `ds_merged` is averaged over `dim` before plotting;
    `c` names the variable used for the colour scale and the colour-bar label.
    """
    figure, ax = plt.subplots(figsize=(10, 6))

    x_mean = ds_merged[x_col].mean(dim=dim)
    y_mean = ds_merged[y_col].mean(dim=dim)
    colour_mean = ds_merged[c].mean(dim=dim)

    points = ax.scatter(x_mean, y_mean, c=colour_mean, cmap='viridis', alpha=0.5)
    figure.colorbar(points, ax=ax, label=c)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()

# Mean SLA against mean wind speed; title and axis labels now name the
# variables that are actually plotted (x = wind speed, y = SLA).
plot_scatter(ds_merged=ds_merged_sub,
             x_col='wind_speed_10m',
             y_col='sla',
             c='sla',
             title='SLA vs Wind Speed 10m',
             xlabel='Wind Speed 10m',
             ylabel='SLA [m]',
             )


# col1 = ds_merged['sla'].mean(dim=['latitude', 'longitude'])
# col2 = ds_merged['wind_speed_10m'].mean(dim=['latitude', 'longitude'])

# plt.scatter(col1, col2, alpha=0.5, marker='o', cmap='viridis', c=ds_merged['wind_direction_10m'].mean(dim=['latitude', 'longitude']))
# plt.xlabel('mean SLA [m]')
# plt.ylabel('mean Wind Speed [m/s]')
# plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr


def plot_contour(ds_merged, x_col, y_col, c, title, xlabel, ylabel, bins=500, dim=['latitude', 'longitude']):
    """
    Draw a filled contour plot of one averaged variable over two others.

    All three variables are averaged over `dim`. The averaged `c` values are
    then linearly interpolated onto a regular `bins` x `bins` grid spanned by
    the value ranges of `x_col` and `y_col`.
    """
    from scipy.interpolate import griddata

    def _flat_mean(name):
        # Mean over the given dimensions as a flat numpy array
        return ds_merged[name].mean(dim=dim).values.flatten()

    x = _flat_mean(x_col)
    y = _flat_mean(y_col)
    z = _flat_mean(c)

    # Keep only samples where all three values are present
    has_nan = np.isnan(x) | np.isnan(y) | np.isnan(z)
    keep = ~has_nan
    x, y, z = x[keep], y[keep], z[keep]

    # Regular target grid covering the observed x/y ranges
    grid_x, grid_y = np.meshgrid(
        np.linspace(x.min(), x.max(), bins),
        np.linspace(y.min(), y.max(), bins),
    )

    # Interpolate the scattered z values onto the grid
    grid_z = griddata((x, y), z, (grid_x, grid_y), method='linear')

    plt.figure(figsize=(10, 6))
    filled = plt.contourf(grid_x, grid_y, grid_z, levels=50, cmap='viridis')
    plt.colorbar(filled).set_label(c)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()

# Mean SLA as a function of mean wind speed (x) and wind direction (y); the
# title and axis labels now match the variables that are actually plotted.
plot_contour(ds_merged=ds_merged,
             x_col='wind_speed_10m',
             y_col='wind_direction_10m',
             c='sla',
             title='Contourplot: SLA over Wind Speed and Wind Direction',
             xlabel='Wind Speed 10m',
             ylabel='Wind Direction 10m [°]',
             bins=100,
             )


In [None]:
dd

In [None]:
# Fill NaNs in the weather dataset linearly, first along latitude, then longitude
ds_interp = ds_weather.interpolate_na(dim="latitude", method="linear")
ds_interp = ds_interp.interpolate_na(dim="longitude", method="linear")
ds_interp

In [None]:
import numpy as np

# Coordinates of the existing grid
old_lats = ds_interp.latitude
old_lons = ds_interp.longitude

# Build a new grid with 0.25 spacing between the old minimum and maximum
new_lats = np.arange(old_lats.min(), old_lats.max(), 0.25)
new_lons = np.arange(old_lons.min(), old_lons.max(), 0.25)

# Linear interpolation onto the new grid
ds_interp = ds_interp.interp(latitude=new_lats, longitude=new_lons, method='linear')


In [None]:
ds_interp

In [None]:
import matplotlib.pyplot as plt

# Example: mean sea level pressure field at one time step.
# (The variable was previously named and labelled as a temperature in °C.)
pressure = ds_interp['pressure_msl'].sel(time='2023-10-18T12:00:00')

# Plot
fig = plt.figure(figsize=(10, 6))
ax = plt.axes(projection=ccrs.PlateCarree())  # or e.g. ccrs.Mercator()

# NOTE(review): unit hPa is taken from the pressure labels used elsewhere in
# this notebook - confirm against the data source.
pressure.plot(ax=ax, transform=ccrs.PlateCarree(), cmap='coolwarm', cbar_kwargs={'label': 'Luftdruck (MSL) [hPa]'})

# Extras: coastlines etc.
ax.coastlines()
ax.add_feature(cfeature.BORDERS)
ax.set_title("Luftdruckkarte")

plt.show()


In [None]:
# Filled contours of the mean sea level pressure at one time step
pressure = ds_interp['pressure_msl'].sel(time='2023-10-18T12:00:00')

fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': ccrs.PlateCarree()})
ax.coastlines()
ax.add_feature(cfeature.BORDERS)

# `linewidths` only applies to line contours (contour) and has no effect on
# contourf, so it is not passed here.
cs = ax.contourf(pressure.longitude, pressure.latitude, pressure,
                 levels=20, cmap='coolwarm',
                 transform=ccrs.PlateCarree())


ax.set_title("Konturlinien des Luftdrucks")
plt.show()


In [None]:
ds_interp

In [None]:
# NOTE(review): 'u10'/'v10' are not among the weather variables used elsewhere
# in this notebook (wind_speed_10m / wind_direction_10m) - confirm they exist.
u = ds_interp['u10'].sel(time='2023-10-18T12:00:00')  # eastward component
v = ds_interp['v10'].sel(time='2023-10-18T12:00:00')  # northward component

fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': ccrs.PlateCarree()})
ax.coastlines()
ax.add_feature(cfeature.BORDERS)

# Wind arrows at every 5th grid point. The coordinates of ds_interp are named
# 'longitude'/'latitude'; 'lon'/'lat' do not exist on this dataset.
q = ax.quiver(u.longitude[::5], u.latitude[::5], u[::5, ::5], v[::5, ::5], transform=ccrs.PlateCarree(), scale=700)

ax.set_title("Windvektoren")
plt.show()


In [None]:
# Streamlines of the wind field stored in `u` / `v`
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': ccrs.PlateCarree()})
ax.coastlines()

# The coordinates are named 'longitude'/'latitude' ('lon'/'lat' do not exist).
ax.streamplot(u.longitude.values, u.latitude.values, u.values, v.values, transform=ccrs.PlateCarree(), color='blue', density=1.5)
ax.set_title("Stromlinien der Windrichtung")
plt.show()


In [None]:
# Spatial means of temperature and relative humidity, one value per time step
temp = ds_interp['temperature_2m'].mean(dim=['latitude', 'longitude'])
humidity = ds_interp['relative_humidity_2m'].mean(dim=['latitude', 'longitude'])

# Scatter the two spatial means against each other
plt.scatter(temp, humidity, alpha=0.5)
plt.xlabel('Temperatur [°C]')
plt.ylabel('relative Luftfeuchtigkeit [%]')
plt.show()


In [None]:
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

# Example time step
time_sel = '2023-10-20T12:00'

# Select the fields. The background field is the relative humidity (it was
# previously stored in `temp` and labelled as a temperature in °C).
rel_humidity = ds_interp['relative_humidity_2m'].sel(time=time_sel)
wind_speed = ds_interp['wind_speed_10m'].sel(time=time_sel)
wind_dir = ds_interp['wind_direction_10m'].sel(time=time_sel)

# Convert wind speed and direction (degrees) to u/v components. The negative
# signs turn the direction into the one the arrows point to.
wind_u = wind_speed * -np.sin(np.deg2rad(wind_dir))
wind_v = wind_speed * -np.cos(np.deg2rad(wind_dir))

# Create the plot
fig, ax = plt.subplots(figsize=(12, 8), subplot_kw={'projection': ccrs.PlateCarree()})

# Relative humidity as colour map
rel_humidity.plot(ax=ax, transform=ccrs.PlateCarree(), cmap='coolwarm', cbar_kwargs={'label': 'relative Luftfeuchtigkeit [%]'})

# Coastlines, country borders etc.
ax.coastlines()
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.LAND, facecolor='lightgray', alpha=0.3)

# Wind vectors, downsampled to every `step`-th grid point for readability
step = 2
lat = ds_interp.latitude[::step]
lon = ds_interp.longitude[::step]
u = wind_u[::step, ::step]
v = wind_v[::step, ::step]

ax.quiver(lon, lat, u, v, transform=ccrs.PlateCarree(), color='black', scale=700)

# Title
ax.set_title(f"Relative Luftfeuchtigkeit und Wind am {time_sel}", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
dd

# Merge Dataframes

In [None]:
import pandas as pd


def process_coord(df: pd.DataFrame, coord: tuple) -> pd.DataFrame:
    """
    Extract the rows of one grid position and tag its value columns.

    Rows whose 'position' equals `coord` are kept, the 'latitude',
    'longitude' and 'position' columns are removed, and every remaining
    column except 'time' is renamed to "<column>_<coord>".
    """
    at_coord = df['position'] == coord
    df_sub = df.loc[at_coord].drop(columns=['latitude', 'longitude', 'position'])

    suffixed = {col: f"{col}_{coord}" for col in df_sub.columns if col != 'time'}
    return df_sub.rename(columns=suffixed)

def convert_df_joblib(df: pd.DataFrame, n_jobs: int = -1) -> pd.DataFrame:
    """
    Pivot a long table into one row per time step with a column per position.

    For every unique (latitude, longitude) pair the value columns are
    extracted with `process_coord` (in parallel via joblib) and then merged
    on 'time'. The input DataFrame is not modified.

    Parameters:
        df (pd.DataFrame): Long table with 'time', 'latitude', 'longitude'
            and value columns.
        n_jobs (int): Number of joblib workers (-1 = all available).

    Returns:
        pd.DataFrame: Wide table with a 'time' column and one
        "<column>_<position>" column per value column and position.
    """
    # Add the helper column on a copy; the previous in-place assignment left a
    # 'position' column behind in the caller's DataFrame.
    df = df.assign(position=list(zip(df['latitude'], df['longitude'])))
    unique_coords = df['position'].unique()
    print(f"Number of unique coordinates: {len(unique_coords)}")

    df_merged = pd.DataFrame({'time': df['time'].unique()})

    # Extract the per-position tables in parallel
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_coord)(df, coord) for coord in tqdm(unique_coords)
    )

    # Attach each per-position table to the common time axis
    for df_sub in results:
        df_merged = df_merged.merge(df_sub, on='time', how='left')

    return df_merged

# Usage
# Convert both gridded tables to the wide "one column per position" layout
df_ocean_converted = convert_df_joblib(df_ocean)
df_weather_converted = convert_df_joblib(df_weather)



In [None]:
# Merge the wide ocean and weather tables with the in-situ data, keeping only
# the time stamps present in all three (inner joins on 'time')
df_merged = df_ocean_converted.merge(df_weather_converted, on='time', how='inner')
df_merged = df_merged.merge(df_insitu, on='time', how='inner')

In [None]:
df_merged.shape

In [None]:
df_merged.columns

In [None]:
# correlation matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Pairwise correlation of all columns of the merged table
corr = df_merged.corr()
# Optional heat map of the lower triangle (disabled):
# mask = np.triu(np.ones_like(corr, dtype=bool))
# sns.heatmap(corr, mask=mask, cmap="coolwarm", annot=False, fmt=".2f", square=True, cbar_kws={"shrink": .8})
# plt.title("Correlation Matrix")
# plt.show()





In [None]:
# The 100 columns with the largest (most positive) correlation with slev
corr_slev = corr["slev"].nlargest(100)


# Display the correlation values in a bar plot
plt.figure(figsize=(12, 8))
sns.barplot(x=corr_slev.index, y=corr_slev.values)
plt.xticks(rotation=90)
plt.title("Correlation with slev")
plt.xlabel("Features")
plt.ylabel("Correlation")
plt.tight_layout()
plt.show()

In [None]:
corr_slev.index 

In [None]:
# The 100 columns with the smallest (most negative) correlation with slev.
# Note: this overwrites corr_slev from the previous cell.
corr_slev = corr["slev"].nsmallest(100)


# Display the correlation values in a bar plot
plt.figure(figsize=(12, 8))
sns.barplot(x=corr_slev.index, y=corr_slev.values)
plt.xticks(rotation=90)
plt.title("Correlation with slev")
plt.xlabel("Features")
plt.ylabel("Correlation")
plt.tight_layout()
plt.show()

In [None]:
def get_index_name(corr_slev: pd.Series, name: str) -> str:
    """
    Return the first index label of `corr_slev` that contains `name`.

    The labels are checked in index order with a plain substring test;
    None is returned when no label matches.
    """
    matching_labels = (label for label in corr_slev.index if name in label)
    return next(matching_labels, None)

# Look up the first matching column for each variable of interest.
# NOTE(review): corr_slev currently holds only the 100 most negative
# correlations, so a lookup returns None if the variable is not among them.
sla_col = get_index_name(corr_slev, "sla")
wo_col = get_index_name(corr_slev, "wo")
pressure_msl_col = get_index_name(corr_slev, "pressure_msl")
surface_pressure_col = get_index_name(corr_slev, "surface_pressure")
# The printed value is the SLA column (the label previously said "Wind column").
print(f"SLA column: {sla_col}")


In [None]:
import pandas as pd


def scale_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scale all columns except 'time' with a MinMaxScaler.

    Parameters:
        df (pd.DataFrame): Table with a 'time' column and numeric columns.

    Returns:
        pd.DataFrame: Scaled columns followed by the unchanged 'time' column,
        carrying the same index as `df`.
    """
    data_to_scale = df.drop(columns=["time"])

    # Fit and transform the data (excluding 'time')
    scaled_values = MinMaxScaler().fit_transform(data_to_scale)

    # Reuse the input index: with a fresh RangeIndex the 'time' assignment
    # below would be aligned by label and yield misplaced or missing values
    # for any input whose index is not 0..n-1.
    df_scaled = pd.DataFrame(scaled_values, columns=data_to_scale.columns, index=df.index)
    df_scaled["time"] = df["time"]

    return df_scaled

# Beispiel-Nutzung
df_scaled = scale_df(df_merged)


In [None]:
def get_position_of_column(df: pd.DataFrame, col_name: str) -> tuple:
    """
    Parse the (latitude, longitude) position encoded in a column name.

    The column name is expected to end in "_(<lat>, <lon>)", i.e. the text
    after the last underscore is a tuple literal.

    Parameters:
        df (pd.DataFrame): Unused; kept for interface compatibility.
        col_name (str): Column name such as "pressure_msl_(54.75, 9.5)".

    Returns:
        tuple: ``(lat, lon)`` as floats.

    Raises:
        ValueError: If the suffix is not a plain Python literal.
    """
    import ast

    suffix = col_name.rsplit("_", 1)[-1]
    try:
        # literal_eval only accepts literals; unlike eval it never executes
        # code that happens to be embedded in a column name.
        parts = ast.literal_eval(suffix)
    except (ValueError, SyntaxError) as err:
        raise ValueError(
            f"Cannot parse a position from column name {col_name!r}"
        ) from err

    lat = float(parts[-2])
    lon = float(parts[-1])
    return lat, lon

In [None]:
pressure_msl_col

In [None]:
# Plot the scaled water level together with the selected feature column
# (df_scaled holds the min-max scaled values).


fig, ax = plt.subplots(figsize=(12, 6))

# Plot the water level
ax.plot(df_scaled['time'], df_scaled['slev'], label='Wasserstand', color='blue')

# plot the sla_col
#ax.plot(df_scaled['time'], df_scaled[sla_col], label=sla_col, color='red')

ax.plot(df_scaled['time'], df_scaled[wo_col], label=wo_col, color='green')

#ax.plot(df_scaled['time'], df_scaled[pressure_msl_col], label=pressure_msl_col, color='orange')


# Show the grid position of the plotted column on a map
lat, lon = get_position_of_column(df_scaled, wo_col)
print(f"Position of {wo_col}: {lat}, {lon}")
# Create a Basemap covering the extent of lon_grid / lat_grid
fig_map, ax_map = plt.subplots(figsize=(12, 10))
m = Basemap(
    projection="cyl",
    resolution="i",
    llcrnrlon=lon_grid.min(),
    urcrnrlon=lon_grid.max(),
    llcrnrlat=lat_grid.min(),
    urcrnrlat=lat_grid.max(),
    ax=ax_map,
)
# Draw map features
m.drawcoastlines()
m.drawcountries()
m.fillcontinents(color="0.8")
m.drawstates()
m.drawmapboundary(fill_color="aqua")
m.fillcontinents(color="coral", lake_color="aqua", alpha=0.2)
# Marker at the position of the column
x, y = m(lon, lat)
m.scatter(x, y, color="blue", label="Ocean Data", zorder=5)
# The marker shows the position of wo_col, so the title names that column
# (it previously named pressure_msl_col).
plt.title(f"Position of {wo_col} on map")
plt.legend(loc="upper left")
plt.show()

