In [1]:

import warnings
import pandas as pd
from utils.Copernicus import AdvancedCopernicus
import datetime
import os
import xarray as xr
from tqdm import tqdm
# Silence pandas' SettingWithCopyWarning for the whole session.
warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)

# Render every column of a DataFrame instead of truncating wide frames.
pd.set_option("display.max_columns", None)
#pd.options.display.max_rows = None

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Substrings a file name must contain (at least one of them) to be selected.
wanted_tags = (
    'BO_PR_CT', 'BO_PR_BO', 'BO_TS_FB', 'BO_TS_MO', 'BO_TS_TG',
    'NO_TS_MO', 'NO_TS_RF', 'NO_TS_TG', 'NO_TS_VA', 'NO_TS_XX',
)

filenames = os.listdir("data")
# Keep the directory entries whose name contains any of the wanted substrings.
files = [name for name in filenames if any(tag in name for tag in wanted_tags)]

print(len(files))
files

2174


['BO_PR_CT_LHEI174.nc',
 'NO_TS_RF_ShardlowFlow.nc',
 'BO_TS_TG_Karrebaeksminde.nc',
 'NO_TS_TG_TeignbridgePierTG.nc',
 'BO_TS_MO_Fladen.nc',
 'BO_TS_FB_FinnSea_2014.nc',
 'NO_TS_MO_ZeebruggeZandopvangkadeBuoy.nc',
 'BO_PR_CT_STO0802008.nc',
 'NO_TS_RF_Mimram-Panshanger.nc',
 'BO_TS_TG_Lielupe.nc',
 'BO_TS_MO_Nordvalen.nc',
 'BO_TS_FB_StenaAdventurer_2017.nc',
 'BO_PR_CT_DMU427.nc',
 'BO_TS_FB_StenaSpirit_2020.nc',
 'BO_PR_CT_FYN6100016.nc',
 'BO_PR_CT_VEJ0003750.nc',
 'NO_TS_TG_KeizersveerTG.nc',
 'BO_PR_CT_SMHIBY15.nc',
 'NO_TS_RF_SintPieter.nc',
 'NO_TS_MO_6201572.nc',
 'BO_PR_CT_KBH1723.nc',
 'NO_TS_RF_Bunde.nc',
 'BO_PR_BO_SMHIA13BO.nc',
 'BO_PR_CT_SJY12A.nc',
 'NO_TS_MO_ZeebruggeWeather.nc',
 'NO_TS_RF_Randersfjord.nc',
 'BO_PR_CT_SMHIBO3A3.nc',
 'BO_PR_CT_ROS60.nc',
 'BO_TS_FB_StenaGermanica_2020.nc',
 'NO_TS_TG_VardoeTG.nc',
 'BO_TS_MO_Hoburg1.nc',
 'NO_TS_RF_Olst.nc',
 'BO_PR_CT_SYKEBCSIII10.nc',
 'BO_PR_CT_FYN6300051.nc',
 'NO_TS_TG_RoompotBuitenTG.nc',
 'NO_TS_TG_LeirvikTG.n

In [None]:
# Defined up front so df_all exists even if the loop below is interrupted.
df_all = pd.DataFrame()
# Time window: rows before start_date are discarded, and a file is only used
# if its remaining record reaches end_date.
end_date = pd.to_datetime("2025-03-01")
start_date = pd.to_datetime("2023-01-01")

# Per-file frames are collected here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies all accumulated rows each time.
frames = []

for file in tqdm(files, desc='Processing files'):
    f = os.path.join("data", file)

    # The context manager releases the file handle as soon as the data is loaded.
    with xr.open_dataset(f) as ds:
        # Skip files without a SLEV variable before paying for the
        # (expensive) conversion to a DataFrame.
        if 'SLEV' not in ds.variables:
            continue
        df = ds.to_dataframe().reset_index()

    df['TIME'] = pd.to_datetime(df['TIME'])

    # Filter the DataFrame by the start date
    df = df.loc[df['TIME'] >= start_date]

    # If the DataFrame contains data up to 2025-03-01, keep it for df_all.
    # For an empty frame max() is NaT and the comparison is False.
    if df['TIME'].max() >= end_date:
        frames.append(df)

# pd.concat raises on an empty list, so df_all stays empty when nothing qualified.
if frames:
    df_all = pd.concat(frames, axis=0)
    



Processing files:   0%|          | 0/2174 [00:00<?, ?it/s]

Processing files:  36%|███▌      | 784/2174 [06:41<07:21,  3.15it/s]  

In [11]:
# Keep only the rows whose SLEV_QC flag equals 1
df_qc = df_all.loc[df_all['SLEV_QC'].eq(1)]

In [12]:
# Render the summary statistics of df_qc as a rich table.
display(df_qc.describe())

Unnamed: 0,TIME,DEPTH,TIME_QC,LATITUDE,LONGITUDE,DEPH,DEPH_QC,SLEV,SLEV_QC,TEMP,TEMP_QC,VEMH,VEMH_QC,VHM0,VHM0_QC,VTPK,VTPK_QC,HCDT,HCDT_QC,HCSP,HCSP_QC
count,10096765,10096765.0,10096765.0,10096760.0,10096760.0,10096765.0,10096760.0,10096760.0,10096765.0,3056547.0,3293594.0,3558.0,36422.0,3558.0,36422.0,0.0,36422.0,7572.0,34015.0,7573.0,34015.0
mean,2024-01-31 05:42:16.038312192,0.0,1.0,57.16698,12.90931,0.0,6.979787,0.1482331,1.0,9.554147,1.610145,0.018297,8.480643,0.012114,8.225468,,9.0,174.60347,7.657916,0.381765,7.219315
min,2023-01-01 00:00:00,0.0,1.0,54.3997,9.75,0.0,1.0,-1.58,1.0,-2.0,1.0,0.0,1.0,0.0,1.0,,9.0,0.01,1.0,0.0,1.0
25%,2023-07-20 19:00:00,0.0,1.0,55.7167,11.4833,0.0,7.0,0.0,1.0,3.9,1.0,0.0,9.0,0.0,9.0,,9.0,72.902504,9.0,0.201,9.0
50%,2024-01-29 00:20:00,0.0,1.0,57.6967,11.9869,0.0,7.0,0.14,1.0,8.700001,1.0,0.0,9.0,0.0,9.0,,9.0,187.025009,9.0,0.303,9.0
75%,2024-08-11 20:30:00,0.0,1.0,57.7658,12.1125,0.0,7.0,0.28,1.0,15.8,1.0,0.0,9.0,0.0,9.0,,9.0,268.157532,9.0,0.504,9.0
max,2025-03-01 03:46:00,0.0,1.0,65.7888,30.2667,0.0,7.0,2.15,1.0,31.44,9.0,0.5,9.0,0.5,9.0,,9.0,359.950012,9.0,1.96,9.0
std,,0.0,0.0,1.763371,3.593037,0.0,0.3476663,0.2268519,0.0,6.458345,2.076777,0.053125,1.60223,0.041709,2.356834,,0.0,105.690224,2.596463,0.267668,3.327589


In [13]:
# Drop columns, then rows, that contain nothing but missing values.
# Re-assigning (instead of inplace=True) avoids mutating a frame that was
# produced by a boolean-mask selection, which is what triggers
# SettingWithCopyWarning.
df_qc = df_qc.dropna(axis=1, how='all').dropna(axis=0, how='all')

In [14]:
# info() prints its column/dtype overview itself.
df_qc.info()
# Only the last bare expression of a cell is rendered, so the statistics must
# be passed to display() explicitly; a bare describe() here would be computed
# and then thrown away.
display(df_qc.describe())
df_qc.head()

<class 'pandas.core.frame.DataFrame'>
Index: 10096765 entries, 458251 to 1499342
Data columns (total 25 columns):
 #   Column     Dtype         
---  ------     -----         
 0   TIME       datetime64[ns]
 1   DEPTH      int64         
 2   TIME_QC    float32       
 3   LATITUDE   float32       
 4   LONGITUDE  float32       
 5   DEPH       float32       
 6   DEPH_QC    float32       
 7   STATION    object        
 8   SLEV       float32       
 9   SLEV_QC    float32       
 10  TEMP       float32       
 11  TEMP_QC    float32       
 12  VEMH       float32       
 13  VEMH_QC    float32       
 14  VHM0       float32       
 15  VHM0_QC    float32       
 16  VTPK_QC    float32       
 17  TEMP_DM    object        
 18  VEMH_DM    object        
 19  VHM0_DM    object        
 20  VTPK_DM    object        
 21  HCDT       float32       
 22  HCDT_QC    float32       
 23  HCSP       float32       
 24  HCSP_QC    float32       
dtypes: datetime64[ns](1), float32(18), int64(1),

Unnamed: 0,TIME,DEPTH,TIME_QC,LATITUDE,LONGITUDE,DEPH,DEPH_QC,STATION,SLEV,SLEV_QC,TEMP,TEMP_QC,VEMH,VEMH_QC,VHM0,VHM0_QC,VTPK_QC,TEMP_DM,VEMH_DM,VHM0_DM,VTPK_DM,HCDT,HCDT_QC,HCSP,HCSP_QC
458251,2023-01-01 00:00:00.000000000,0,1.0,55.1833,11.65,0.0,7.0,b'Karrebaeksminde',0.04,1.0,,,,,,,,,,,,,,,
458252,2023-01-01 00:10:00.000000000,0,1.0,55.1833,11.65,0.0,7.0,b'Karrebaeksminde',0.03,1.0,,,,,,,,,,,,,,,
458253,2023-01-01 00:20:00.000000256,0,1.0,55.1833,11.65,0.0,7.0,b'Karrebaeksminde',0.03,1.0,,,,,,,,,,,,,,,
458254,2023-01-01 00:30:00.000000000,0,1.0,55.1833,11.65,0.0,7.0,b'Karrebaeksminde',0.04,1.0,,,,,,,,,,,,,,,
458255,2023-01-01 00:40:00.000000000,0,1.0,55.1833,11.65,0.0,7.0,b'Karrebaeksminde',0.03,1.0,,,,,,,,,,,,,,,


In [15]:
# Rows of df_qc that carry a TEMP value
df_temp = df_qc.dropna(subset=['TEMP'])

def plot_coordinates(df:pd.DataFrame, color:str="blue"):
    """Draw one marker per distinct (LATITUDE, LONGITUDE) pair of ``df`` on a folium map.

    Args:
        df: Frame with numeric ``LATITUDE`` and ``LONGITUDE`` columns. Rows in
            which either coordinate is missing are ignored.
        color: Marker colour, passed to ``folium.Icon``.

    Returns:
        A ``folium.Map`` centred on the mean of the distinct coordinates, with
        one marker (and a "Lat/Lon" tooltip) per coordinate pair.

    Raises:
        KeyError: If ``df`` has no ``LATITUDE`` or ``LONGITUDE`` column.
        ValueError: If no row has both coordinates present.
    """
    # Only the distinct coordinate pairs are needed, so de-duplicate the two
    # columns directly instead of averaging every numeric column per group.
    # Sorting keeps the marker order stable (latitude, then longitude).
    coords = (
        df[["LATITUDE", "LONGITUDE"]]
        .dropna()
        .drop_duplicates()
        .sort_values(["LATITUDE", "LONGITUDE"])
    )
    if coords.empty:
        # Without this guard the map centre would be NaN.
        raise ValueError("plot_coordinates: no rows with LATITUDE and LONGITUDE to plot")

    # Imported after validation so bad input fails before folium is needed.
    import folium

    map_center = [float(coords["LATITUDE"].mean()), float(coords["LONGITUDE"].mean())]
    m = folium.Map(location=map_center, zoom_start=5)

    # Add markers
    for lat, lon in coords.itertuples(index=False, name=None):
        # Plain Python floats for the location list and the tooltip text.
        lat, lon = float(lat), float(lon)
        folium.Marker(
            location=[lat, lon],
            tooltip=f"Lat: {lat}, Lon: {lon}",
            icon=folium.Icon(color=color)
        ).add_to(m)

    return m

# Map the coordinates of the rows in df_temp with blue markers.
plot_coordinates(df_temp, color="blue")



        

In [16]:
# Rows of df_qc that carry a SLEV value, mapped with red markers
df_slev = df_qc.dropna(subset=['SLEV'])
plot_coordinates(df_slev, color="red")

In [17]:
# Rows of df_qc that carry a VEMH value, mapped with green markers
df_VEMH = df_qc.dropna(subset=['VEMH'])
plot_coordinates(df_VEMH, color="green")

In [18]:
# Rows of df_qc that carry an HCSP value, mapped with purple markers
df_HCSP = df_qc.dropna(subset=['HCSP'])
plot_coordinates(df_HCSP, color="purple")