In [1]:
import fastapi
import uvicorn
from utils.Database import Database
from utils.Copernicus import AdvancedCopernicus
from utils.OpenMeteoWeather import OpenMeteoWeather
from utils.PlanetPositions import PlanetPositions
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import json


In [2]:
print(datetime.datetime.now().isoformat())

2025-02-23T12:22:13.052445


In [3]:
# ------------ Initialize the global Variables ------------

# Define Absolute maximum and minimum values for date and location
ABSOLUTE_END_DATE:str = datetime.datetime.now().isoformat().split("T")[0]

ABSOLUTE_MINIMUM_LONGITUDE:float = 9.041532516479492
ABSOLUTE_MAXIMUM_LONGITUDE:float = 30.208656311035156
ABSOLUTE_MINIMUM_LATIDUDE:float = 53.00829315185547
ABSOLUTE_MAXIMUM_LATIDUDE:float = 65.89141845703125




START_DATE:str = "2025-01-01"
END_DATE:str = "2025-02-1"



MINIMUM_LONGITUDE:float = 9.59
MAXIMUM_LONGITUDE:float = 12.037
MINIMUM_LATIDUDE:float = 55.486
MAXIMUM_LATIDUDE:float = 55.616


# For Testing
MINIMUM_LONGITUDE=10.038345850696412
MAXIMUM_LONGITUDE=10.365962458698567
MINIMUM_LATIDUDE=54.27381478077755
MAXIMUM_LATIDUDE=54.52976525577923

OUTPUT_FILENAME:str = "output.nc"




DB_URL = 'localhost'
DB_NAME = 'deep-learning'
DB_COLLECTION = 'ozean-weather'



In [4]:
# ------------ Initialize the classes ------------


AdvancedCopernicus = AdvancedCopernicus()


In [5]:

def process_dataframe(df:pd.DataFrame) -> pd.DataFrame:
    for column in df.select_dtypes(include=["float"]).columns:
        df[column] = df[column].astype(np.float32)  # Konvertiere alle Float-Typen zu float32
    df["latitude"] = df["latitude"].astype(np.float32).round(6)
    df["longitude"] = df["longitude"].astype(np.float32).round(6)
    df['time'] = pd.to_datetime(df['time']).dt.tz_localize(None)

    return df



In [6]:
# ------------ Get data from AdvancedCopernicus ------------
copernicus_data = AdvancedCopernicus.get_subset(
                dataset_id="cmems_mod_bal_phy_anfc_PT1H-i",
                dataset_version="202411",
                variables=["bottomT", "mlotst", "siconc", "sithick", "sla", "so", "sob", "thetao", "uo", "vo", "wo"], 
                minimum_longitude=MINIMUM_LONGITUDE,
                maximum_longitude=MAXIMUM_LONGITUDE,
                minimum_latitude=MINIMUM_LATIDUDE,
                maximum_latitude=MAXIMUM_LATIDUDE,
                start_datetime=START_DATE,
                end_datetime=END_DATE,
                minimum_depth=0.5016462206840515,
                maximum_depth=0.5016462206840515,
                coordinates_selection_method="strict-inside",
                disable_progress_bar=False,
                output_filename=OUTPUT_FILENAME
                )
            



INFO - 2025-02-23T11:22:14Z - Selected dataset version: "202411"
INFO - 2025-02-23T11:22:14Z - Selected dataset part: "default"
INFO - 2025-02-23T11:22:19Z - Starting download. Please wait...


  0%|          | 0/44 [00:00<?, ?it/s]

INFO - 2025-02-23T11:22:22Z - Successfully downloaded to output.nc


In [7]:
# plot xarray dataset in map
print(copernicus_data)


<xarray.Dataset> Size: 6MB
Dimensions:    (depth: 1, latitude: 16, longitude: 12, time: 745)
Coordinates:
  * depth      (depth) float32 4B 0.5016
  * latitude   (latitude) float32 64B 54.27 54.29 54.31 ... 54.49 54.51 54.52
  * longitude  (longitude) float32 48B 10.04 10.07 10.1 ... 10.29 10.32 10.35
  * time       (time) datetime64[ns] 6kB 2025-01-01 ... 2025-02-01
Data variables:
    bottomT    (time, latitude, longitude) float32 572kB ...
    mlotst     (time, latitude, longitude) float32 572kB ...
    siconc     (time, latitude, longitude) float32 572kB ...
    sithick    (time, latitude, longitude) float32 572kB ...
    sla        (time, latitude, longitude) float32 572kB ...
    so         (time, depth, latitude, longitude) float32 572kB ...
    sob        (time, latitude, longitude) float32 572kB ...
    thetao     (time, depth, latitude, longitude) float32 572kB ...
    uo         (time, depth, latitude, longitude) float32 572kB ...
    vo         (time, depth, latitude, longi

In [8]:
df_copernicus = copernicus_data.to_dataframe().reset_index()
# put column time in front
df_copernicus = df_copernicus[["time"] + [col for col in df_copernicus.columns if col != "time"]]

df_copernicus.dropna(axis=0, 
                     subset=["bottomT", "mlotst", "siconc", "sithick", "sla", "so", "sob", "thetao", "uo", "vo", "wo"], 
                     inplace=True,
                     how="all")

df_copernicus = process_dataframe(df_copernicus)

display(df_copernicus.info())

display(df_copernicus)

<class 'pandas.core.frame.DataFrame'>
Index: 49915 entries, 47680 to 143039
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   time       49915 non-null  datetime64[ns]
 1   depth      49915 non-null  float32       
 2   latitude   49915 non-null  float32       
 3   longitude  49915 non-null  float32       
 4   bottomT    49915 non-null  float32       
 5   mlotst     49915 non-null  float32       
 6   siconc     49915 non-null  float32       
 7   sithick    49915 non-null  float32       
 8   sla        49915 non-null  float32       
 9   so         49915 non-null  float32       
 10  sob        49915 non-null  float32       
 11  thetao     49915 non-null  float32       
 12  uo         49915 non-null  float32       
 13  vo         49915 non-null  float32       
 14  wo         49915 non-null  float32       
dtypes: datetime64[ns](1), float32(14)
memory usage: 3.4 MB


None

Unnamed: 0,time,depth,latitude,longitude,bottomT,mlotst,siconc,sithick,sla,so,sob,thetao,uo,vo,wo
47680,2025-01-01 00:00:00,0.501646,54.358276,10.152689,6.129929,5.590637,0.0,0.0,-0.783050,16.313734,16.313805,6.129667,0.007948,0.000000,0.000015
47681,2025-01-01 01:00:00,0.501646,54.358276,10.152689,6.123942,5.537246,0.0,0.0,-0.836441,16.280558,16.280558,6.123942,0.004320,0.000000,0.000016
47682,2025-01-01 02:00:00,0.501646,54.358276,10.152689,6.119880,5.485316,0.0,0.0,-0.888371,16.244640,16.244654,6.119825,0.004066,0.000000,0.000016
47683,2025-01-01 03:00:00,0.501646,54.358276,10.152689,6.118432,5.438865,0.0,0.0,-0.934821,16.206421,16.206537,6.118350,0.003737,0.000000,0.000016
47684,2025-01-01 04:00:00,0.501646,54.358276,10.152689,6.119393,5.390626,0.0,0.0,-0.983061,16.168251,16.168253,6.119394,0.003704,0.000000,0.000017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143035,2025-01-31 20:00:00,0.501646,54.524940,10.347133,4.567245,10.031350,0.0,0.0,-0.111337,17.642529,19.848560,4.294259,0.104553,-0.137242,0.000002
143036,2025-01-31 21:00:00,0.501646,54.524940,10.347133,4.565339,10.018709,0.0,0.0,-0.127637,17.571630,19.834522,4.283749,0.103176,-0.145167,0.000002
143037,2025-01-31 22:00:00,0.501646,54.524940,10.347133,4.563229,10.015547,0.0,0.0,-0.131714,17.503935,19.819601,4.271446,0.090108,-0.148308,0.000001
143038,2025-01-31 23:00:00,0.501646,54.524940,10.347133,4.560693,10.021186,0.0,0.0,-0.124443,17.430189,19.804668,4.257999,0.060021,-0.145717,0.000001


In [9]:
db = Database(
    db_url=DB_URL,
    db_name=DB_NAME,
    collection_name=DB_COLLECTION
    )
    

db_data_all = db.get_all_data(key="time")
db.close_connection()

if db_data_all:
    df_db = pd.DataFrame(db_data_all).drop(columns=['_id']).loc[:, ['time', 'latitude', 'longitude']]


    df_db = process_dataframe(df_db)

    display(df_copernicus.shape)

    # Filtere Zeilen, die in df_db existieren
    db_tuples = set(zip(df_db["time"], df_db["latitude"], df_db["longitude"]))
    df_copernicus = df_copernicus[~df_copernicus.apply(lambda row: (row["time"], row["latitude"], row["longitude"]) in db_tuples, axis=1)]
    display(df_copernicus.shape)

In [10]:
# Helper Function
def upload_article_if_new(db_data, not_db_data):
    # Check if the article is already in the database
    for doc in db_data:
        if (doc.get('time') == not_db_data.get('time')) and (doc.get('latitude') == not_db_data.get('latitude')) and (doc.get('longitude') == not_db_data.get('longitude')):
            #print('Data already in the database, skipping upload...\n')
            return False
        
    return True

In [11]:
# cols = [
#     "time", "temperature_2m", "relative_humidity_2m", "dew_point_2m", "apparent_temperature", 
#     "precipitation_probability", "precipitation", "rain", "showers", "snowfall", "snow_depth",
#     "weather_code", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", 
#     "cloud_cover_mid", "cloud_cover_high", "visibility", "evapotranspiration", 
#     "et0_fao_evapotranspiration", "vapour_pressure_deficit", "wind_speed_10m", "wind_speed_80m", 
#     "wind_speed_120m", "wind_speed_180m", "wind_direction_10m", "wind_direction_80m", 
#     "wind_direction_120m", "wind_direction_180m", "wind_gusts_10m", "temperature_80m", 
#     "temperature_120m", "temperature_180m", "soil_temperature_0cm", "soil_temperature_6cm", 
#     "soil_temperature_18cm", "soil_temperature_54cm", "soil_moisture_0_to_1cm", 
#     "soil_moisture_1_to_3cm", "soil_moisture_3_to_9cm", "soil_moisture_9_to_27cm", 
#     "soil_moisture_27_to_81cm"]


for idx, (time, latitude, longitude) in enumerate(tqdm(zip(df_copernicus['time'], df_copernicus['latitude'], df_copernicus['longitude']), desc='Processing data', total=len(df_copernicus))):
    

    # bring time to isoformat
    time = time.isoformat().split('T')[0]
    
    open_meteo_weather = OpenMeteoWeather(
    latitude=[latitude],
    longitude=[longitude],
    start_date=time,
    end_date=time
    ) 

    df_openweather = open_meteo_weather.get_weather_dataframe().rename(columns={"date": "time"})
    df_openweather['time'] = df_openweather['time'].dt.tz_localize(None) # convert datetime64[ns, UTC] to datetime64[ns]
    df_openweather['latitude'] = latitude
    df_openweather['longitude'] = longitude
    
    df_merged = pd.merge(df_copernicus, df_openweather, on=['time', 'latitude', 'longitude'], how='inner')
    df_merged = process_dataframe(df_merged)
    
    # upload to database
    df_json = df_merged.to_json(orient='records')
    df_json = json.loads(df_json)
    for item in df_json:
        item["time"] = pd.to_datetime(item["time"], unit='ms')


        db = Database(
            db_url=DB_URL,
            db_name=DB_NAME,
            collection_name=DB_COLLECTION
            )

        db_data_all = db.get_all_data(key="time")
        if upload_article_if_new(db_data_all, item) == False:
            continue

        db.upload_one(item)
        db.close_connection()

Processing data:   0%|          | 0/49915 [00:00<?, ?it/s]

Processing data:   1%|          | 384/49915 [03:27<7:26:53,  1.85it/s] 


KeyboardInterrupt: 