In [57]:

from utils.Database import Database
from utils.Copernicus import AdvancedCopernicus
from utils.OpenMeteoWeather import OpenMeteoWeather
from utils.PlanetPositions import PlanetPositions
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import json


# ------------ Initialize the global Variables ------------

# Define Absolute maximum and minimum values for date and location
ABSOLUTE_END_DATE:str = datetime.datetime.now().isoformat().split("T")[0]

ABSOLUTE_MINIMUM_LONGITUDE:float = 9.041532516479492
ABSOLUTE_MAXIMUM_LONGITUDE:float = 30.208656311035156
ABSOLUTE_MINIMUM_LATIDUDE:float = 53.00829315185547
ABSOLUTE_MAXIMUM_LATIDUDE:float = 65.89141845703125

START_DATE:str = "2025-01-01"
END_DATE:str = "2025-02-1"

MINIMUM_LONGITUDE:float = 9.59
MAXIMUM_LONGITUDE:float = 12.037
MINIMUM_LATIDUDE:float = 55.486
MAXIMUM_LATIDUDE:float = 55.616

# For Testing
MINIMUM_LONGITUDE=10.038345850696412
MAXIMUM_LONGITUDE=10.365962458698567
MINIMUM_LATIDUDE=54.27381478077755
MAXIMUM_LATIDUDE=54.52976525577923

OUTPUT_FILENAME:str = "output.nc"

DB_URL = 'localhost'
#DB_URL = 'host.docker.internal'
DB_NAME = 'deep-learning'
#DB_COLLECTION = 'ozean-weather-data'
DB_COLLECTION = 'test'




# ------------ Initialize the classes ------------
AdvancedCopernicus = AdvancedCopernicus()


# ------------ Helper Functions ------------

def process_dataframe(df:pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values(by=["time", "latitude", "longitude"]).reset_index(drop=True)
    for column in df.select_dtypes(include=["float"]).columns:
        df[column] = df[column].astype(np.float32)  # Konvertiere alle Float-Typen zu float32
    df["latitude"] = df["latitude"].astype(np.float32).round(6)
    df["longitude"] = df["longitude"].astype(np.float32).round(6)
    
    df['time'] = pd.to_datetime(df['time'],format='%Y-%m-%d %H:%M:%S').dt.tz_localize(None).dt.round('h')
    #df['time'] = df['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
    

    
    return df


# ------------ Get data from AdvancedCopernicus ------------
print("\nGetting data from AdvancedCopernicus...\n")
copernicus_data = AdvancedCopernicus.get_subset(
                dataset_id="cmems_mod_bal_phy_anfc_PT1H-i",
                dataset_version="202411",
                variables=["bottomT", "mlotst", "siconc", "sithick", "sla", "so", "sob", "thetao", "uo", "vo", "wo"], 
                minimum_longitude=MINIMUM_LONGITUDE,
                maximum_longitude=MAXIMUM_LONGITUDE,
                minimum_latitude=MINIMUM_LATIDUDE,
                maximum_latitude=MAXIMUM_LATIDUDE,
                start_datetime=START_DATE,
                end_datetime=END_DATE,
                minimum_depth=0.5016462206840515,
                maximum_depth=0.5016462206840515,
                coordinates_selection_method="strict-inside",
                disable_progress_bar=False,
                output_filename=OUTPUT_FILENAME
                )
            


df_copernicus = copernicus_data.to_dataframe().reset_index()
# put column time in front
df_copernicus = df_copernicus[["time"] + [col for col in df_copernicus.columns if col != "time"]]

df_copernicus.dropna(axis=0, 
                     subset=["bottomT", "mlotst", "siconc", "sithick", "sla", "so", "sob", "thetao", "uo", "vo", "wo"], 
                     inplace=True,
                     how="all")

df_copernicus = process_dataframe(df_copernicus)



Getting data from AdvancedCopernicus...



INFO - 2025-02-23T20:10:17Z - Selected dataset version: "202411"
INFO - 2025-02-23T20:10:17Z - Selected dataset part: "default"
INFO - 2025-02-23T20:10:22Z - Starting download. Please wait...


  0%|          | 0/44 [00:00<?, ?it/s]

INFO - 2025-02-23T20:10:26Z - Successfully downloaded to output.nc


In [58]:
df_copernicus

Unnamed: 0,time,depth,latitude,longitude,bottomT,mlotst,siconc,sithick,sla,so,sob,thetao,uo,vo,wo
0,2025-01-01,0.501646,54.358276,10.152689,6.129929,5.590637,0.0,0.0,-0.783050,16.313734,16.313805,6.129667,0.007948,0.000000,1.474654e-05
1,2025-01-01,0.501646,54.358276,10.180467,6.161064,5.600132,0.0,0.0,-0.773554,15.677505,15.677505,6.161064,0.007948,0.016681,2.050040e-06
2,2025-01-01,0.501646,54.374943,10.180467,6.057038,5.604092,0.0,0.0,-0.769595,17.161772,17.162859,6.056765,0.026213,0.016681,1.141764e-06
3,2025-01-01,0.501646,54.374943,10.208244,6.073330,5.066607,0.0,0.0,-0.760337,17.871761,17.872192,6.072994,0.026213,0.035029,-1.742448e-06
4,2025-01-01,0.501646,54.391609,10.208244,6.222294,6.402395,0.0,0.0,-0.753243,18.472591,18.473347,6.221699,0.000000,0.035257,9.226017e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49910,2025-02-01,0.501646,54.524940,10.236022,4.605755,12.927186,0.0,0.0,-0.119356,19.095070,20.107540,4.447735,-0.051124,-0.050756,1.314921e-06
49911,2025-02-01,0.501646,54.524940,10.263800,4.586040,12.931259,0.0,0.0,-0.115282,18.563646,19.840172,4.352221,-0.039240,-0.075840,1.209104e-06
49912,2025-02-01,0.501646,54.524940,10.291577,4.547080,12.934582,0.0,0.0,-0.111960,18.127897,19.539555,4.277977,-0.017640,-0.101826,1.197222e-06
49913,2025-02-01,0.501646,54.524940,10.319355,4.526667,10.033236,0.0,0.0,-0.108906,17.703772,19.484066,4.246813,0.001270,-0.120941,1.101031e-06


In [59]:
# db = Database(
#     db_url=DB_URL,
#     db_name=DB_NAME,
#     collection_name=DB_COLLECTION
#     )
    

# db_data_all = db.get_all_data(key="time")
# db.close_connection()

# if db_data_all:
#     df_db = pd.DataFrame(db_data_all).drop(columns=['_id']).loc[:, ['time', 'latitude', 'longitude']]

#     df_db = process_dataframe(df_db)

#     # Filtere Zeilen, die in df_db existieren
#     db_tuples = set(zip(df_db["time"], df_db["latitude"], df_db["longitude"]))
#     df_copernicus = df_copernicus[~df_copernicus.apply(lambda row: (row["time"], row["latitude"], row["longitude"]) in db_tuples, axis=1)]
#     print(f'Reduced data {len(df_copernicus)}')


# # Helper Function
# def upload_article_if_new(db_data, not_db_data):
#     # Check if the article is already in the database
#     for doc in db_data:
#         if (doc.get('time') == not_db_data.get('time')) and (doc.get('latitude') == not_db_data.get('latitude')) and (doc.get('longitude') == not_db_data.get('longitude')):
#             #print('Data already in the database, skipping upload...\n')
#             return False
        
#     return True



In [60]:
# ignore SettingWithCopyWarning
pd.options.mode.chained_assignment = None

In [61]:

display(df_copernicus.head())
set_data = set(zip(df_copernicus["time"], df_copernicus["latitude"], df_copernicus["longitude"]))
print(list(set_data)[:10])

dd

Unnamed: 0,time,depth,latitude,longitude,bottomT,mlotst,siconc,sithick,sla,so,sob,thetao,uo,vo,wo
0,2025-01-01,0.501646,54.358276,10.152689,6.129929,5.590637,0.0,0.0,-0.78305,16.313734,16.313805,6.129667,0.007948,0.0,1.474654e-05
1,2025-01-01,0.501646,54.358276,10.180467,6.161064,5.600132,0.0,0.0,-0.773554,15.677505,15.677505,6.161064,0.007948,0.016681,2.05004e-06
2,2025-01-01,0.501646,54.374943,10.180467,6.057038,5.604092,0.0,0.0,-0.769595,17.161772,17.162859,6.056765,0.026213,0.016681,1.141764e-06
3,2025-01-01,0.501646,54.374943,10.208244,6.07333,5.066607,0.0,0.0,-0.760337,17.871761,17.872192,6.072994,0.026213,0.035029,-1.742448e-06
4,2025-01-01,0.501646,54.391609,10.208244,6.222294,6.402395,0.0,0.0,-0.753243,18.472591,18.473347,6.221699,0.0,0.035257,9.226017e-07


[(Timestamp('2025-01-26 02:00:00'), 54.47494125366211, 10.291577339172363), (Timestamp('2025-01-20 10:00:00'), 54.491607666015625, 10.180466651916504), (Timestamp('2025-01-30 16:00:00'), 54.42494583129883, 10.180466651916504), (Timestamp('2025-01-22 04:00:00'), 54.508277893066406, 10.291577339172363), (Timestamp('2025-01-11 05:00:00'), 54.491607666015625, 10.347132682800293), (Timestamp('2025-01-24 00:00:00'), 54.491607666015625, 10.124911308288574), (Timestamp('2025-01-16 01:00:00'), 54.524940490722656, 10.263799667358398), (Timestamp('2025-01-28 00:00:00'), 54.458274841308594, 10.347132682800293), (Timestamp('2025-01-23 14:00:00'), 54.508277893066406, 10.347132682800293), (Timestamp('2025-01-13 05:00:00'), 54.44160842895508, 10.291577339172363)]


NameError: name 'dd' is not defined

In [None]:
time_list = df_copernicus["time"].to_list()
lat_list = df_copernicus["latitude"].to_list()
lon_list = df_copernicus["longitude"].to_list()
# for idx, row in tqdm(df_copernicus.iterrows(), total=len(df_copernicus)):
    
#     time = time_list[idx]
#     latitude = lat_list
#     latitude = lon_list

#     time = time.isoformat().split('T')[0]
    
#     open_meteo_weather = OpenMeteoWeather(
#     latitudes=latitude,
#     longitudes=latitude,
#     start_date=time,
#     end_date=time
#     ) 
#     df_openweather = open_meteo_weather.get_weather_dataframe().rename(columns={"date": "time"})

#     break

# df_openweather
    

Processing weather data: 100%|██████████| 49532/49532 [01:22<00:00, 603.17it/s]
  0%|          | 0/49532 [03:15<?, ?it/s]


Unnamed: 0,latitude,longitude,time,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation_probability,precipitation,rain,...,temperature_180m,soil_temperature_0cm,soil_temperature_6cm,soil_temperature_18cm,soil_temperature_54cm,soil_moisture_0_to_1cm,soil_moisture_1_to_3cm,soil_moisture_3_to_9cm,soil_moisture_9_to_27cm,soil_moisture_27_to_81cm
0,10.152689,10.152689,2024-12-31 23:00:00,21.586000,28.690823,2.6360,18.164694,,0.0,0.0,...,,,,,,,,,,
1,10.152689,10.152689,2025-01-01 00:00:00,20.636000,30.306108,2.5860,17.269659,,0.0,0.0,...,,,,,,,,,,
2,10.152689,10.152689,2025-01-01 01:00:00,19.086000,34.321316,2.9860,16.142769,,0.0,0.0,...,,,,,,,,,,
3,10.152689,10.152689,2025-01-01 02:00:00,18.386000,35.603489,2.8860,15.410904,,0.0,0.0,...,,,,,,,,,,
4,10.152689,10.152689,2025-01-01 03:00:00,17.736000,37.087692,2.8860,14.771370,,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188763,10.347133,10.347133,2025-01-01 18:00:00,27.253500,20.027182,2.3535,24.330475,,0.0,0.0,...,,,,,,,,,,
1188764,10.347133,10.347133,2025-01-01 19:00:00,26.053499,22.667931,3.1035,23.203867,,0.0,0.0,...,,,,,,,,,,
1188765,10.347133,10.347133,2025-01-01 20:00:00,25.653500,22.563240,2.7035,22.362225,,0.0,0.0,...,,,,,,,,,,
1188766,10.347133,10.347133,2025-01-01 21:00:00,24.453501,24.495115,2.8535,21.278814,,0.0,0.0,...,,,,,,,,,,


In [None]:
open_meteo_weather = OpenMeteoWeather(
latitudes=latitude,
longitudes=latitude,
start_date=time,
end_date=time
) 
df_openweather = open_meteo_weather.get_weather_dataframe().rename(columns={"date": "time"})


In [None]:
dd

NameError: name 'dd' is not defined

In [None]:
for time, lat, lon in tqdm(df_copernicus[['time', 'latitude', 'longitude']].values, total=len(df_copernicus)):
    print(time, lat, lon)
    break
    # data = df_copernicus[(df_copernicus['time'] == time) & (df_copernicus['latitude'] == lat) & (df_copernicus['longitude'] == lon)]
    # data = process_dataframe(data).to_dict(orient='records')[0]
    # data["time"] = datetime.datetime.fromtimestamp(data["time"].timestamp())

    # for doc in db_data_all:
    #     if (doc.get('time') == data.get('time')) and (doc.get('latitude') == data.get('latitude')) and (doc.get('longitude') == data.get('longitude')):
    #         print('Data already in the database, skipping upload...\n')
    #         break
    
    
    #display(data)
    #break

  0%|          | 0/49532 [00:00<?, ?it/s]

2025-01-16 23:00:00 54.3582763671875 10.152688980102539





In [None]:

print("\nParsing data to upload to Database...\n")
for idx, (time, latitude, longitude) in enumerate(tqdm(zip(df_copernicus['time'], df_copernicus['latitude'], df_copernicus['longitude']), desc='Processing data', total=len(df_copernicus))):

    # bring time to isoformat
    time = time.isoformat().split('T')[0]
    
    open_meteo_weather = OpenMeteoWeather(
    latitudes=[latitude],
    longitudes=[longitude],
    start_date=time,
    end_date=time
    ) 

    df_openweather = open_meteo_weather.get_weather_dataframe().rename(columns={"date": "time"})
    df_openweather['time'] = df_openweather['time'].dt.tz_localize(None) # convert datetime64[ns, UTC] to datetime64[ns]
    df_openweather['latitude'] = latitude
    df_openweather['longitude'] = longitude
    
    df_merged = pd.merge(df_copernicus, df_openweather, on=['time', 'latitude', 'longitude'], how='inner')
    df_merged = process_dataframe(df_merged)
    print(df_merged.info())
    print(df_merged.head())
    break

    # upload to database
    df_json = df_merged.to_json(orient='records')
    df_json = json.loads(df_json)

    db = Database(
        db_url=DB_URL,
        db_name=DB_NAME,
        collection_name=DB_COLLECTION
        )
    upload_list = []
    for item in df_json:
        item["time"] = pd.to_datetime(item["time"], unit='ms')

        db_data_all = db.get_all_data(key="time")
        if upload_article_if_new(db_data_all, item) == False:
            continue
        upload_list.append(item)

    db.upload_many(upload_list)
    db.close_connection()

print("Data uploaded to Database successfully!\n")
print("Finished!\n")



Parsing data to upload to Database...



Processing data:   0%|          | 0/49532 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 57 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   time                        0 non-null      datetime64[ns]
 1   depth                       0 non-null      float32       
 2   latitude                    0 non-null      float32       
 3   longitude                   0 non-null      float32       
 4   bottomT                     0 non-null      float32       
 5   mlotst                      0 non-null      float32       
 6   siconc                      0 non-null      float32       
 7   sithick                     0 non-null      float32       
 8   sla                         0 non-null      float32       
 9   so                          0 non-null      float32       
 10  sob                         0 non-null      float32       
 11  thetao                      0 non-null      float32       
 12  uo    




In [None]:
hourly_dataframe

Unnamed: 0,date,temperature_2m
0,2025-02-21 00:00:00+00:00,1.4455
1,2025-02-21 01:00:00+00:00,1.6955
2,2025-02-21 02:00:00+00:00,1.7955
3,2025-02-21 03:00:00+00:00,2.0955
4,2025-02-21 04:00:00+00:00,1.9455
5,2025-02-21 05:00:00+00:00,1.8455
6,2025-02-21 06:00:00+00:00,1.7955
7,2025-02-21 07:00:00+00:00,1.8955
8,2025-02-21 08:00:00+00:00,2.5955
9,2025-02-21 09:00:00+00:00,3.4455
