In [14]:
import fastapi
import pandas as pd
import numpy as np
from utils.Database import Database


DB_CONFIG = {
    "url": "localhost",
    "name": "deep-learning",
    "collection": "test"
}

db = Database(
    db_url=DB_CONFIG["url"], 
    db_name=DB_CONFIG["name"], 
    collection_name=DB_CONFIG["collection"]
    )

db_data_all = db.get_all_data(key="time")
db.close_connection()

df_db = pd.DataFrame(db_data_all).drop(columns=["_id"])



In [23]:
def process_dataframe(df: pd.DataFrame, convert_time: bool = False, drop_duplicates: bool = False, reorder: bool = False) -> pd.DataFrame:
    """Converts float columns to float32 and rounds latitude/longitude for consistency."""
    
    if drop_duplicates:
        df = df.drop_duplicates(keep='first')
        
    float_cols = df.select_dtypes(include=["float"]).columns
    df[float_cols] = df[float_cols].astype(np.float32)

    df["latitude"] = df["latitude"].astype(np.float32).round(6)
    df["longitude"] = df["longitude"].astype(np.float32).round(6)

    if convert_time and not np.issubdtype(df['time'].dtype, np.datetime64):
        df["time"] = pd.to_datetime(df["time"])

    # put time, latitude, and longitude sla columns first
    if reorder:
        cols = ["time", "latitude", "longitude", "sla"]
        df = df[cols + [col for col in df.columns if col not in cols]]

    return df

In [24]:
print(df_db.shape)
df_cleaned = process_dataframe(df_db, convert_time=True, drop_duplicates=True, reorder=True)
df_cleaned = df_cleaned.dropna(axis=1, how='all')
display(df_cleaned.info())

(1380, 57)
<class 'pandas.core.frame.DataFrame'>
Index: 690 entries, 0 to 1376
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   time                        690 non-null    datetime64[ns]
 1   latitude                    690 non-null    float32       
 2   longitude                   690 non-null    float32       
 3   sla                         690 non-null    float32       
 4   vapour_pressure_deficit     690 non-null    float32       
 5   cloud_cover_mid             690 non-null    float32       
 6   apparent_temperature        690 non-null    float32       
 7   wind_speed_10m              690 non-null    float32       
 8   weather_code                690 non-null    float32       
 9   wind_direction_10m          690 non-null    float32       
 10  rain                        690 non-null    float32       
 11  wind_gusts_10m              690 non-null    float32

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[float_cols] = df[float_cols].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["latitude"] = df["latitude"].astype(np.float32).round(6)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["longitude"] = df["longitude"].astype(np.float32).round(6)


None