In [1]:
from utils.Database import Database
from utils.PlanetPositions import PlanetPositions
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import json

In [2]:
# Define Absolute maximum and minimum values for date and location
ABSOLUTE_END_DATE:str = datetime.datetime.now().isoformat().split("T")[0]


START_DATE:str = "2025-01-01"
END_DATE:str = "2025-02-1"

DB_URL = 'localhost'
#DB_URL = 'host.docker.internal'
DB_NAME = 'deep-learning'
DB_COLLECTION = 'planet-data'

In [3]:

# ------------ Helper Functions ------------

def process_dataframe(df:pd.DataFrame) -> pd.DataFrame:
    for column in df.select_dtypes(include=["float"]).columns:
        df[column] = df[column].astype(np.float32)  # Konvertiere alle Float-Typen zu float32
    df['time'] = pd.to_datetime(df['time'], unit='s', format='ISO8601').dt.tz_localize(None).dt.round('H')
# df['time'] = pd.to_datetime(df['time'])

# # Runde auf die nächste Stunde
# df['time'] = df['time'].dt.round('H')
    return df

In [4]:
pp = PlanetPositions(start_date=START_DATE, stop_date=END_DATE, step='1h')
pp.fetch_data()
pp.convert_time()
df_planet = pp.get_dataframe()



In [5]:

# convert colum datetime to YYYY-MM-DD HH:MM:SS
#df_planet['datetime'] = df_planet['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
df_planet = df_planet.drop(columns=['datetime_str', 'planet']).rename(columns={'datetime':'time', 'targetname':'planet'})
# put time column to the first position
df_planet = df_planet[['time'] + [col for col in df_planet.columns if col != 'time']]

df_planet = process_dataframe(df_planet)
display(df_planet.head())
display(df_planet.info())

  df['time'] = pd.to_datetime(df['time'], unit='s', format='ISO8601').dt.tz_localize(None).dt.round('H')


Unnamed: 0,time,planet,x,y,z,vx,vy,vz,lighttime,range,range_rate
0,2025-01-01 01:00:00,Mercury (199),-0.20862,-1.128707,0.022359,0.022229,-0.02155,-0.002483,0.006631,1.148042,0.017099
1,2025-01-01 02:00:00,Mercury (199),-0.207692,-1.129604,0.022255,0.022291,-0.02151,-0.002487,0.006635,1.148754,0.017073
2,2025-01-01 03:00:00,Mercury (199),-0.206762,-1.130499,0.022152,0.022353,-0.021471,-0.00249,0.006639,1.149465,0.017048
3,2025-01-01 04:00:00,Mercury (199),-0.205829,-1.131393,0.022048,0.022414,-0.021431,-0.002494,0.006643,1.150175,0.017022
4,2025-01-01 05:00:00,Mercury (199),-0.204894,-1.132285,0.021944,0.022476,-0.021391,-0.002498,0.006647,1.150884,0.016996


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6705 entries, 0 to 6704
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   time        6705 non-null   datetime64[ns]
 1   planet      6705 non-null   object        
 2   x           6705 non-null   float32       
 3   y           6705 non-null   float32       
 4   z           6705 non-null   float32       
 5   vx          6705 non-null   float32       
 6   vy          6705 non-null   float32       
 7   vz          6705 non-null   float32       
 8   lighttime   6705 non-null   float32       
 9   range       6705 non-null   float32       
 10  range_rate  6705 non-null   float32       
dtypes: datetime64[ns](1), float32(9), object(1)
memory usage: 340.6+ KB


None

In [6]:
db = Database(
    db_url=DB_URL,
    db_name=DB_NAME,
    collection_name=DB_COLLECTION
    )
    

db_data_all = db.get_all_data(key="time")
db.close_connection()

if db_data_all:
    df_db = pd.DataFrame(db_data_all).drop(columns=['_id']).loc[:, ['time', 'planet']]

    df_db = process_dataframe(df_db)

    # Filtere Zeilen, die in df_db existieren
    db_tuples = set(zip(df_db["time"], df_db["planet"]))
    df_planet = df_planet[~df_planet.apply(lambda row: (row["time"], row["planet"]) in db_tuples, axis=1)]



  df['time'] = pd.to_datetime(df['time'], unit='s', format='ISO8601').dt.tz_localize(None).dt.round('H')


In [7]:
df_planet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3356 entries, 3349 to 6704
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   time        3356 non-null   datetime64[ns]
 1   planet      3356 non-null   object        
 2   x           3356 non-null   float32       
 3   y           3356 non-null   float32       
 4   z           3356 non-null   float32       
 5   vx          3356 non-null   float32       
 6   vy          3356 non-null   float32       
 7   vz          3356 non-null   float32       
 8   lighttime   3356 non-null   float32       
 9   range       3356 non-null   float32       
 10  range_rate  3356 non-null   float32       
dtypes: datetime64[ns](1), float32(9), object(1)
memory usage: 196.6+ KB


In [8]:
df_planet['planet'].unique()

array(['Jupiter (599)', 'Saturn (699)', 'Uranus (799)', 'Neptune (899)',
       'Moon (301)'], dtype=object)

In [9]:
# Helper Function
def upload_article_if_new(db_data, not_db_data):
    # Check if the article is already in the database
    for doc in db_data:
        if (doc.get('time') == not_db_data.get('time')) and (doc.get('planet') == not_db_data.get('planet')):
            #print('Data already in the database, skipping upload...\n')
            return False
        
    return True

In [None]:
print("\nParsing data to upload to Database...\n")

# upload to database
df_json = df_planet.to_json(orient='records')
df_json = json.loads(df_json)
for item in tqdm(df_json, desc="Uploading data to Database", total=len(df_json)):
    item["time"] = pd.to_datetime(item["time"], unit='ms')


    db = Database(
        db_url=DB_URL,
        db_name=DB_NAME,
        collection_name=DB_COLLECTION
        )

    db_data_all = db.get_all_data(key="time")
    if upload_article_if_new(db_data_all, item) == False:
        continue

    db.upload_one(item)
    #print(f"Data for {item['planet']} at {item['time']} uploaded to Database successfully!\n")
    db.close_connection()

print("Data uploaded to Database successfully!\n")
print("Finished!\n")


Parsing data to upload to Database...



Uploading data to Database:  46%|████▌     | 1545/3356 [03:32<05:33,  5.43it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10a79e750>>
Traceback (most recent call last):
  File "/Users/marco/Documents/VirtualEnvironments/.main/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Uploading data to Database:  48%|████▊     | 1613/3356 [03:45<06:41,  4.34it/s]