# Process 781 Celestial Objects (Planets + Dwarf Planets + Asteroids)

This notebook is used to process data for planets, dwarf planets, and asteroids, outputting three files:
1. Ecliptic Longitude, Ecliptic Latitude, Distance, and Range Rate
2. Geometric Position Parameters (x, y, z)
3. Velocity Parameters (vx, vy, vz)

Actual number of objects: 19 Planets/Dwarf Planets + 762 Asteroids = 781 total.
Given the large volume of data, Parquet format is used for storage to improve efficiency.

In [None]:
# Extract data for major planets and dwarf planets and save as CSV

# List of bodies
planets = ['SSB', '199', '299', '399', '499', '599', '699', '799', '899']
dwarfs = ['Ceres', '999', '136108', '136472', '136199','90482','120347','50000','225088','90377']
combined_list = planets + dwarfs

import pandas as pd
import os

# Directory for final processed data
output_dir = "../../data/ready"

# Files to read from (Planets, Dwarfs, Asteroids)
files = [
    f"{output_dir}/781_planets_dwarfs_asteroids_lonlat.parquet",
    f"{output_dir}/781_planets_dwarfs_asteroids_velocity.parquet",
    f"{output_dir}/781_planets_dwarfs_asteroids_xyz.parquet"
]

# Check if files exist before processing
if all(os.path.exists(f) for f in files):
    # Read all files and merge
    dfs = [pd.read_parquet(file) for file in files]
    df_combined = pd.concat(dfs, axis=1)

    # Extract date column
    date_col = df_combined['date'].iloc[:, 0] if 'date' in df_combined.columns and isinstance(df_combined['date'], pd.DataFrame) else df_combined['date']

    # Extract columns for selected bodies
    selected_cols = [col for col in df_combined.columns if col != 'date' and any(planet in col for planet in combined_list)]
    df_selected = df_combined[selected_cols]

    # Insert date column at the beginning
    df_selected.insert(0, 'date', date_col.values)

    # Save as CSV for quick reference
    output_csv = f"{output_dir}/major_planets_dwarfs_subset.csv"
    df_selected.to_csv(output_csv, index=False)

    print(f"Data extraction complete. Saved to {output_csv}")
    print(f"Extracted columns: {len(selected_cols)}")
else:
    print("Note: Parquet files not found. Run the processing cells below first.")

Data extraction complete. Saved to ../../data/final/major_planets_dwarfs_subset.csv
Extracted columns: 240


In [1]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import pyarrow.parquet as pq
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Custom date parsing function to handle "A.D. YYYY-MMM-DD HH:MM:SS.SSSS" format
def parse_special_date(date_str):
    # Remove "A.D. " prefix
    if "A.D. " in date_str:
        date_str = date_str.replace("A.D. ", "")
    # Parse the remaining part
    try:
        return pd.to_datetime(date_str, format="%Y-%b-%d %H:%M:%S.%f")
    except:
        try:
            return pd.to_datetime(date_str)
        except:
            print(f"Unable to parse date: {date_str}")
            return pd.NaT

# Standardize date format function
def standardize_date(df):
    """Standardize date format to YYYY-MM-DD"""
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
    return df

print("Functions defined successfully")

Functions defined successfully


In [3]:
# Define folder paths to traverse
# TODO: Ensure these directories exist and contain the raw CSV files from Notebook 03
base_folders = {
    'lon_lat': {
        'planets_dwarfs': '../../data/00_raw/helio_ecl_sph_00h/planets_dwarfs_daily_1849',
        'asteroids': '../../data/00_raw/helio_ecl_sph_00h/asteroids_daily_1849'
    },
    'xyz': {
        'planets_dwarfs': '../../data/00_raw/helio_cart_states_00h/planets_dwarfs_daily_1849',
        'asteroids': '../../data/00_raw/helio_cart_states_00h/asteroids_daily_1849'
    }
}

# Get star processing order from CSV
def get_star_order():
    """Get the processing order of objects from a parameter CSV file"""
    # TODO: Update this path to the actual location of your 781 objects parameter file
    csv_file = '../../data/00_raw/781_planets_dwarfs_asteroids_params.csv' 
    if os.path.exists(csv_file):
        df_params = pd.read_csv(csv_file)
        # Check for common column names (English or Chinese)
        star_col = None
        for col in ['body', 'body_id', 'star_id', 'object', 'ID']:
            if col in df_params.columns:
                star_col = col
                break
        
        if star_col:
            star_list = df_params[star_col].astype(str).tolist()
            print(f"Read {len(star_list)} objects from {csv_file} using column '{star_col}'")
            return star_list
        else:
            print(f"Warning: Could not find a valid ID column in {csv_file}")
            return None
    else:
        print(f"Warning: Parameter file {csv_file} not found. Will use folder scanning.")
        return None

# Get star processing order
star_order = get_star_order()

# Check if folders exist and count files
total_files = 0
for data_type, folders in base_folders.items():
    for category, folder_path in folders.items():
        if os.path.exists(folder_path):
            file_count = len(glob.glob(os.path.join(folder_path, '*.csv')))
            total_files += file_count
            print(f"{data_type} - {category}: {folder_path} (Files: {file_count})")
        else:
            print(f"Warning: {folder_path} does not exist")

if total_files > 0:
    print(f"\nTotal files: {total_files//2} objects (each has lonlat and XYZ files)")
if star_order:
    print(f"Will process {len(star_order)} objects in the order specified by the CSV")

Read 781 objects from ../../data/00_raw/781_planets_dwarfs_asteroids_params.csv using column 'body'
lon_lat - planets_dwarfs: ../../data/00_raw/helio_ecl_sph_00h/planets_dwarfs_daily_1849 (Files: 19)
lon_lat - asteroids: ../../data/00_raw/helio_ecl_sph_00h/asteroids_daily_1849 (Files: 762)
xyz - planets_dwarfs: ../../data/00_raw/helio_cart_states_00h/planets_dwarfs_daily_1849 (Files: 19)
xyz - asteroids: ../../data/00_raw/helio_cart_states_00h/asteroids_daily_1849 (Files: 762)

Total files: 781 objects (each has lonlat and XYZ files)
Will process 781 objects in the order specified by the CSV


In [4]:
def process_lon_lat_data():
    """Process ecliptic longitude/latitude data, outputting lon, lat, range, and range_rate"""
    print("Starting processing of lon/lat data...")

    df_list = []
    processed_count = 0

    if star_order:
        # Process objects in the order specified by the CSV
        for star_id in tqdm(star_order, desc="Processing objects (CSV order)"):
            try:
                lonlat_file_found = False
                xyz_file_found = False
                lonlat_file_path = None
                xyz_file_path = None

                # Search for lonlat file
                for category, folder_path in base_folders['lon_lat'].items():
                    if not os.path.exists(folder_path):
                        continue

                    # Try different naming conventions
                    possible_files = [
                        os.path.join(folder_path, f"{star_id}_lonlat.csv"),
                        os.path.join(folder_path, f"{star_id}_lon_lat.csv"),
                        os.path.join(folder_path, f"{star_id}.csv")
                    ]
                    
                    glob_pattern = os.path.join(folder_path, f"{star_id}_*.csv")
                    possible_files.extend(glob.glob(glob_pattern))

                    for file_path in possible_files:
                        if os.path.exists(file_path):
                            lonlat_file_path = file_path
                            lonlat_file_found = True
                            break
                    if lonlat_file_found: break

                # Search for XYZ file (needed for range/range_rate)
                for category, folder_path in base_folders['xyz'].items():
                    if not os.path.exists(folder_path):
                        continue

                    possible_files = [
                        os.path.join(folder_path, f"{star_id}_xyz.csv"),
                        os.path.join(folder_path, f"{star_id}.csv")
                    ]
                    
                    glob_pattern = os.path.join(folder_path, f"{star_id}_*.csv")
                    possible_files.extend(glob.glob(glob_pattern))

                    for file_path in possible_files:
                        if os.path.exists(file_path):
                            xyz_file_path = file_path
                            xyz_file_found = True
                            break
                    if xyz_file_found: break

                if not lonlat_file_found or not xyz_file_found:
                    continue

                # Read lonlat data
                df_lonlat = pd.read_csv(lonlat_file_path)
                if 'date' in df_lonlat.columns:
                    df_lonlat['date'] = df_lonlat['date'].apply(parse_special_date)

                lonlat_cols = ['date']
                if 'lon' in df_lonlat.columns: lonlat_cols.append('lon')
                if 'lat' in df_lonlat.columns: lonlat_cols.append('lat')
                
                for col in df_lonlat.columns:
                    if col != 'date' and (col.endswith('_lon') or col.endswith('_lat')):
                        lonlat_cols.append(col)

                df_lonlat = df_lonlat[lonlat_cols]

                # Rename columns with star_id prefix
                column_mapping = {}
                for col in df_lonlat.columns:
                    if col != 'date':
                        if col == 'lon' or col.endswith('_lon'):
                            column_mapping[col] = f"{star_id}_lon"
                        elif col == 'lat' or col.endswith('_lat'):
                            column_mapping[col] = f"{star_id}_lat"
                        else:
                            column_mapping[col] = f"{star_id}_{col}"
                df_lonlat.rename(columns=column_mapping, inplace=True)

                # Read range data from XYZ file
                df_xyz = pd.read_csv(xyz_file_path)
                if 'date' in df_xyz.columns:
                    df_xyz['date'] = df_xyz['date'].apply(parse_special_date)

                distance_data = df_xyz[['date']].copy()
                if 'range' in df_xyz.columns:
                    distance_data[f"{star_id}_range"] = df_xyz['range']
                if 'range_rate' in df_xyz.columns:
                    distance_data[f"{star_id}_range_rate"] = df_xyz['range_rate']

                # Merge lonlat and distance for this star
                df_temp = pd.merge(df_lonlat, distance_data, on='date', how='left')
                
                # Set date as index for efficient concatenation later
                df_temp.set_index('date', inplace=True)
                df_list.append(df_temp)

                processed_count += 1

            except Exception as e:
                print(f"Error processing object {star_id}: {e}")
                continue
    else:
        print("Object order not provided. Skipping folder scanning implementation for brevity.")

    if df_list:
        print(f"Concatenating {len(df_list)} objects...")
        # Use pd.concat with axis=1 for much faster merging than pd.merge in a loop
        merged_df = pd.concat(df_list, axis=1).reset_index()
        
        merged_df = standardize_date(merged_df)
        cols = ['date'] + [col for col in merged_df.columns if col != 'date']
        merged_df = merged_df[cols]

        output_file = '../../data/ready/781_planets_dwarfs_asteroids_lonlat.parquet'
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        merged_df.to_parquet(output_file, index=False, compression='snappy')

        print(f"Lon/Lat data processing complete!")
        print(f"Processed {processed_count} objects")
        print(f"Saved to: {output_file}")
        return merged_df
    return None

In [5]:
def process_xyz_position_data():
    """Process XYZ position data"""
    print("Starting processing of XYZ position data...")
    
    merged_df = None
    processed_count = 0
    
    if star_order:
        for star_id in tqdm(star_order, desc="Processing XYZ positions"):
            try:
                file_found = False
                file_path = None
                
                for category, folder_path in base_folders['xyz'].items():
                    if not os.path.exists(folder_path): continue
                    
                    possible_files = [
                        os.path.join(folder_path, f"{star_id}_xyz.csv"),
                        os.path.join(folder_path, f"{star_id}.csv")
                    ]
                    glob_pattern = os.path.join(folder_path, f"{star_id}_*.csv")
                    possible_files.extend(glob.glob(glob_pattern))
                    
                    for possible_file in possible_files:
                        if os.path.exists(possible_file):
                            file_path = possible_file
                            file_found = True
                            break
                    if file_found: break
                
                if not file_found: continue
                
                df_temp = pd.read_csv(file_path)
                if 'date' in df_temp.columns:
                    df_temp['date'] = df_temp['date'].apply(parse_special_date)
                
                position_cols = ['date']
                for c in ['x', 'y', 'z']:
                    if c in df_temp.columns: position_cols.append(c)
                
                df_temp = df_temp[position_cols]
                column_mapping = {c: f"{star_id}_{c}" for c in ['x', 'y', 'z'] if c in df_temp.columns}
                df_temp.rename(columns=column_mapping, inplace=True)
                
                if merged_df is None:
                    merged_df = df_temp
                else:
                    merged_df = pd.merge(merged_df, df_temp, on='date', how='outer')
                
                processed_count += 1
            except Exception as e:
                print(f"Error processing XYZ for {star_id}: {e}")
                continue
    
    if merged_df is not None:
        merged_df = standardize_date(merged_df)
        cols = ['date'] + [col for col in merged_df.columns if col != 'date']
        merged_df = merged_df[cols]
        
        output_file = '../../data/ready/781_planets_dwarfs_asteroids_xyz.parquet'
        merged_df.to_parquet(output_file, index=False, compression='snappy')
        print(f"XYZ position data processing complete! Saved to: {output_file}")
        return merged_df
    return None

In [6]:
def process_velocity_data():
    """Process velocity data (vx, vy, vz)"""
    print("Starting processing of velocity data...")
    
    merged_df = None
    processed_count = 0
    
    if star_order:
        for star_id in tqdm(star_order, desc="Processing velocity data"):
            try:
                file_found = False
                file_path = None
                
                for category, folder_path in base_folders['xyz'].items():
                    if not os.path.exists(folder_path): continue
                    
                    possible_files = [
                        os.path.join(folder_path, f"{star_id}_xyz.csv"),
                        os.path.join(folder_path, f"{star_id}.csv")
                    ]
                    glob_pattern = os.path.join(folder_path, f"{star_id}_*.csv")
                    possible_files.extend(glob.glob(glob_pattern))
                    
                    for possible_file in possible_files:
                        if os.path.exists(possible_file):
                            file_path = possible_file
                            file_found = True
                            break
                    if file_found: break
                
                if not file_found: continue
                
                df_temp = pd.read_csv(file_path)
                if 'date' in df_temp.columns:
                    df_temp['date'] = df_temp['date'].apply(parse_special_date)
                
                velocity_cols = ['date']
                for c in ['vx', 'vy', 'vz']:
                    if c in df_temp.columns: velocity_cols.append(c)
                
                df_temp = df_temp[velocity_cols]
                column_mapping = {c: f"{star_id}_{c}" for c in ['vx', 'vy', 'vz'] if c in df_temp.columns}
                df_temp.rename(columns=column_mapping, inplace=True)
                
                if merged_df is None:
                    merged_df = df_temp
                else:
                    merged_df = pd.merge(merged_df, df_temp, on='date', how='outer')
                
                processed_count += 1
            except Exception as e:
                print(f"Error processing velocity for {star_id}: {e}")
                continue
    
    if merged_df is not None:
        merged_df = standardize_date(merged_df)
        cols = ['date'] + [col for col in merged_df.columns if col != 'date']
        merged_df = merged_df[cols]
        
        output_file = '../../data/ready/781_planets_dwarfs_asteroids_velocity.parquet'
        merged_df.to_parquet(output_file, index=False, compression='snappy')
        print(f"Velocity data processing complete! Saved to: {output_file}")
        return merged_df
    return None

In [7]:
# Execute all data processing functions
print("Starting processing of 781 objects (Planets, Dwarfs, Asteroids) data...")
print("=" * 60)

# 1. Process Lon/Lat and Distance data
print("1. Processing Lon/Lat and Distance data")
lon_lat_df = process_lon_lat_data()
print()

# 2. Process XYZ position data  
print("2. Processing XYZ position data")
position_df = process_xyz_position_data()
print()

# 3. Process velocity data
print("3. Processing velocity data")
velocity_df = process_velocity_data()
print()

print("All data processing complete!")

Starting processing of 781 objects (Planets, Dwarfs, Asteroids) data...
1. Processing Lon/Lat and Distance data
Starting processing of lon/lat data...


Processing objects (CSV order): 100%|█████████| 781/781 [55:31<00:00,  4.27s/it]


Concatenating 781 objects...
Lon/Lat data processing complete!
Processed 781 objects
Saved to: ../../data/ready/781_planets_dwarfs_asteroids_lonlat.parquet

2. Processing XYZ position data
Starting processing of XYZ position data...


Processing XYZ positions: 100%|███████████████| 781/781 [29:29<00:00,  2.27s/it]


XYZ position data processing complete! Saved to: ../../data/ready/781_planets_dwarfs_asteroids_xyz.parquet

3. Processing velocity data
Starting processing of velocity data...


Processing velocity data: 100%|███████████████| 781/781 [29:33<00:00,  2.27s/it]


Velocity data processing complete! Saved to: ../../data/ready/781_planets_dwarfs_asteroids_velocity.parquet

All data processing complete!


In [8]:
# Data Summary and Validation
print("=" * 50)
print("Data Processing Summary")
print("=" * 50)

output_files = [
    '../../data/ready/781_planets_dwarfs_asteroids_lonlat.parquet',
    '../../data/ready/781_planets_dwarfs_asteroids_xyz.parquet', 
    '../../data/ready/781_planets_dwarfs_asteroids_velocity.parquet'
]

for file in output_files:
    if os.path.exists(file):
        file_size = os.path.getsize(file) / (1024 * 1024)  # MB
        print(f"✓ {file} - Size: {file_size:.2f} MB")
        
        try:
            df = pd.read_parquet(file)
            print(f"  Dimensions: {df.shape[0]} rows x {df.shape[1]} columns")
            print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
            
            # Estimate number of objects
            non_date_cols = [col for col in df.columns if col != 'date']
            if 'lonlat' in file:
                star_count = len([col for col in non_date_cols if col.endswith('_lon')])
            elif 'xyz' in file:
                star_count = len([col for col in non_date_cols if col.endswith('_x')])
            elif 'velocity' in file:
                star_count = len([col for col in non_date_cols if col.endswith('_vx')])
            else:
                star_count = len(non_date_cols) // 3
            
            print(f"  Number of objects: {star_count}")
            print(f"  Column examples: {list(df.columns[:5])}...")
            print()
        except Exception as e:
            print(f"  Error reading file: {e}")
    else:
        print(f"✗ {file} - File not found")

print("Processing complete! All data saved in Parquet format for efficient access.")

Data Processing Summary
✓ ../../data/ready/781_planets_dwarfs_asteroids_lonlat.parquet - Size: 2228.45 MB
  Dimensions: 73780 rows x 3125 columns
  Date range: 1849-01-01 to 2051-01-01
  Number of objects: 781
  Column examples: ['date', 'SSB_lon', 'SSB_lat', 'SSB_range', 'SSB_range_rate']...

✓ ../../data/ready/781_planets_dwarfs_asteroids_xyz.parquet - Size: 1671.37 MB
  Dimensions: 73780 rows x 2344 columns
  Date range: 1849-01-01 to 2051-01-01
  Number of objects: 781
  Column examples: ['date', 'SSB_x', 'SSB_y', 'SSB_z', '199_x']...

✓ ../../data/ready/781_planets_dwarfs_asteroids_velocity.parquet - Size: 1671.38 MB
  Dimensions: 73780 rows x 2344 columns
  Date range: 1849-01-01 to 2051-01-01
  Number of objects: 781
  Column examples: ['date', 'SSB_vx', 'SSB_vy', 'SSB_vz', '199_vx']...

Processing complete! All data saved in Parquet format for efficient access.
