# Generate Asteroid CSV for Calibration Level 3 Data

This code streamlines the process of identifying new asteroid detections in the MIRI database by focusing exclusively on observations that have not yet been processed. It compares the observations in the current MIRI database with those recorded in the previously generated 'Level3_Asteroid_Search.csv'. Any new observations that were not included in the initial search are processed using the Level 3 detection pipeline. The results of this incremental search are then appended to an updated CSV, creating an updated and comprehensive dataset of asteroid detections. This approach avoids reprocessing already-searched data, significantly reducing runtime and improving efficiency.

## Import Libraries

In [1]:
import time
import requests
import json
import re
import os
import sys
import logging
import glob

import shapely.wkt
from shapely.geometry import Point

from PIL import Image
from sbident import SBIdent

import numpy as np
import math as mt
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Circle

from astropy.io import fits
from astropy.wcs import WCS
from astropy.coordinates import SkyCoord
from astropy.time import Time
import astropy.units as u
from astropy.visualization import simple_norm
from datetime import datetime, timedelta

from astroquery.jplhorizons import Horizons
from astroquery.esa.jwst import Jwst

from scipy.ndimage import label

from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

## Useful Functions

In [2]:
def MJDconversion(modifiedJulianDate):
    """Convert a Modified Julian Date into an ISO-formatted date string.

    The value is wrapped in an astropy ``Time`` object using ``format='mjd'``
    and its ``.iso`` representation is returned.
    """
    mjd_time = Time(modifiedJulianDate, format='mjd')
    return mjd_time.iso

In [4]:
def combine_strings(string_list):
    """Join a list of strings into one string separated by ``', '``."""
    separator = ', '
    return separator.join(string_list)

In [7]:
def members_string(members_init_string):
    """Tidy the archive 'members' string for presentation in the Level 3 CSV.

    Every ``'caom:JWST/'`` occurrence is removed and each remaining space is
    turned into ``', '`` so the members read as a comma-separated list.
    """
    without_prefix = members_init_string.replace('caom:JWST/', '')
    return without_prefix.replace(' ', ', ')

In [3]:
def checkDataExists(proposal, observation):
    """Report whether the observation's ``_i2d.fits.gz`` file exists on Datalabs.

    The directory name is built from the first character of ``proposal`` that
    is not ``'0'`` (``proposal`` is iterated character by character, so it is
    expected to be a string).  Returns the result of ``os.path.exists`` on the
    assembled path.
    """
    first_nonzero = next(filter(lambda ch: ch != '0', proposal))
    volume_dir = f"jw0{first_nonzero}"
    candidate_path = f'/data/user/jwst_{volume_dir}/jw0{proposal}/{observation}_i2d.fits.gz'
    return os.path.exists(candidate_path)

In [6]:
class HiddenPrints:
    """Context manager that silences ``print`` output inside its ``with`` block.

    Used around built-in/library calls with noisy print statements.  On entry
    ``sys.stdout`` is pointed at ``os.devnull``; on exit the previous stream is
    put back and the devnull handle opened here is closed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink so __exit__ closes exactly the
        # handle opened here, even if code inside the block rebinds sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable again even if closing the sink fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [10]:
def combine_csv_files(CSV1, CSV2, folder_path):
    """Stack two dataframes and write them out as ``Updated_Level3.csv``.

    The two frames are concatenated with a fresh index, ordered by the
    'Observation' column (ascending) and written, without the index, to
    ``{folder_path}/Updated_Level3.csv``.
    """
    merged = pd.concat([CSV1, CSV2], ignore_index=True)
    ordered = merged.sort_values('Observation', ascending=True)

    destination = f"{folder_path}/Updated_Level3.csv"
    ordered.to_csv(destination, index=False)

In [9]:
def formatPolygon(polyString):
    """Rewrite the archive polygon string as a shapely-compatible WKT polygon.

    The first 8 and last 2 characters of ``polyString`` are discarded and the
    remainder is split on single spaces into a flat list of coordinate values.
    The first coordinate pair is repeated at the end so the ring is closed
    (a 4-corner region becomes 5 points), and the values are emitted pairwise
    as ``"POLYGON ((x1 y1, x2 y2, ...))"``.
    """
    # NOTE(review): the 8/2 character trim presumably matches the archive's
    # polygon prefix/suffix — confirm against a real 'position_bounds_spoly' value.
    values = polyString[8:-2].split(' ')

    # Close the ring by repeating the first vertex
    values += [values[0]]
    values += [values[1]]

    vertex_pairs = []
    for idx in range(0, len(values), 2):
        vertex_pairs.append(f"{values[idx]} {values[idx + 1]}")

    return "POLYGON ((" + ', '.join(vertex_pairs) + "))"

## Archive Query Functions

In [11]:
def queryArchive(volume, readouts, query_filters):
    """Run an ADQL query against the JWST archive and return it as a DataFrame.

    volume        = table name, used as ``jwst.{volume}`` in the FROM clause
    readouts      = list of column names to SELECT
    query_filters = list of condition strings, combined with AND in the WHERE clause

    The result is sorted by 'observationid' and re-indexed from zero.
    """
    # Assemble the ADQL statement piece by piece
    selected_columns = ','.join(readouts)
    where_clause = ' AND '.join(query_filters)
    query_string = f"SELECT {selected_columns} FROM jwst.{volume} WHERE {where_clause}"

    # Launch the asynchronous job and pull the results into pandas
    job = Jwst.launch_job(query_string, async_job=True)
    result_table = job.get_results()
    archive_df = result_table.to_pandas()

    # Order by observation ID
    archive_df = archive_df.sort_values(by=['observationid'])
    return archive_df.reset_index(drop=True)

In [12]:
def level3_Archive_Query(propRange = (1000, 9000), instrumentName = 'MIRI/IMAGE', dataType = 'image', volume = 'archive', additionalFilters = None):
    """
    Query the JWST archive for calibration level 3 observations.

    --- Filter Search Parameters ---
    propRange = two-element sequence containing the starting and ending proposal range to check
    instrumentName = instrument used for observation
    dataType =  observation data type
    volume = volume used for database search
    additionalFilters = optional list of strings containing additional filters to be used in the archive filtering

    Returns the dataframe produced by queryArchive for the selected columns.
    """
    # Defaults are immutable / None so no list object is shared between calls
    extra_filters = [] if additionalFilters is None else list(additionalFilters)

    lowerbound_proposal, upperbound_proposal = propRange

    calLVL = 3

    # Define query filters
    query_filters = [
            f'jwst.{volume}.calibrationlevel = {calLVL}',
            f"jwst.{volume}.dataproducttype = '{dataType}'",
            f"jwst.{volume}.instrument_name = '{instrumentName}'",
            f"jwst.{volume}.proposal_id >= '{lowerbound_proposal}'",
            f"jwst.{volume}.proposal_id <= '{upperbound_proposal}'",
        ] + extra_filters  # Append additional filters if provided

    # Define which archive outputs are of interest
    query_topics = ['proposal_id',  'observationid', 'dataproducttype', 'intent',  'instrument_name',  'energy_bandpassname',
                   'target_moving','position_bounds_spoly','time_bounds_lower','time_bounds_upper','members']

    # Run the search with print output suppressed, return a dataframe with results
    with HiddenPrints():
        level3_ArchiveDF = queryArchive(volume, query_topics, query_filters)

    return level3_ArchiveDF

## Small Body Identification Cone Search

In [5]:
def definePolyEdges(polyString):
    """Return two opposite corners of an axis-aligned box around the image.

    The cone search takes RA and DEC bounds, so the image footprint is reduced
    to the min/max RA and DEC of its polygon vertices, padded by a small buffer.

    polyString = polygon in the "POLYGON ((ra dec, ra dec, ...))" layout

    Returns ``[low_right_corner, up_left_corner]`` as ICRS ``SkyCoord`` objects
    built from (min RA, min DEC) and (max RA, max DEC) respectively, in degrees.
    """
    # NOTE(review): plain min/max on RA does not account for a footprint that
    # straddles RA 0/360 — confirm whether such fields need special handling.

    # Strip the WKT wrapper and flatten to a list of floats: ra, dec, ra, dec, ...
    flattened = polyString.replace("POLYGON ((", "").replace("))", "").replace(", ", " ")
    values = [float(token) for token in flattened.split()]

    ra_values = values[0::2]
    dec_values = values[1::2]

    # Padding added on every side of the image, in degrees
    margin = 0.005

    low_right_corner = SkyCoord(
        min(ra_values) - margin,
        min(dec_values) - margin,
        frame='icrs',
        unit='deg',
    )
    up_left_corner = SkyCoord(
        max(ra_values) + margin,
        max(dec_values) + margin,
        frame='icrs',
        unit='deg',
    )

    return [low_right_corner, up_left_corner]

In [8]:
def expProbeTime(expStartMJD, expEndMJD, probe_time=(1/3)/24):
    """Build the list of times at which an exposure should be cone searched.

    For exposures longer than one probe interval, the exposure is sampled from
    its start in steps of ``probe_time`` (every step strictly before the end),
    so streaking asteroids can be picked up along their track.  Shorter
    exposures are sampled once, at the centre of the exposure.

    expStartMJD = exposure start, Modified Julian Date (days)
    expEndMJD   = exposure end, Modified Julian Date (days)
    probe_time  = spacing between probes in days (default 20 minutes)

    Returns a list of date strings produced by MJDconversion.
    Raises ValueError if ``probe_time`` is not positive.
    """
    if probe_time <= 0:
        raise ValueError("probe_time must be a positive number of days")

    exposure_length = expEndMJD - expStartMJD

    # Compare the exposure *duration* with the probe interval.  (Comparing the
    # duration against an absolute MJD would never be true for real dates.)
    if exposure_length > probe_time:
        probe_list = []
        step = 0
        probe = expStartMJD

        while probe < expEndMJD:
            # Every entry is converted, so the list holds one consistent type
            probe_list.append(MJDconversion(probe))
            step += 1
            # Multiply instead of accumulating to avoid floating point drift
            probe = expStartMJD + step * probe_time

    else:
        # Choose the center of the exposure time
        probe_list = [MJDconversion(expStartMJD + exposure_length / 2)]

    return probe_list

In [13]:
def JWSTposition(obsTime):
    """Return JWST's geocentric state vector at ``obsTime`` for the SBIdent search.

    Follows example 3 from
    https://github.com/bengebre/sbident/blob/main/examples/sbident-examples.ipynb

    obsTime = time object exposing a ``.jd`` attribute (Julian Date), used as
              the Horizons epoch

    Returns ``{'xobs': 'x,y,z,vx,vy,vz'}`` with position in km and velocity
    in km/s, each value formatted as ``.12e``.
    """
    # NOTE: it is likely that this value can be pulled from the image header in the future

    # Conversion factors
    au_to_km = (1 * u.au).to(u.km).value
    seconds_per_day = 86400

    # Probe for the JWST output from JPL Horizons, state vector
    jwst_output = Horizons(id='JWST',location='Geocentric',epochs=obsTime.jd, id_type='id').vectors(refplane='earth')

    # Pull the six state-vector columns out as a plain numpy array
    jwst_output_km = jwst_output[['x', 'y', 'z', 'vx', 'vy', 'vz']].to_pandas().to_numpy()

    # Position (x, y, z): AU -> km
    jwst_output_km[:, :3] *= au_to_km
    # Velocity (vx, vy, vz): AU/day -> km/s.  Both factors are required;
    # dividing by 86400 alone would leave the velocity in AU/s.
    jwst_output_km[:, 3:] *= au_to_km / seconds_per_day

    # Form the xobs dictionary that is the input for SBIdent location argument
    xobs = ','.join([f"{s:.12e}" for s in jwst_output_km[0]])
    return {'xobs': xobs}

In [14]:
def coneSearch(Exptime, Edge1, Edge2):
    """Run the sbident cone search for one time and one pair of image corners.

    Unlike JPL Horizons, the sbident cone search only utilizes 1 fixed time.

    Exptime = time value passed to ``Time`` to build the observation time
    Edge1, Edge2 = the two corners bounding the search region

    Returns the SBIdent ``.results`` object, or ``False`` when both attempts
    fail or the result is an empty list.
    """
    # Observation time and the matching JWST location
    ObsTime = Time(Exptime)
    jwstLocation = JWSTposition(ObsTime)

    # The connection is sometimes interrupted, so the query gets two attempts
    # using 'sbid' from https://github.com/bengebre/sbident
    sbid = False
    failure_notes = ("Failed First Try in SBIDENT", "Failed Second Try in SBIDENT")

    for attempt, note in enumerate(failure_notes):
        try:
            sbid = SBIdent(jwstLocation, ObsTime, [Edge1, Edge2]).results
            break
        except Exception as err:
            logging.info(note)
            logging.info(err)
            if attempt == 0:
                # Short pause before the single retry
                time.sleep(5)

    # Normalise an empty list to False
    if isinstance(sbid, list) and not sbid:
        logging.info("SBIDENT Output was empty")
        sbid = False

    return sbid

## Level 3 Asteroid Detection

In [15]:
def level3_asteroid_names(expStartMJD, expEndMJD, polyStringFormatted):
    """Return the asteroid names found by the cone search over a level 3 image.

    expStartMJD, expEndMJD = exposure start/end passed to expProbeTime
    polyStringFormatted    = image footprint passed to definePolyEdges

    Every probe time is cone searched; from each entry of the result's
    'Object name' column the text after the last '(' is kept with ')' removed.
    Returns the unique values joined by ', ' in sorted order (so repeated runs
    give the same string), or '' when nothing was found.
    """
    # Define the image bounds from the polygon
    poly_corners = definePolyEdges(polyStringFormatted)

    # Define the probe times for the individual searches
    probeList = expProbeTime(expStartMJD, expEndMJD)

    # A set keeps only uniquely contained asteroids across all probe times
    unique_asteroids = set()

    for probe in probeList:
        sbid_results = coneSearch(probe, *poly_corners)

        # coneSearch returns False when nothing was found or the query failed
        if sbid_results:
            for object_name in sbid_results['Object name']:
                unique_asteroids.add(object_name.split('(')[-1].replace(')', ''))

    # Sorting makes the output independent of set iteration order;
    # joining an empty collection yields '' (no asteroids found)
    return ', '.join(sorted(unique_asteroids))

In [16]:
def level3_asteroid_search(row):
    """Run the asteroid search for one row of the level 3 archive dataframe.

    row = mapping/Series providing 'position_bounds_spoly',
          'time_bounds_lower' and 'time_bounds_upper'

    Returns the comma-separated asteroid string from level3_asteroid_names
    ('' when none are found).
    """
    # Only the footprint and the exposure bounds are needed for the search
    polygonString = row['position_bounds_spoly']
    expStartMJD = row['time_bounds_lower']
    expEndMJD = row['time_bounds_upper']

    # Convert the format of the polygon string
    poly_string_formatted = formatPolygon(str(polygonString))

    # Perform the cone search over the exposure
    return level3_asteroid_names(expStartMJD, expEndMJD, poly_string_formatted)

## Main Function

In [22]:
def main(propRange, instrumentName, dataType, volume, additionalFilters=None, level3_CSV_Input_Path = ''):
    """Run the incremental JWST known-asteroid search for level 3 observations.

    propRange             = [start, end] proposal range for the archive query
    instrumentName        = instrument used for observation
    dataType              = observation data type
    volume                = volume used for database search
    additionalFilters     = optional list of extra archive filter strings
    level3_CSV_Input_Path = path to the previously generated Level 3 CSV

    Observations already listed in the input CSV's 'Observation' column are
    skipped.  For the remaining ones the function writes, next to the input CSV:
      - LVL3_Update_log_<timestamp>.txt   (run log)
      - Level3_New_Obs.csv                (all newly searched observations)
      - Level3_New_Obs_Asteroids.csv      (new observations with an asteroid)
      - Updated_Level3.csv                (previous CSV combined with the new rows)

    Returns None.
    """
    # Avoid a shared mutable default
    if additionalFilters is None:
        additionalFilters = []

    # Outputs are written alongside the input CSV (or the working directory)
    if '/' in level3_CSV_Input_Path:
        folder_path = os.path.dirname(level3_CSV_Input_Path)
    else:
        folder_path = './'
    current_time = datetime.now()
    timestamp = current_time.strftime("%Y%m%d_%H%M%S")
    log_filename = f"{folder_path}/LVL3_Update_log_{timestamp}.txt"

    # Configure logging to write to a file
    logging.basicConfig(
        filename=log_filename,  # Log file
        level=logging.INFO,     # Logging level
        force=True,             # Replace handlers left on the root logger by an earlier run
        format="%(message)s"    # Log message format
    )

    # Log messages (these will go to the file only)
    logging.info(f"This run was performed at {timestamp}\n")

    ##### ----- LEVEL 3 ----- #####

    # The update needs the previously generated Level 3 CSV to compare against
    if level3_CSV_Input_Path == '':
        print('Missing previously generated Level 3 CSV')
        return

    start_time = time.time()
    logging.info('Performing Level 3 Search')

    # Read in previous Level 3 CSV
    Level_3_Archive_CSV = pd.read_csv(level3_CSV_Input_Path)

    # Extract the 'Observation' column as a list
    observation_list = Level_3_Archive_CSV['Observation'].tolist()

    # Perform an archive search for level 3 observations meeting the qualifications defined in the main function
    Level_3_Archive_DF = level3_Archive_Query(propRange, instrumentName, dataType, volume, additionalFilters)

    # Only looking at observations not previously scanned.
    # .copy() gives an independent frame so the new column below is not
    # assigned onto a slice of the query result.
    already_scanned = Level_3_Archive_DF['observationid'].isin(observation_list)
    Filtered_Level_3_Archive_DF = Level_3_Archive_DF[~already_scanned].copy()

    if Filtered_Level_3_Archive_DF.empty:
        print(f"Archive does not contain more observations then previously scanned, in range {propRange}")
        logging.info(f'Archive does not contain more observations then previously scanned, in range {propRange}')
        return

    # Iterate through the level 3 archive data and check images for asteroids
    Filtered_Level_3_Archive_DF['Asteroids'] = Filtered_Level_3_Archive_DF.progress_apply(level3_asteroid_search, axis=1)

    # Prepare Data Frame for CSV presentation
    Level_3_Archive_CSV_new = (
        Filtered_Level_3_Archive_DF.rename(columns={
            'proposal_id': 'Proposal',
            'observationid': 'Observation',
            'dataproducttype': 'Data Type',
            'intent': 'Intent',
            'instrument_name': 'Instrument',
            'energy_bandpassname': 'Filter',
            'target_moving': 'Moving',
            'position_bounds_spoly': 'Polygon Boundary',
            'time_bounds_lower': 'Exposure Start',
            'time_bounds_upper': 'Exposure End',
            'members': 'Level 2 Members'
        })

        .assign(**{
            # Convert from MJD to a date string
            'Exposure Start': lambda df: df['Exposure Start'].apply(MJDconversion),
            'Exposure End': lambda df: df['Exposure End'].apply(MJDconversion),
            # Adds Yes/No if the observation is 'moving'
            'Moving': lambda df: df['Moving'].replace({0: 'No', 1: 'Yes'}),
            # Track if the image is on Datalabs
            'Datalabs': lambda df: df.apply(lambda row: checkDataExists(row['Proposal'], row['Observation']),axis=1),
            # Configure the 'members' format
            'Level 2 Members': lambda df: df['Level 2 Members'].apply(members_string)})
        .sort_values(by='Observation', ascending=True)   # Sort by Observation ID
        .reset_index(drop=True))                         # Reset the index

    # Save CSV
    Level_3_Archive_CSV_new.to_csv(f"{folder_path}/Level3_New_Obs.csv",index=False)

    # Reduce dataframe to only observations with asteroids, save CSV
    Level_3_Archive_CSV_Only_Asteroids = Level_3_Archive_CSV_new[Level_3_Archive_CSV_new['Asteroids'].str.contains(r'[a-zA-Z]', na=False)].copy()
    Level_3_Archive_CSV_Only_Asteroids.to_csv(f"{folder_path}/Level3_New_Obs_Asteroids.csv",index=False)

    logging.info(f'Finished the Level 3 Search:')
    logging.info(f'From {len(Level_3_Archive_CSV_new)} Searched Observations')
    logging.info(f'{len(Level_3_Archive_CSV_Only_Asteroids)} Observations contained atleast 1 Asteroid')
    logging.info(f'This took {round((time.time()- start_time)/60)} minutes\n')

    # Write the combined CSV first, then log the file name that is actually written
    combine_csv_files(Level_3_Archive_CSV, Level_3_Archive_CSV_new, folder_path)
    logging.info(f'Updated the new CSV, saved to {folder_path}/Updated_Level3.csv\n')
    

## Run The Pipeline

In [23]:
%%time

# Proposal ID window to scan
starting_proposal, ending_proposal = 1000, 7000
proposal_range = [starting_proposal, ending_proposal]

# Archive selection: instrument, data product type and archive table
instrument, data_type, volume = 'MIRI/IMAGE', 'image', 'archive'

# Search only the observations missing from the previously generated CSV
main(
    proposal_range,
    instrument,
    data_type,
    volume,
    level3_CSV_Input_Path='Results/Level3_Asteroid_Search_Full.csv',
)

#Kernel keeps restarting at 1177, 1161
# check the uncertainty in position for far and close (1727 streak accidentally picks up a different source)2015 XK95

0        jw01022-o017_t001_miri_f770w
1        jw01022-o018_t001_miri_f770w
2        jw01022-o019_t001_miri_f770w
3        jw01022-o020_t001_miri_f770w
4        jw01022-o021_t001_miri_f770w
                    ...              
6008     jw06811-o001_t001_miri_f770w
6009    jw06838-o004_t001_miri_f1280w
6010    jw06838-o004_t001_miri_f1500w
6011    jw06838-o004_t001_miri_f1800w
6012    jw06838-o004_t001_miri_f2100w
Name: observationid, Length: 6013, dtype: object


  0%|          | 0/14 [00:00<?, ?it/s]

CPU times: user 1.13 s, sys: 138 ms, total: 1.27 s
Wall time: 36min 33s
