In [None]:
import os

# NOTE(review): set before `import swxsoc` below; presumably swxsoc reads this
# variable at import time -- confirm before reordering.
os.environ["SWXSOC_MISSION"] = "padre"

import urllib
import urllib.request
from collections import OrderedDict
from datetime import datetime, timedelta
from html.parser import HTMLParser
from pprint import pprint
from urllib.parse import urljoin

import swxsoc
from astropy.time import Time
from padre_meddea.util.util import parse_science_filename
from sunpy.extern.parse import parse
from sunpy.net import Fido
from sunpy.net import attrs as a
from sunpy.net.attr import SimpleAttr
from sunpy.net.dataretriever import GenericClient, QueryResponse

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Record the swxsoc version this notebook was executed with.
swxsoc.__version__

'0.2.2'

In [3]:
# Record the sunpy version this notebook was executed with.
import sunpy

sunpy.__version__

'6.0.6'

In [4]:
from typing import List


class DataType(SimpleAttr):
    """
    Search attribute selecting which kind of data product to look for.

    The class adds nothing to `SimpleAttr`; it exists so that queries can name
    a data type explicitly (for example ``DataType.spectrum``, as used in the
    search cells of this notebook).

    Attributes
    ----------
    value : str
        The data type value.
    """


class PADREClient(GenericClient):
    """
    Data source for searching and fetching PADRE files from an HTTP file server.

    A search builds one directory URL per instrument / level / (data type) /
    day below `baseurl`, reads the HTML listing of each directory, and turns
    every linked ``.fits`` or ``.dat`` file into one result row.

    Directory layout assumed by `_get_search_paths`:

    * raw data:     ``padre/padre-<instrument>/raw/YYYY/MM/DD/``
    * other levels: ``padre/padre-<instrument>/<level>/<data_type>/YYYY/MM/DD/``
    """

    # Root of the file server; every search path is joined onto this URL.
    baseurl = "https://umbra.nascom.nasa.gov/"

    @classmethod
    def register_values(cls):
        """
        Return the attribute values this client supports.

        Returns
        -------
        dict
            Maps each attribute class to a list of ``(value, description)``
            tuples.
        """
        adict = {
            a.Provider: [("sdac", "The Solar Data Analysis Center.")],
            a.Source: [
                ("padre", "(The Solar Polarization and Directivity X-Ray Experiment)")
            ],
            a.Instrument: [
                (
                    "meddea",
                    "Measuring Directivity to Determine Electron Anisotropy (MeDDEA)",
                ),
            ],
            DataType: [
                ("spectrum", "Spectrum data from MeDDEA."),
                ("photon", "Photon data from MeDDEA."),
                ("housekeeping", "Housekeeping data from MeDDEA."),
            ],
            a.Level: [
                ("raw", "Raw Binary CCSDS Packet data"),
                ("l0", "Raw data, converted to FITS, not in physical units."),
                # NOTE(review): l1 repeats the "not in physical units" wording
                # used for l0 -- confirm this description is intended.
                ("l1", "Processed data, not in physical units."),
            ],
        }
        return adict

    def search(self, *args, **kwargs) -> QueryResponse:
        """
        Query this client for a list of results.

        Parameters
        ----------
        \\*args: `tuple`
            `sunpy.net.attrs` objects representing the query.
        \\*\\*kwargs: `dict`
             Any extra keywords to refine the search.

        Returns
        -------
        A `QueryResponse` instance containing the query result.
        """
        matchdict = self._get_match_dict(*args, **kwargs)
        # Extract matchdict parameters
        instruments = matchdict.get("Instrument")
        levels = matchdict.get("Level")
        data_types = matchdict.get("DataType")
        start_time = matchdict.get("Start Time")
        end_time = matchdict.get("End Time")

        # Build every directory that could hold matching files
        search_paths = self._get_search_paths(
            instruments, levels, data_types, start_time, end_time
        )
        swxsoc.log.debug(f"Search paths: {search_paths}")

        # Collect the file URLs listed in each directory
        all_files = []
        for path in search_paths:
            url = urljoin(self.baseurl, path)
            swxsoc.log.debug(f"Searching HTTP directory: {url}")
            all_files.extend(self._crawl_directory(url))

        # Short code embedded in raw file names -> registered DataType value
        shortname_to_datatype = {
            "A0": "photon",
            "A2": "spectrum",
            "U8": "housekeeping",
        }

        # Turn every file URL into one result row
        metalist = []
        for file_url in all_files:
            swxsoc.log.debug(f"Processing file URL: {file_url}")
            info = parse_science_filename(file_url)

            # Raw directories are not split by data type, so the data type has
            # to be recovered from the file name and filtered here.
            if info.get("level") == "raw":
                # Look at the file name only, so that the directory part of
                # the URL can never produce a false short-name match.
                filename = file_url.rsplit("/", 1)[-1]
                for shortname, longname in shortname_to_datatype.items():
                    if shortname in filename:
                        info["descriptor"] = longname
                # .get: a raw file with no recognisable data type is skipped
                # instead of aborting the whole search with a KeyError.
                if info.get("descriptor") not in data_types:
                    continue  # Skip files with wrong DataType

            rowdict = OrderedDict()
            rowdict["Instrument"] = info.get("instrument", "unknown")
            rowdict["Mode"] = info.get("mode", "unknown")
            rowdict["Test"] = info.get("test", False)
            rowdict["Time"] = info.get("time", "unknown")
            rowdict["Level"] = info.get("level", "unknown")
            rowdict["Version"] = info.get("version", "unknown")
            rowdict["Descriptor"] = info.get("descriptor", "unknown")
            rowdict["url"] = file_url  # Key
            metalist.append(rowdict)

        return QueryResponse(metalist, client=self)

    def _get_search_paths(
        self,
        instruments: List[str] = None,
        levels: List[str] = None,
        data_types: List[str] = None,
        start_time: Time = None,
        end_time: Time = None,
    ):
        """
        Generate HTTP paths to search based on query parameters.

        Although every parameter defaults to ``None``, the body iterates over
        the lists and reads ``.datetime`` from both times, so callers must
        supply all of them (as `search` does).

        Parameters
        ----------
        instruments : list of str
            Instrument names, e.g. ``["meddea"]``.
        levels : list of str
            Data levels; ``"raw"`` is laid out without a data-type directory.
        data_types : list of str
            Data types; only used for non-raw levels.
        start_time, end_time : astropy.time.Time
            Inclusive time range; one path is produced per calendar day.

        Returns
        -------
        list of str
            Relative directory paths, each ending in ``/``.
        """
        paths = []

        # Mission Name
        mission = "padre"

        time_paths = self._generate_time_paths(start_time, end_time)
        # Combine all path components
        for instrument in instruments:
            for level in levels:
                if level == "raw":
                    # For raw data, do not include data type in the path
                    for time_path in time_paths:
                        paths.append(
                            f"{mission}/{mission}-{instrument}/{level}/{time_path}/"
                        )
                else:
                    # For other levels, include data type in the path
                    for data_type in data_types:
                        for time_path in time_paths:
                            paths.append(
                                f"{mission}/{mission}-{instrument}/{level}/{data_type}/{time_path}/"
                            )
        return paths

    def _generate_time_paths(self, start_time: Time, end_time: Time):
        """
        Generate all year/month/day path components between start_time and end_time.

        Every calendar day touched by the range is included, whatever the
        time of day of either end point.

        Parameters
        ----------
        start_time : astropy.time.Time
            Start time in ISO format (e.g., '2025-05-04')
        end_time : astropy.time.Time
            End time in ISO format (e.g., '2025-07-07')

        Returns
        -------
        list
            List of path strings in format 'YYYY/MM/DD'
        """
        # Work on calendar dates. Comparing full datetimes while stepping by
        # 24 hours would drop the final day whenever the start time of day is
        # later than the end time of day.
        first_day = start_time.datetime.date()
        last_day = end_time.datetime.date()

        time_paths = []

        # Iterate through each day in the range (inclusive at both ends)
        current_day = first_day
        while current_day <= last_day:
            # Format as YYYY/MM/DD
            time_paths.append(
                f"{current_day.year}/{current_day.month:02d}/{current_day.day:02d}"
            )

            # Move to next day
            current_day += timedelta(days=1)

        swxsoc.log.debug(
            f"Generated {len(time_paths)} time paths from {start_time} to {end_time}"
        )
        return time_paths

    def _crawl_directory(self, url):
        """
        Directory crawler using only standard library.

        Fetches the HTML page at ``url`` and returns the absolute URL of every
        link whose href ends in ``.fits`` or ``.dat`` (case-insensitive). The
        crawl is not recursive.

        Parameters
        ----------
        url : str
            URL of the directory listing to read.

        Returns
        -------
        list of str
            Absolute file URLs. Empty when the page cannot be fetched or
            parsed; the error is logged at debug level and not raised.
        """

        class LinkParser(HTMLParser):
            """Collect the href value of every <a> tag fed to the parser."""

            def __init__(self):
                super().__init__()
                self.links = []

            def handle_starttag(self, tag, attrs):
                if tag == "a":
                    for attr, value in attrs:
                        if attr == "href":
                            self.links.append(value)

        files = []
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode("utf-8")

            parser = LinkParser()
            parser.feed(html)

            for href in parser.links:
                # Skip empty hrefs, parent directory links and query parameters
                if not href or href.startswith("?") or href == "../":
                    continue

                full_url = urljoin(url, href)

                # Only accept links that resolve to somewhere under baseurl
                if not full_url.startswith(self.baseurl):
                    continue

                if href.lower().endswith((".fits", ".dat")):
                    files.append(full_url)

            return files
        except Exception as e:
            # Best effort: any failure for this directory is logged and
            # treated as "no files" so the remaining directories are still
            # searched.
            swxsoc.log.debug(f"Error processing {url}: {e}")
            return []

In [5]:
# Display Fido's table of available clients.
# NOTE(review): the recorded output does not list PADREClient -- confirm the
# table is merely truncated and the client defined above is registered.
Fido

Client,Description
CDAWEBClient,Provides access to query and download from the Coordinated Data Analysis Web (CDAWeb).
ADAPTClient,Provides access to the ADvanced Adaptive Prediction Technique (ADAPT) products of the National Solar Observatory (NSO).
EVEClient,Provides access to Level 0CS Extreme ultraviolet Variability Experiment (EVE) data.
GBMClient,Provides access to data from the Gamma-Ray Burst Monitor (GBM) instrument on board the Fermi satellite.
XRSClient,Provides access to several GOES XRS files archive.
SUVIClient,Provides access to data from the GOES Solar Ultraviolet Imager (SUVI).
GONGClient,Provides access to the Magnetogram products of NSO-GONG synoptic Maps.
LYRAClient,Provides access to the LYRA/Proba2 data archive.
NOAAIndicesClient,Provides access to the NOAA solar cycle indices.
NOAAPredictClient,Provides access to the NOAA SWPC predicted sunspot Number and 10.7 cm radio flux values.


## Test Getting Raw, Level 0, and Level 1 Data All Together

In [6]:
# Query spectrum data without a Level constraint; the recorded results
# contain both raw and l0 rows.
results = Fido.search(
    a.Time("2025-05-01", "2025-05-05") & a.Instrument.meddea & DataType.spectrum
)
results

Instrument,Mode,Test,Time,Level,Version,Descriptor
str6,object,bool,Time,str3,object,str8
meddea,,False,2025-05-04 07:04:26,raw,,spectrum
meddea,,False,2025-05-04 08:15:36,raw,,spectrum
meddea,,False,2025-05-04 10:38:26,raw,,spectrum
meddea,,False,2025-05-04 11:49:36,raw,,spectrum
meddea,,False,2025-05-04 13:00:56,raw,,spectrum
meddea,,False,2025-05-04 14:12:26,raw,,spectrum
meddea,,False,2025-05-04 15:23:46,raw,,spectrum
meddea,,False,2025-05-04 15:31:20,raw,,spectrum
meddea,,True,2025-05-04 07:04:11,l0,0.1.0,spectrum
meddea,,True,2025-05-04 08:15:21,l0,0.1.0,spectrum


## Test Getting All Level 1 Data Across All Data Types

In [7]:
# Query level l1 without a DataType constraint; the recorded results contain
# spectrum and housekeeping rows.
results = Fido.search(
    a.Time("2025-05-01", "2025-05-05") & a.Instrument.meddea & a.Level.l1
)
results

Instrument,Mode,Test,Time,Level,Version,Descriptor
str6,object,bool,Time,str2,str5,str12
meddea,,False,2025-05-04T00:00:00.000,l1,0.1.0,spectrum
meddea,,False,2025-05-04T00:00:00.000,l1,0.1.0,housekeeping


## Test Getting L1 Photon Data (Does not exist yet)

In [8]:
# Query level l1 restricted to photon data. Per the section heading, this
# product does not exist yet.
results = Fido.search(
    a.Time("2025-05-01", "2025-05-05")
    & a.Instrument.meddea
    & a.Level.l1
    & DataType.photon
)
results

## Test Getting all RAW Data

In [9]:
# Query raw-level files without a DataType constraint.
results = Fido.search(
    a.Time("2025-05-01", "2025-05-05") & a.Instrument.meddea & a.Level.raw
)
results

Instrument,Mode,Test,Time,Level,Version,Descriptor
str6,object,bool,Time,str3,object,str12
meddea,,False,2025-05-04 05:51:33,raw,,photon
meddea,,False,2025-05-04 07:37:54,raw,,photon
meddea,,False,2025-05-04 08:03:35,raw,,photon
meddea,,False,2025-05-04 08:18:36,raw,,photon
meddea,,False,2025-05-04 08:32:39,raw,,photon
meddea,,False,2025-05-04 08:46:10,raw,,photon
meddea,,False,2025-05-04 08:59:36,raw,,photon
meddea,,False,2025-05-04 09:39:11,raw,,photon
meddea,,False,2025-05-04 09:51:22,raw,,photon
...,...,...,...,...,...,...


## Test Getting all RAW Housekeeping Data

In [10]:
# Query raw-level files restricted to housekeeping data. `results` from this
# cell is what the fetch cell below downloads.
results = Fido.search(
    a.Time("2025-05-01", "2025-05-05")
    & a.Instrument.meddea
    & a.Level.raw
    & DataType.housekeeping
)
results

Instrument,Mode,Test,Time,Level,Version,Descriptor
str6,object,bool,Time,str3,object,str12
meddea,,False,2025-05-04 05:51:34,raw,,housekeeping
meddea,,False,2025-05-04 15:31:21,raw,,housekeeping


In [11]:
import tempfile

# Download the files from the previous search into a throwaway directory.
# TemporaryDirectory removes the directory and its contents when the `with`
# block exits, so the paths displayed by `downloaded_files` below no longer
# exist on disk; this cell only demonstrates that the fetch itself works.
with tempfile.TemporaryDirectory() as temp_dir:
    downloaded_files = Fido.fetch(results, path=temp_dir)
downloaded_files

Files Downloaded:   0%|          | 0/2 [00:00<?, ?file/s]
[A
Files Downloaded: 100%|██████████| 2/2 [00:00<00:00,  7.44file/s]


<parfive.results.Results object at 0x1163d5030>
['/var/folders/5l/_5r0pdg15fxg1_rkgmd3c1dm0000gn/T/tmph24n2_d7/PADREMDU8_250504055134.DAT', '/var/folders/5l/_5r0pdg15fxg1_rkgmd3c1dm0000gn/T/tmph24n2_d7/PADREMDU8_250504153121.DAT']