In [1]:
import os
from pathlib import Path
from pprint import pprint
from urllib.parse import urljoin

import numpy as np
import padre_meddea
import pandas as pd
import requests
from astropy.io import fits
from astropy.time import Time
from astropy.table import Table
from bs4 import BeautifulSoup
from padre_meddea.calibration.calibration import process_file
from padre_meddea.io.fits_tools import (  # New Stuff
    CUSTOM_ATTRS_PATH,
)

from solarnet_metadata.schema import SOLARNETSchema
from solarnet_metadata.validation import validate_file, validate_header
from tqdm.notebook import tqdm

In [2]:
# TODO: UPDATE THIS WITH YOUR OWN LOCAL PATH IF YOU WANT
base_path = Path("/Users/andrewrobbertz/__SOC_CODE__/_data_/PADRE/MEDDEA")
experiment_path = base_path / "L1"

if not experiment_path.exists():
    os.makedirs(experiment_path)

## Download the Pipeline-Generated L1 Files

In [3]:
def download_from_url(base_url, experiment_path, recurse=True, file_extension='.fits'):
    """
    Recursively download files from a URL, preserving directory structure.
    
    Args:
        base_url (str): The URL to download files from
        experiment_path (Path): The local path to save files to
        recurse (bool): Whether to recursively download from subdirectories
        file_extension (str): File extension to filter by (e.g., '.fits')
    """
    try:
        print(f"Accessing {base_url}")
        response = requests.get(base_url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        
        parent_folder = BeautifulSoup(response.text, 'html.parser')
        
        # Create the local directory if it doesn't exist
        os.makedirs(experiment_path, exist_ok=True)
        
        for link in parent_folder.find_all('a'):
            href = link.get('href')
            
            # Skip parent directory links and query parameters
            if not href or href.startswith('?') or href.startswith('/') or href == '../':
                continue
                
            # Create full URL for the link
            full_url = urljoin(base_url, href)
            
            # If it's a directory and we're recursing
            if href.endswith('/') and recurse:
                # Remove trailing slash for local directory name
                dir_name = href[:-1]
                local_dir = experiment_path / dir_name
                
                # Recursively download from this directory
                download_from_url(full_url, local_dir, recurse, file_extension)
            
            # If it's a file with the desired extension
            elif href.lower().endswith(file_extension.lower()):
                print(f"Downloading {full_url}")
                local_path = experiment_path / href
                
                # Create parent directories if they don't exist
                os.makedirs(local_path.parent, exist_ok=True)
                
                # Download the file
                with open(local_path, 'wb') as file:
                    file_response = requests.get(full_url)
                    file_response.raise_for_status()
                    file.write(file_response.content)
                    
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {base_url}: {e}")
    except Exception as e:
        print(f"Error processing {base_url}: {e}")

In [4]:
### L1 Files
base_url = 'https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/'
download_from_url(base_url, experiment_path, recurse=True, file_extension='.fits')

Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/housekeeping/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/housekeeping/2025/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/housekeeping/2025/05/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/housekeeping/2025/05/04/
Downloading https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/housekeeping/2025/05/04/padre_meddea_l1_housekeeping_20250504T000000_v0.1.0.fits
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/spectrum/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/spectrum/2025/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/spectrum/2025/05/
Accessing https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/spectrum/2025/05/04/
Downloading https://umbra.nascom.nasa.gov/padre/padre-meddea/l1/spectrum/2025/05/04/padre_meddea_l1_spectrum_20250504T000000_v0.1.0.fits


In [5]:
processed_files = list(experiment_path.rglob('*.fits'))
print(f"Found {len(processed_files)} files in {experiment_path}")

for file in processed_files:
    print(f" - {file}")

Found 2 files in /Users/andrewrobbertz/__SOC_CODE__/_data_/PADRE/MEDDEA/L1
 - /Users/andrewrobbertz/__SOC_CODE__/_data_/PADRE/MEDDEA/L1/housekeeping/2025/05/04/padre_meddea_l1_housekeeping_20250504T000000_v0.1.0.fits
 - /Users/andrewrobbertz/__SOC_CODE__/_data_/PADRE/MEDDEA/L1/spectrum/2025/05/04/padre_meddea_l1_spectrum_20250504T000000_v0.1.0.fits


## Check for SOLARNET Compliance in the L1 Data

In [6]:
# Create Custome PADRE SOLARNET schema
padre_schema = SOLARNETSchema(schema_layers=[CUSTOM_ATTRS_PATH])

files = []
all_findings = []
for processed_file in processed_files:
    # Validate the first Processed File against the SOALRNET schema
    file_findings = validate_file(
        file_path=processed_file,
        warn_empty_keyword=True,
        warn_no_comment=False,
        warn_data_type=True,
        schema=padre_schema,
    )
    all_findings.extend(file_findings)
    files.extend([processed_file.name] * len(file_findings))



In [7]:
df = pd.DataFrame([files, all_findings]).T
df.columns = ["file", "findings"]

# Group by findings and get unique filenames for each finding
findings_summary = df.groupby('findings')['file'].unique().reset_index()

# Optionally, add a count of files for each finding
findings_summary['file_count'] = findings_summary['file'].apply(len)

# Sort by most common findings first
findings_summary = findings_summary.sort_values('file_count', ascending=True).reset_index(drop=True)

In [8]:
pprint(findings_summary["findings"].values)

array(["Observation Header 2: Keyword 'TNULL4' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 2: Keyword 'TNULL6' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 2: Keyword 'TNULL5' not found in the schema. Cannot Validate Data Type.",
       "Primary Header: FITS card for 'PARENTXT' exceeds 80 characters (length: 509).",
       "Observation Header 2: Keyword 'TNULL3' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 2: Keyword 'TNULL2' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 2: Keyword 'JDREF' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 2: Keyword 'TREFPOS' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 2: Keyword 'TUNIT1' not found in the schema. Cannot Validate Data Type.",
       "Observation Header 1: Keyword 'TFIELDS' not found in the schema. Cannot Validate Data Type.",
  

## Explore L1 Concat Files

### Housekeeping Files

In [9]:
processed_files[0].name

'padre_meddea_l1_housekeeping_20250504T000000_v0.1.0.fits'

In [10]:
with fits.open(processed_files[0]) as hdul:
    info = hdul.info()
    p_hdr = hdul[0].header
    # Create Astripy Table from the BinTable HDU with the provenance information
    # Assuming the provenance information is in the 4th HDU
    provenance = Table(hdul[3].data)
pprint(info)
pprint(p_hdr)

Filename: /Users/andrewrobbertz/__SOC_CODE__/_data_/PADRE/MEDDEA/L1/housekeeping/2025/05/04/padre_meddea_l1_housekeeping_20250504T000000_v0.1.0.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  PRIMARY       1 PrimaryHDU      38   ()      
  1  HK            1 BinTableHDU     81   3496R x 16C   [J, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I]   
  2  READ          1 BinTableHDU     59   1R x 6C   [2D, J, J, I, I, I]   
  3  PROVENANCE    1 BinTableHDU     19   2R x 3C   [60A, 23A, 23A]   
None
SIMPLE  =                    T / conforms to FITS standard                      
BITPIX  =                    8 / array data type                                
NAXIS   =                    0 / number of array dimensions                     
EXTEND  =                    T                                                  
DATE    = '2025-06-23T16:11:30.506' / File creation date in UTC                 
AUTHOR  = 'Steven D. Christe'  / Who designed the observation             

In [12]:
provenance

FILENAME,DATE_BEG,DATE_END
str60,str23,str23
padre_meddea_l0test_housekeeping_20250504T055138_v0.1.0.fits,2025-05-04T05:51:38.000,2025-05-04T05:51:38.000
padre_meddea_l0test_housekeeping_20250504T153118_v0.1.0.fits,2025-05-04T15:31:18.000,2025-05-04T15:31:18.000


### Spectrum Files

In [13]:
processed_files[1].name

'padre_meddea_l1_spectrum_20250504T000000_v0.1.0.fits'

In [14]:
with fits.open(processed_files[1]) as hdul:
    info = hdul.info()
    p_hdr = hdul[0].header
    # Create Astripy Table from the BinTable HDU with the provenance information
    # Assuming the provenance information is in the 4th HDU
    provenance = Table(hdul[3].data)
pprint(info)
pprint(p_hdr)

Filename: /Users/andrewrobbertz/__SOC_CODE__/_data_/PADRE/MEDDEA/L1/spectrum/2025/05/04/padre_meddea_l1_spectrum_20250504T000000_v0.1.0.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  PRIMARY       1 PrimaryHDU      38   ()      
  1  SPEC          1 ImageHDU        30   (512, 24, 2596)   float64   
  2  PKT           1 BinTableHDU     49   2596R x 5C   [J, J, 24I, 24I, I]   
  3  PROVENANCE    1 BinTableHDU     19   8R x 3C   [56A, 23A, 23A]   
None
SIMPLE  =                    T / conforms to FITS standard                      
BITPIX  =                    8 / array data type                                
NAXIS   =                    0 / number of array dimensions                     
EXTEND  =                    T                                                  
DATE    = '2025-06-23T16:11:39.285' / File creation date in UTC                 
AUTHOR  = 'Steven D. Christe'  / Who designed the observation                   
CREATOR = 'padre_meddea'       / Na

In [15]:
provenance

FILENAME,DATE_BEG,DATE_END
str56,str23,str23
padre_meddea_l0test_spectrum_20250504T070411_v0.1.0.fits,2025-05-04T07:04:11.349,2025-05-04T08:15:11.363
padre_meddea_l0test_spectrum_20250504T081521_v0.1.0.fits,2025-05-04T08:15:21.363,2025-05-04T09:26:41.378
padre_meddea_l0test_spectrum_20250504T103811_v0.1.0.fits,2025-05-04T10:38:11.392,2025-05-04T11:49:11.406
padre_meddea_l0test_spectrum_20250504T114921_v0.1.0.fits,2025-05-04T11:49:21.406,2025-05-04T13:00:31.420
padre_meddea_l0test_spectrum_20250504T130041_v0.1.0.fits,2025-05-04T13:00:41.420,2025-05-04T14:12:01.433
padre_meddea_l0test_spectrum_20250504T141211_v0.1.0.fits,2025-05-04T14:12:11.433,2025-05-04T15:23:21.447
padre_meddea_l0test_spectrum_20250504T152331_v0.1.0.fits,2025-05-04T15:23:31.447,2025-05-04T15:31:01.449
padre_meddea_l0test_spectrum_20250504T153111_v0.1.0.fits,2025-05-04T15:31:11.449,2025-05-04T15:33:09.809
