In [None]:
import xarray as xr
import os
from tqdm import tqdm
import normalizer as norm
import zipfile

In [None]:
def get_files(directory):
    """Recursively list every file found under ``directory``.

    Args:
        directory: Root folder handed to ``os.walk``.

    Returns:
        list[str]: One entry per file, each built by joining the folder
        reported by ``os.walk`` with the file name. Order follows
        ``os.walk``; the list is empty when nothing is found.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

In [None]:
def get_night_orbit_files(filelist):
    """Keep only the paths whose file name marks them as orbit type ``1``.

    The file name (last path component) is split on ``"_"``; the last
    character of the seventh field (index 6) is parsed as an integer and the
    path is kept when that digit equals 1, which this function treats as a
    night orbit.

    Args:
        filelist: Iterable of file paths.

    Returns:
        list[str]: The kept paths, in input order.

    Notes:
        Files whose name has fewer than seven ``"_"``-separated fields, or
        whose seventh field does not end in a digit, are skipped rather than
        aborting the whole scan (stray files in the data tree would otherwise
        raise ``IndexError``/``ValueError``).
        NOTE(review): ``remove_nan_files`` treats names starting with
        ``CSES_HEP_DDD`` as a different layout - confirm that field 6 still
        carries the orbit number for those files.
    """
    night_files = []

    for path in tqdm(filelist, desc="Removing day orbit files"):
        # os.path.basename honours the platform separator, matching the
        # os.path.join used when the paths were collected.
        fields = os.path.basename(path).split("_")
        try:
            orbit = int(fields[6][-1])
        except (IndexError, ValueError):
            # Not a recognisable orbit file name: cannot be a night orbit.
            continue

        if orbit == 1:
            night_files.append(path)

    return night_files

In [None]:
def get_complete_files(filelist, min_duration=2000):
    """Keep only the files whose time axis spans more than ``min_duration``.

    Each path is opened with ``xr.open_zarr``. Timestamps are read from the
    ``VERSE_TIME`` variable when it exists; otherwise every flattened
    ``UTCTime`` entry is converted with ``norm.cses_to_unix``. A file is kept
    when the absolute difference between its last and first timestamp is
    strictly greater than ``min_duration``.

    Args:
        filelist: Iterable of dataset paths.
        min_duration: Minimum first-to-last span, in the units of the
            timestamps (presumably seconds - TODO confirm). Defaults to 2000,
            the previously hard-coded threshold.

    Returns:
        list[str]: The kept paths, in input order.

    Notes:
        Paths that raise ``zipfile.BadZipFile`` on open are skipped, as are
        files with an empty time axis (no span can be computed).
    """
    complete = []

    for path in tqdm(filelist, desc="Removing incomplete files"):
        try:
            dataset = xr.open_zarr(path)
        except zipfile.BadZipFile:
            continue

        # Close each dataset once its timestamps are read, so handles do not
        # pile up while scanning a long file list.
        with dataset:
            try:
                timestamps = dataset["VERSE_TIME"].values
            except KeyError:
                timestamps = [
                    norm.cses_to_unix(raw)
                    for raw in dataset["UTCTime"].values.flatten()
                ]

        if len(timestamps) == 0:
            continue

        if abs(timestamps[-1] - timestamps[0]) > min_duration:
            complete.append(path)

    return complete

In [None]:
def remove_invalid_lat_lon(filelist):
    """Drop files that contain out-of-range geographic coordinates.

    Each path is opened with ``xr.open_zarr``. Latitude and longitude are
    read from ``GEO_LAT``/``GEO_LON`` when available; otherwise from the
    ``LonLat`` variable, taking column 0 as longitude and column 1 as
    latitude. A file is kept only when no latitude has an absolute value
    above 90 and no longitude has an absolute value above 180.

    Args:
        filelist: Iterable of dataset paths.

    Returns:
        list[str]: The kept paths, in input order.

    Notes:
        Paths that raise ``zipfile.BadZipFile`` on open are skipped.
    """
    def _exceeds(values, limit):
        # True when any element of the array has |value| > limit.
        return bool((abs(values) > limit).any())

    valid = []

    for path in tqdm(filelist, desc="Removing invalid coordinates"):
        try:
            dataset = xr.open_zarr(path)
        except zipfile.BadZipFile:
            continue

        # Close each dataset once the coordinates are loaded.
        with dataset:
            try:
                lat = dataset['GEO_LAT'].values
                lon = dataset['GEO_LON'].values
            except KeyError:
                lonlat = dataset['LonLat'].values
                lat = lonlat[:, 1]
                lon = lonlat[:, 0]

        # Both checks sit under a single negation: a file is rejected when
        # EITHER coordinate is out of range, whichever variables it came from.
        if not (_exceeds(lat, 90) or _exceeds(lon, 180)):
            valid.append(path)

    return valid

In [None]:
def remove_nan_files(filelist):
    """Drop files whose payload variables contain missing values.

    The payload is derived from the file name: names starting with
    ``CSES_HEP_DDD`` map to ``HEP_3``; otherwise the third and fourth
    ``"_"``-separated fields are joined (e.g. ``HEP_1``). For that payload,
    every variable listed in ``mod_payload_params`` that exists in the
    dataset is checked with ``isnull()``.

    A file is kept - exactly once - only when at least one listed variable is
    present and none of the present ones contains a null value. Listed
    variables that are absent from the dataset are ignored.

    Args:
        filelist: Iterable of dataset paths.

    Returns:
        list[str]: The kept paths, in input order, without duplicates
        introduced by this function.

    Raises:
        KeyError: If the payload derived from a file name is not one of the
            known payloads.
        IndexError: If a file name has fewer than four ``"_"``-separated
            fields.

    Notes:
        Paths that raise ``zipfile.BadZipFile`` on open are skipped.
    """
    def extract_payload(filename: str):
        # Work on the file name only, using the platform's path separator.
        filename = os.path.basename(filename)
        if filename.startswith('CSES_HEP_DDD'):
            return 'HEP_3'
        fields = filename.split("_")
        return fields[2] + "_" + fields[3]

    mod_payload_params = {
        "HEP_1": ["Count_Electron", "Count_Proton", "A411", "A412"],
        "HEP_2": ["Count_Electron", "Count_Proton", "A411", "A412"],
        "HEP_3": ["HEPD_ele_counts", "HEPD_pro_counts"],
        "HEP_4": ["XrayRate"],
        "EFD_1": ["A111_W", "A112_W", "A113_W", "A111_P", "A112_P", "A113_P"],
        "LAP_1": ["A311", "A321"],
    }

    clean = []

    for path in tqdm(filelist, desc="Removing NaN containing files"):
        try:
            dataset = xr.open_zarr(path)
        except zipfile.BadZipFile:
            continue

        checked = 0
        has_nan = False

        # Close each dataset once its variables have been inspected.
        with dataset:
            for name in mod_payload_params[extract_payload(path)]:
                try:
                    variable = dataset[name]
                except KeyError:
                    # Variable not stored in this file: nothing to check.
                    continue

                checked += 1
                if bool(variable.isnull().any()):
                    has_nan = True
                    break  # one NaN-containing variable is enough to reject

        # Decide once per file so a path can never be appended twice, and a
        # single clean variable cannot mask a NaN in another one.
        if checked and not has_nan:
            clean.append(path)

    return clean

In [None]:
def get_fixed_data(directory):
    """Run the full file-selection pipeline on ``directory``.

    Collects every file with ``get_files`` and then passes the list through
    ``get_night_orbit_files``, ``get_complete_files``,
    ``remove_invalid_lat_lon`` and ``remove_nan_files``, in that order.

    Args:
        directory: Root folder to scan.

    Returns:
        The list returned by the last filter stage.
    """
    stages = (
        get_night_orbit_files,
        get_complete_files,
        remove_invalid_lat_lon,
        remove_nan_files,
    )

    remaining = get_files(directory)
    for stage in stages:
        remaining = stage(remaining)

    return remaining