In [1]:
'''
Creates training data for F13 channel predictor
'''


import glob
import os

import numpy as np
import xarray as xr
from tqdm import tqdm

import geography
from util_funcs import data2xarray, array_funcs
from util_funcs.L1C import scantime2datetime

# Platform / instrument pair whose L1C files are turned into training data.
satellite, sensor = 'F13', 'SSMI'

# Root directory of the L1C archive for the chosen satellite.
datapath = '/edata1/archive/GPM/1C_' + satellite + '_V7/'

In [None]:
'''
Satellite: F13

Get data and quality check for training.

nscans: generally around 1600 & 3200
npixs: 64 & 128

Tb array will be set up as follows:
    Tbs =  [m x n], where m is the number of samples and n is the
           number of channels (features)
    1-2:   19.35 V and H
    3:     22.235 V
    4-5:   37.0 V and H
    6-9:   85.5 V and H #Double-sampled 85
           (6-7: V/H at even S2 pixels, 8-9: V/H at odd S2 pixels)

Each randomly selected file is read, reduced to the pixels whose quality
flags are all zero and whose Tbs contain no NaNs, tagged with a surface
type, and collected. All per-file datasets are concatenated once along
'pixels' and written to training_data/<satellite>_training_data.nc.

Change cell to code to start from scratch.
'''

#Set up a random seed for reproducibility
np.random.seed(12)

n_files = 200   #number of files drawn from the archive
tb_dim = 9      #number of Tb channels (features) per sample
qual_dim = 2    #one quality flag each for the S1 and S2 swaths
outfile = f'training_data/{satellite}_training_data.nc'

#Create the output directory up front so a missing directory fails here
#rather than after the (long) processing loop.
os.makedirs(os.path.dirname(outfile), exist_ok=True)


def _dir_code(path):
    '''Return the first directory level below datapath as an int.'''
    #Taken relative to datapath so the filter does not depend on how many
    #directory levels the archive root itself has.
    return int(os.path.relpath(path, datapath).split(os.sep)[0])


#Get a random list of files
file_list = sorted(glob.glob(f'{datapath}*/*/1C.{satellite}.{sensor}.*.HDF5'))

#Everything looks good before about 2008:
#(presumably the directory code is YYMM; 406 and 203 are excluded as well.)
#NOTE(review): if the code is YYMM, 1990s directories (e.g. 9505) compare
#greater than 801 and are dropped by this test - confirm that is intended.
good_files = [ifile for ifile in file_list
              if _dir_code(ifile) < 801 and _dir_code(ifile) not in (406, 203)]

if not good_files:
    raise FileNotFoundError(f'No usable {satellite} {sensor} files found under {datapath}')

#NOTE(review): np.random.choice samples WITH replacement by default, so the
#same file can be drawn (and its pixels included) more than once. Left as is
#to keep the selection reproducible; pass replace=False if that is unwanted.
flist = np.random.choice(good_files, size=n_files)

#Per-file datasets are collected here and concatenated once after the loop.
#(Seeding the result from "file 0" breaks if that file is skipped, and
#concatenating inside the loop re-copies everything on every iteration.)
datasets = []

#Loop through files and get good quality data.
for ifile in tqdm(flist, desc="Processing Files"):

    #Low-frequency swath: geolocation, quality and the first five channels.
    with xr.open_dataset(ifile, group='S1', decode_timedelta=False) as f:
        lat = f.Latitude.values
        lon = f.Longitude.values

        nscans, npixs = lat.shape
        Tbs = np.zeros([nscans,npixs,tb_dim], dtype=np.float32)
        qual = np.zeros([nscans,npixs,qual_dim], dtype=np.int32)

        qual[:,:,0] = f.Quality.values
        Tbs[:,:,0:5] = f.Tc[:,:,0:5].values #19V, 19H, 22V, 37V, 37H

    #High-frequency swath: read each array once, then subsample.
    with xr.open_dataset(ifile, group='S2', decode_timedelta=False) as f:
        qual_hi = f.Quality.values
        tc_hi = f.Tc.values

    #Even-numbered scans line up with S1 Scans (0,2,4,...)
    #NOTE(review): only the even-pixel S2 quality flag is checked, although
    #the odd-pixel Tbs are used as features too - confirm this is intended.
    qual[:,:,1] = qual_hi[::2,::2]
    Tbs[:,:,5] = tc_hi[::2,::2,0] #85V, even scans, even pixs
    Tbs[:,:,6] = tc_hi[::2,::2,1] #85H, even scans, even pixs
    Tbs[:,:,7] = tc_hi[::2,1::2,0] #85V, even scans, odd pixs
    Tbs[:,:,8] = tc_hi[::2,1::2,1] #85H, even scans, odd pixs

    with xr.open_dataset(ifile, group='S1/ScanTime', decode_timedelta=False) as f:
        scantime = (f.Year.values, f.Month.values, f.DayOfMonth.values, f.Hour.values, f.Minute.values, f.Second.values)

    #Change scan time format from L1C to datetime format for easier use
    scantime = scantime2datetime(scantime)

    #Get only good quality data and reshape:
    goodqual = np.all(qual == 0, axis=2)
    if not goodqual.any():
        print('all were bad.')
        continue
    lat = lat[goodqual]
    lon = lon[goodqual]
    #scantime is per scan, so index it with the scan index of each good pixel
    scantime = scantime[np.where(goodqual)[0]]
    Tbs = Tbs[goodqual]

    #Check for NaNs (shouldn't be any if all good, but I've seen some)
    nonans = array_funcs.find_nan_rows(Tbs, return_good=True)
    Tbs = Tbs[nonans]
    lat = lat[nonans]
    lon = lon[nonans]
    scantime = scantime[nonans]

    #Nothing left after NaN removal: this file contributes no samples.
    if Tbs.shape[0] == 0:
        print('no NaN-free samples.')
        continue

    #Attach GPROF surface map data to each pixel
    sfctype = geography.attach_gpm_sfctype(lat, lon, scantime, sensor=sensor)

    nsamples = lat.size

    #Package this file's samples as a dataset
    dset = data2xarray(data_vars = [lat, lon, scantime, sfctype, Tbs],
                       var_names = ['latitude', 'longitude', 'scantime', 'sfctype', 'Tbs'],
                       dims = [nsamples, tb_dim],
                       dim_names = ['pixels', 'channels'])

    datasets.append(dset)

if not datasets:
    raise RuntimeError('No good-quality data found in any of the selected files.')

#Output as NetCDF
training_dataset = xr.concat(datasets, dim='pixels')
training_dataset.to_netcdf(outfile, engine='netcdf4')



Processing Files:  36%|████████▋               | 72/200 [08:50<11:15,  5.28s/it]

all were bad.


Processing Files:  54%|████████████▎          | 107/200 [13:11<11:01,  7.11s/it]

all were bad.


Processing Files:  94%|█████████████████████▌ | 188/200 [24:29<01:31,  7.59s/it]