In [4]:
'''
Creates training data for GMI channel predictor
'''


import numpy as np
import xarray as xr
import glob
from tqdm import tqdm
from util_funcs.L1C import scantime2datetime
from util_funcs import data2xarray, array_funcs
import geography

import sensor_info
import local_functions

satellite = sensor_info.satellite
sensor = sensor_info.sensor


datapath = f'/pdata4/archive/GPM/1CR_GMI_V7'

In [2]:
'''
GMI

Get data and quality check for training.

nscans: generally around 3000
npixs: 221 - 50 = 171 (high frequency channels don't go all the way to the
                       swath edges)

    
Tb array will be set up as follows:
    Tbs =  [m x n], where m is the number of samples and n is the
           number of channels (features)
    1-2:   10V and H
    3-4:   19V and H
    5:     24V
    6-7:   37V and H
    8-9:   89V and H
    10-11: 166V and H
    12:    183+-3 V
    13:    183+-7 V
'''

#Set up a random seed for reproducibility
np.random.seed(12)

#Get a random list of files
file_list = glob.glob(f'{datapath}/*/*/1C-R.{satellite}.{sensor}.*.HDF5'); file_list.sort()

good_files = [ifile for ifile in file_list if int(ifile.split('/')[5]) <= 2211]

flist = np.random.choice(good_files, size=100)

#Loop through files and get good quality data.
for i, ifile in enumerate(tqdm(flist, desc="Processing Files")):

    #print(f'{i+1} of {len(flist)}, {ifile}')

    data = local_functions.read_gmi_l1c(ifile)

    lat = data['lat']
    lon = data['lon']
    scantime = data['scantime']
    Tbs = data['Tbs']
    qual = data['qual']

    #Get only good quality data and reshape:
    goodqual = np.all(qual == 0, axis=2)
    all_bad = np.all(goodqual == False)
    if all_bad:
        print('all were bad.')
        continue
    lat = lat[goodqual]
    lon = lon[goodqual]
    scantime = scantime[np.where(goodqual)[0]]
    Tbs = Tbs[goodqual]

    #Check for NaNs (shouldn't be any if all good, but I've seen some)
    nonans = local_functions.find_nan_rows(Tbs, return_good=True)
    Tbs = Tbs[nonans]
    lat = lat[nonans]
    lon = lon[nonans]
    scantime = scantime[nonans]
    

    #Attach GPROF surface map data to each pixel
    sfctype = local_functions.attach_gpm_sfctype(lat, lon, scantime, sensor=sensor)

    npixs = lat.size

    #Output as NetCDF
    dset = data2xarray(data_vars = [lat, lon, scantime, sfctype, Tbs],
                       var_names = ['latitude', 'longitude', 'scantime', 'sfctype', 'Tbs'],
                       dims = [npixs, sensor_info.nfeatures],
                       dim_names = ['pixels', 'channels'])


    if i == 0:
        training_dataset = dset
    else:
        training_dataset = xr.concat((training_dataset, dset), dim='pixels')

training_dataset.to_netcdf(f'training_data/{satellite}_training_data.nc', engine='netcdf4')



Processing Files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [23:08<00:00, 13.89s/it]


In [8]:
with xr.open_dataset(f'training_data/{satellite}_training_data.nc') as f:
    sfctype = f.sfctype.values

for i in range(19):
    print(f'Number of pixels with surface type {i}: {np.sum(sfctype == i)}')

print(f'Total number of ocean pixels: {np.sum(sfctype == 1)}')
print(f'Total number of nonocean pixels: {np.sum(sfctype > 1)}')

Number of pixels with surface type 0: 0
Number of pixels with surface type 1: 31224255
Number of pixels with surface type 2: 2128547
Number of pixels with surface type 3: 2184964
Number of pixels with surface type 4: 1224953
Number of pixels with surface type 5: 1018744
Number of pixels with surface type 6: 534120
Number of pixels with surface type 7: 901619
Number of pixels with surface type 8: 275911
Number of pixels with surface type 9: 381470
Number of pixels with surface type 10: 881218
Number of pixels with surface type 11: 849016
Number of pixels with surface type 12: 1069108
Number of pixels with surface type 13: 710398
Number of pixels with surface type 14: 841322
Number of pixels with surface type 15: 704010
Number of pixels with surface type 16: 217736
Number of pixels with surface type 17: 2455662
Number of pixels with surface type 18: 1437476
Total number of ocean pixels: 31224255
Total number of nonocean pixels: 17816274


In [9]:
flist

array(['/pdata4/archive/GPM/1CR_GMI_V7/1608/160830/1C-R.GPM.GMI.XCAL2016-C.20160830-S124732-E142006.014234.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/2012/201216/1C-R.GPM.GMI.XCAL2016-C.20201216-S025258-E042529.038634.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/1708/170822/1C-R.GPM.GMI.XCAL2016-C.20170822-S135442-E152714.019788.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/2103/210330/1C-R.GPM.GMI.XCAL2016-C.20210330-S091828-E105059.040256.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/2008/200804/1C-R.GPM.GMI.XCAL2016-C.20200804-S203948-E221222.036561.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/1511/151103/1C-R.GPM.GMI.XCAL2016-C.20151103-S171742-E185014.009554.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/1604/160420/1C-R.GPM.GMI.XCAL2016-C.20160420-S223321-E000554.012187.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/1403/140322/1C-R.GPM.GMI.XCAL2016-C.20140322-S142210-E155437.000357.V07A.HDF5',
       '/pdata4/archive/GPM/1CR_GMI_V7/1701/1701