In [1]:
import cc3d
import netCDF4
import glob
import numpy
import sys
import datetime
import pickle

In [2]:
# Root directory holding the IMERG inputs ('imerg/') and the pickled outputs
# ('pickles/'). Keep the trailing slash: the glob pattern used when loading the
# files is built by plain string concatenation.
data_dir = '/tablespace/xcal/'

# Loading the data

In [3]:
def load_imerg(file_path, variable_name):
    """Read one variable and the parsed file header from an IMERG netCDF file.

    Parameters
    ----------
    file_path : str
        Path of the netCDF4 file, opened read-only.
    variable_name : str
        Name of the variable to read from the file's ``Grid`` group.

    Returns
    -------
    data : array
        First entry along the variable's leading axis, transposed and then
        reversed along its (new) first axis.
    header : dict
        ``key -> value`` strings parsed from the ``FileHeader`` attribute.
        Records are separated by a semicolon followed by a newline and are
        written as ``key=value``; only the first ``=`` splits, so values may
        themselves contain ``=``. Blank records are ignored, and a record
        without ``=`` maps its whole text to an empty string.
    """
    # The context manager closes the file handle even if reading fails;
    # without it one handle leaks per file when this is called in a loop.
    with netCDF4.Dataset(file_path, 'r', format='NETCDF4') as netcdf:
        # [:] pulls the values into memory, so they stay usable after close.
        data = netcdf.groups['Grid'][variable_name][:][0].T[::-1]
        raw_header = netcdf.FileHeader

    header = {}
    for record in raw_header.split(';\n'):
        if not record.strip():
            # Typically the empty tail left after the final separator.
            continue
        key, _, value = record.partition('=')
        header[key] = value
    return data, header

In [4]:
variable_name = 'precipitationCal'
file_paths = sorted(glob.glob(data_dir + 'imerg/3B-HHR.MS.MRG.3IMERG*'))

stack, headers, timestamps = [], [], []
n_files = len(file_paths)
for n_loaded, file_path in enumerate(file_paths):
    # Countdown of the files still to be read, rewritten in place via '\r'
    print(n_files - n_loaded, end='\r')
    sys.stdout.flush()
    frame, header = load_imerg(file_path, variable_name)
    stack.append(frame)
    headers.append(header)
    timestamps.append(header['StartGranuleDateTime'])

# Per-file frames combined into one array, in sorted-filename order
data = numpy.array(stack)

# Granule start times as datetime objects and as a minute-resolution array
timestamp_format = '%Y-%m-%dT%H:%M:%S.000Z'
timestamps_dt = [datetime.datetime.strptime(stamp, timestamp_format) for stamp in timestamps]
timestamps_np = numpy.array(timestamps_dt, dtype='datetime64[m]')

1000

In [5]:
# Persist the stacked array before it is thresholded.
# Protocol 4 is the first one able to serialise objects larger than 4 GiB; it
# only became the default in Python 3.8, so ask for at least that explicitly.
with open('{}/pickles/data.pickle'.format(data_dir), 'wb') as f:
    pickle.dump(data, f, protocol=max(4, pickle.DEFAULT_PROTOCOL))

# Thresholding and connected-component labelling (CCL)

In [6]:
# Binarise `data` in place: values below the threshold become 0, values at or
# above it become 1
thresh = 0.1
numpy.putmask(data, data < thresh, 0)
numpy.putmask(data, data >= thresh, 1)

# Settings shared by the cc3d calls that follow
connectivity = 6
min_voxels = 100

In [7]:
# Connected-component labelling of the binarised array
labels = cc3d.connected_components(
    data,
    connectivity=connectivity,
    delta=0,
)

In [8]:
# Small-component removal, with min_voxels as the size threshold; the result
# replaces `labels`
labels = cc3d.dust(
    labels,
    connectivity=connectivity,
    threshold=min_voxels,
    in_place=False,
)

In [9]:
# Sorted ids of the labels still present, keeping only those greater than 0
present = numpy.unique(labels)
label_names = present[present > 0]

In [12]:
# One entry per distinct positive label, so this shows how many labels remain
label_names.shape

(94723,)

In [None]:
# Restrict the labelling to the n_largest biggest components; return_N=True
# makes the call return a second value, kept as N
n_largest = 100
largest, N = cc3d.largest_k(
    labels,
    k=n_largest,
    connectivity=connectivity,
    delta=0,
    return_N=True,
)

# Pickling the results

In [None]:
# Write each result to '<data_dir>/pickles/<name>.pickle'.
# Protocol 4 is the first one able to serialise objects larger than 4 GiB; it
# only became the default in Python 3.8, so ask for at least that explicitly.
pickle_protocol = max(4, pickle.DEFAULT_PROTOCOL)
for pickle_name, payload in (('largest', largest),
                             ('labels', labels),
                             ('timestamps', timestamps_dt)):
    with open('{}/pickles/{}.pickle'.format(data_dir, pickle_name), 'wb') as f:
        pickle.dump(payload, f, protocol=pickle_protocol)

In [None]:
# Second value returned by cc3d.largest_k because return_N=True was passed
N