-
Notifications
You must be signed in to change notification settings - Fork 1
/
UPC_resample_datasets.py
368 lines (309 loc) · 15.2 KB
/
UPC_resample_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
# -*- coding: utf-8 -*-
"""
Created: 2021/03/29
@author: Jose M. Alsina (UPC)
with technical support from Deep Solutions (David Pérez, Oscar Serra)
"""
from config import sample_data as cfg
#import UPC_config as cfg
import numpy as np
import pandas as pd
import xarray as xr
import math
from matplotlib import pyplot as plt
import os
import progressbar
import datetime
from dask.diagnostics import ProgressBar
from skimage import measure
from matplotlib import path
from numpy import ma
def extiende_vector(v, minimo, maximo):
pre = []
incr = v[1]-v[0]
cur = v[0] - incr
while cur > minimo:
pre.append(cur)
cur = cur - incr
pre.reverse()
post = []
cur = v[-1] + incr
while cur < maximo:
post.append(cur)
cur = cur + incr
return np.asarray(pre + v.tolist() + post)
def mix_xarrays(a,b, latitude:str='latitude', longitude:str='longitude', variables:list=['u', 'v']):
"""
Superimposes the xarray data with spatial data (lat,lon)
Does not take into account the time dimension, so it must be the same in both arrays
The arrays can have different resolutions with the highest resolution one takiing priority
Resolution: area(lat-lon)/num_data
:param a: xr.DataArray o xr.Dataset with spatial data
:param b: xr.DataArray o xr.Dataset with spatial data
:param latitude: lat coordinate str label
:param longitude: lon coordinate str label
:param variables: list[str] with the names of variables to combine
:return: NEW xarray with combined data.
"""
# Set which array has a larger resolution and obtain the data fromthe maximum resolution
res_a = (a[latitude][-1]-a[latitude][0])*(a[longitude][-1]-a[longitude][0])
res_a = (a[latitude].size * a[longitude].size) / res_a
res_b = (b[latitude][-1]-b[latitude][0])*(b[longitude][-1]-b[longitude][0])
res_b = (b[latitude].size * b[longitude].size) / res_b
if res_b>res_a:
hi_res = b
lo_res = a
else:
hi_res = a
lo_res = b
interp_lo = lo_res.interp(latitude=hi_res[latitude].data, longitude=hi_res[longitude].data, method='linear') #linear and nearest methods return arrays including NaN, while other methods such as cubic or
for v in variables:
ind_nan = np.isnan(hi_res[v].data)
hi_res[v].data[ind_nan] = interp_lo[v].data[ind_nan]
return hi_res
def mix_xarrays2(coastal, ibi, latitude:str= 'latitude', longitude:str= 'longitude', variables:list=['u', 'v']):
"""
Mix information from two xarrays with spatial infromation (lat,lon)
It does not account for the time dimension (time), therefore time needs to be the same in both arrays
Required if no interpolation (prepare files)
:param coastal: xr.DataArray or xr.Dataset with spatial data
:param ibi: xr.DataArray or xr.Dataset with spatial data
:param latitude: str label with infromation of the latitude coordinate
:param longitude: str label with infromation of the longitude coordinate
:param variables: list[str] with names of the varibales to combine
:return: NEW xarray with combined data.
"""
# get the shoreline configuration from initial velocity
# we use the velocity modulus tod efine the limit. Load the initial data
lon = np.array(coastal.longitude)
lat = np.array(coastal.latitude)
U = np.array(coastal.u[0,0,:,:])
V = np.array(coastal.v[0,0,:,:])
Umod = np.sqrt(np.power(U,2) + np.power(V,2))
# Define the shoreline coordinate
shore_mask, shore = Define_landidentifier(lon, lat, Umod)
lo_res = ibi.compute() #we add the compute for indexing, see below. Otherwise we would need to make a loop covering the dataset chunks
hi_res = coastal.compute()
interp_lo = lo_res.interp(latitude=hi_res[latitude].data, longitude=hi_res[longitude].data, method='nearest') #linear and nearest methods return arrays including NaN, while other methods such as cubic or quadratic return all NaN arrays
# Search for values thata re not nana for the variables indicated in the list and assign the low resolution values by interpolation
for v in variables:
ind_nan = np.isnan(hi_res[v].data)
# Assigment here
hi_res[v].data[ind_nan] = interp_lo[v].data[ind_nan]
# Here we use the shoreline mask to define shoreline values after interpolation
hi_res[v].data[:,:,shore_mask[0],shore_mask[1]]=0
hi_res[v].data[:,:,shore[:,0],shore[:,1]]=0
#return interp_lo
return hi_res
def mix_files(files_a, files_b, new_dir):
"""
Opens two folders with spatial data and superimposes them generating new folders in the specified directory
Time dimension and the number of folders must be the same in both directories
:param files_a: list(str) list of paths in folders a
:param files_b: list(str) list of paths in folders b
:param new_dir: str creates destination directory if does not exist
"""
new_dir = Path(new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
assert len(files_b)==len(files_a), "Different number of folders in the lists"
print("Mixing files...")
for i in progressbar.progressbar(range(len(files_a))):
a = xr.open_dataset(files_a[i])
b = xr.open_dataset(files_b[i])
c = mix_xarrays(a, b, latitude='latitude', longitude='longitude', variables=['u', 'v'])
base_filename_a = os.path.splitext(os.path.split(files_a[i])[1])[0]
base_filename_b = os.path.splitext(os.path.split(files_b[i])[1])[0]
res_filename = f'overlap_{base_filename_a}_{base_filename_b}.nc'
c.to_netcdf(os.path.join(new_dir, res_filename))
print(f'Job done. Files can be found at {os.path.abspath(new_dir)}')
def resample_files(files_a, files_b, new_dir):
"""
Saves the new resampled variables in a new dirctory
:param files_a: list(str) list of paths of folders a
:param new_dir: str creates destination directory if does not exist
"""
if not(os.path.isdir(new_dir)):
os.makedirs(new_dir, exist_ok=True)
#assert len(files_b)==len(files_a), "Numero de ficheros distinto en las listas"
print("Storing resampled files in netcdf files...")
for i in progressbar.progressbar(range(len(files_a))):
a = xr.open_dataset(files_a[i])
b = xr.open_dataset(files_b[i])
c = mix_xarrays(a, b, latitude='latitude', longitude='longitude', variables=['u', 'v'])
base_filename_a = os.path.splitext(os.path.split(files_a[i])[1])[0]
base_filename_b = os.path.splitext(os.path.split(files_b[i])[1])[0]
#res_filename = f'overlap_{base_filename_a}_{base_filename_b}.nc'
res_filename = f'{base_filename_a}_resampled.nc'
c.to_netcdf(os.path.join(new_dir, res_filename))
print(f'Job done. Files can be found at {os.path.abspath(new_dir)}')
def resample_files_ibi(filenames_prt, filenames_cst, filenames_ibi, new_dir_ibi, new_dir_ci, new_dir_hc):
"""
Keeps the three datasets in separate directories
IBI: dataset grouped in folders by day and resampled to coastal dates
ci: dataset grouped by day, corresponds to coastal filling NaNs with IBI data
hc: dataset grouped by day, corresponds to coastal filling NaNs with ci data
:param filenames_prt: list(str) list of paths in harbour folder
:param filenames_cst: list(str) list of paths in coastal folder
:param filenames_ibi:[str] list of paths in IBI folder
:param new_dir_ibi: str directory where IBI files to be saved
:param new_dir_ci: str directory where ci files to be saved
:param new_dir_hc: str directory where hc files to be saved
"""
print("Resampling datasets...")
dir_ibi = Path(new_dir_ibi)
dir_ci = Path(new_dir_ci)
dir_hc = Path(new_dir_hc)
dir_ibi.mkdir(parents=True, exist_ok=True)
dir_ci.mkdir(parents=True, exist_ok=True)
dir_hc.mkdir(parents=True, exist_ok=True)
# Abre coastal y IBI
coastal_mf = xr.open_mfdataset(filenames_cst, parallel=True) # loads faster with multiproc
ibidat_mf = xr.open_mfdataset(filenames_ibi, parallel=True)
# Prepara IBI
ibi_prep = prepare_ibi(ibidat_mf, coastal_mf)
plt.pcolormesh(ibi_prep.longitude, ibi_prep.latitude, ibi_prep.u[0, 0, :, :])
# Save IBI
_, datasets_ibi = zip(*ibi_prep.groupby("time.day")) # groupby returns tuples (<dia>,<dataset>), paso del dia
paths_ibi = []
paths_ci = []
paths_hc = []
for dataset in datasets_ibi:
first_date = pd.to_datetime(dataset.time[0].data).date()
paths_ibi.append(dir_ibi/f'IBI_{first_date}_resampled.nc')
paths_ci.append(dir_ci/f'coastal_IBI_{first_date}_resampled.nc')
paths_hc.append(dir_hc/f'harbour_coastal_{first_date}_resampled.nc')
# paths = [f'ibi_{day}_resampled.nc' for day in days]
with ProgressBar():
print("Saving IBI resampled files")
xr.save_mfdataset(datasets_ibi, paths_ibi)
# Fill coastal empty grids with IBI data and deallocate memory
ci = mix_xarrays2(coastal_mf, ibi_prep, latitude='latitude', longitude='longitude', variables=['u', 'v'])
plt.pcolormesh(ci.longitude, ci.latitude, ci.u[0, 0, :, :])
del ibidat_mf
del datasets_ibi
# Resample coastal with IBI
_, datasets_ci = zip(*ci.groupby("time.day"))
with ProgressBar():
print("Saving Coastal-IBI mixed files")
xr.save_mfdataset(datasets_ci, paths_ci)
del datasets_ci
# Load harbour data
harbour_mf = xr.open_mfdataset(filenames_prt, parallel=True)
# Mix harbour y coastal files
hc = mix_xarrays2(harbour_mf, coastal_mf, latitude='latitude', longitude='longitude', variables=['u', 'v'])
plt.pcolormesh(hc.longitude, hc.latitude, hc.u[0, 0, :, :])
plt.title("Three layers data in one image")
# Save harbour and coastal data (resamples) in the same loop
_, datasets_hc = zip(*hc.groupby("time.day"))
with ProgressBar():
print("Saving Harbour-Coastal mixed files")
xr.save_mfdataset(datasets_hc, paths_hc)
print(f'Job done. Files can be found at {os.path.abspath(dir_ibi)}, {os.path.abspath(dir_ci)} and {os.path.abspath(dir_hc)}')
def prepare_ibi(dataset, dataset_no_ibi):
time_prep = dataset.interp(time=dataset_no_ibi.time)
name_prep = time_prep.rename({'vo':'v', 'uo':'u'})
name_prep = name_prep.assign_coords(depth=1.0)
name_prep = name_prep.expand_dims('depth')
name_prep = name_prep.transpose('time','depth','latitude','longitude')
return name_prep
def get_right_files(data_dir, file_search_string):
#get Dates
dl_i = cfg.download_init.split("/")
dl_e = cfg.download_end.split("/")
YYi = [int(dl_i[0]),int(dl_i[1]),int(dl_i[2])]
YYe = [int(dl_e[0]),int(dl_e[1]),int(dl_e[2])]
dt0 = datetime.datetime(YYi[0],YYi[1],YYi[2])
dtend = datetime.datetime(YYe[0],YYe[1],YYe[2])
filenames_list = pd.Series(os.listdir(data_dir))
filenames_order = []
for date in pd.date_range(dt0, dtend, freq = 'D').strftime('%Y%m%d'):
for filenames in filenames_list[filenames_list.str.contains(date)]:
filenames_order.append( Path(data_dir)/Path(filenames))
return filenames_order
def Define_landidentifier(lon,lat,Umod):
# first we need to read the velocity field
Umod[np.isnan(Umod)] = 0
contours = measure.find_contours(Umod, 0)
ishow = False
"""
for contour in contours:
ax.plot(contour[:, 1], contour[:, 0], linewidth=2)
ax.plot(lon[contour[:, 1].astype(int)], lat[contour[:, 0].astype(int)], linewidth=2)
"""
shore_aux = contours[0]
index_min = np.argmin(lon[shore_aux[:, 1].astype(int)])
# find edge points
leftup = (np.where(lat == np.amax(lat) ), np.where( lon == lon[shore_aux[index_min, 1].astype(int)] ))
leftup = np.array(leftup)
leftup = np.reshape(leftup,(1,2))
rightup = (np.where(lat == np.amax(lat) ), np.where(lon == lon[shore_aux[0, 1].astype(int) ] ) )
rightup = np.array(rightup)
rightup = np.reshape(rightup,(1,2))
# append the edge point to the shoreline
shore = shore_aux[0:index_min, :]
shore = shore.astype(int)
shore = np.append(shore,leftup,axis=0)
shore = np.append(shore,rightup,axis=0)
shore = np.append(shore,shore[0:1,0:2],axis=0)
if ishow:
fig, ax = plt.subplots()
ax.pcolormesh(lon, lat, Umod) #port
ax.plot(lon[shore[:, 1]], lat[shore[:, 0]] , linewidth=2)
plt.show()
closed_path = path.Path(shore, closed = True)
idx = np.indices(Umod.shape)
idx2 = idx.reshape(2,len(lon)*len(lat))
inside = closed_path.contains_points(idx2.T)
inside2 = inside.reshape(Umod.shape)
Umod2 = Umod.copy()
Umod2[inside2] = -1
inside3= np.where(inside2)
if ishow:
fig1,(ax1,ax2)=plt.subplots(ncols=2, figsize=(6,3))
cf = ax1.pcolormesh(lon, lat, Umod) #port
fig.colorbar(cf, ax=ax1)
cf2 = ax2.pcolormesh(lon, lat, Umod2) #port
fig.colorbar(cf2, ax=ax2)
plt.show()
shoreline = shore
in_mask = inside3
return in_mask, shoreline
if __name__ == '__main__':
from pathlib import Path
# Directory paths
data_dir_prt = Path(cfg.data_base_path) / Path(cfg.port_files_dir) # Port foles
data_dir_cst = Path(cfg.data_base_path) / Path(cfg.coastal_files_dir) # Coastal files
data_dir_reg = Path(cfg.data_base_path) / Path(cfg.regional_files_dir) # Regional files
filenames_prt = sorted(data_dir_prt.glob(cfg.port_file_search_string)) # files
filenames_prt_order = get_right_files(data_dir_prt, cfg.port_file_search_string)
filenames_cst = sorted(data_dir_cst.glob(cfg.coastal_file_search_string))
filenames_cst_order = get_right_files(data_dir_cst, cfg.coastal_file_search_string)
filenames_reg = sorted(data_dir_reg.glob(cfg.regional_file_search_string))
filenames_reg_order = get_right_files(data_dir_reg, cfg.regional_file_search_string)
filenames_prt = filenames_prt_order
filenames_cst = filenames_cst_order
filenames_reg = filenames_reg_order
# Plot mesh for initial files
# IBI data initial file
ibi1 = filenames_reg[0] # file inicial
ibidat = xr.open_dataset(ibi1)
ibidat_mf = xr.open_mfdataset(filenames_reg, parallel=True)
# plt.pcolormesh(ibidat.longitude, ibidat.latitude, ibidat.uo[0, :, :])
# Coastal data
cst1 = filenames_cst[0] # file inicial
coastal = xr.open_dataset(cst1)
coastal_mf = xr.open_mfdataset(filenames_cst, parallel=True)
# plt.pcolormesh(coastal.longitude, coastal.latitude, coastal.u[0, 0, :, :])
# Harbour data
prt1 = filenames_prt[0]
port = xr.open_dataset(prt1)
#plt.pcolormesh(port.longitude, port.latitude, port.u[0, 0, :, :])
ibi_prepared = prepare_ibi(ibidat_mf, coastal_mf)
d = mix_xarrays2(coastal_mf, ibi_prepared, latitude='latitude', longitude='longitude', variables=['u', 'v'])
plt.pcolormesh(d.longitude, d.latitude, d.u[0, 0, :, :])
# Creates new resampled folders
new_dir_ibi = Path(cfg.data_base_path) / Path(cfg.regional_files_dir + '_resampled')
new_dir_hc = Path(cfg.data_base_path) / Path(cfg.port_files_dir + '_resampled')
new_dir_ci = Path(cfg.data_base_path) / Path(cfg.coastal_files_dir + '_resampled')
resample_files_ibi(filenames_prt, filenames_cst, filenames_reg, new_dir_ibi, new_dir_ci, new_dir_hc)
plt.show()