This notebook is used to explore on what depth the sensors measureing the variables that we are interested in are located.
Before we have assumed that all of our data is found on depth=0

Observations:
There are many entries with non variable value and 9 asn quality control
There are 2 files (MO_TS_MO_6101629.nc, MO_TS_MO_6101628.nc VH) that have none for QC but random variable values

Conclusion:
Wave height (VHM0, VAVH) is almost entirely locate at depth 0
Wind speed (WSPD) is found on all heights (negative depths)

In [2]:
import numpy as np
import pandas as pd
import xarray as xr
import os
from collections import defaultdict
from tqdm import tqdm

In [3]:
#This function returns a list of (variable value, quality control value, depth index, deph quality control, deph value) for all entries in the 
#dataset ds with variable value not NAN or valid column quality control value
#if exclude_variable=True we only consider valid column quality control values and ignore variable value not NAN 
def not_nan_index_ds(ds, variable_name, exclude_variable_value=False):
    var_data = ds[variable_name].values
    var_qc = ds[variable_name+'_QC'].values
    deph_qc = ds['DEPH_QC'].values
    if 'DEPH' in ds.coords:
        deph_var = ds.coords['DEPH'].values
    else:
        deph_var = ds['DEPH'].values
    return not_nan_index(var_data, var_qc, deph_qc, deph_var, exclude_variable_value)

def not_nan_index(var_data, var_qc, deph_qc, deph_var, exclude_variable_value=False):
    if len(var_data.shape) != 2:
        raise ValueError("Array is not 2 dimensional")

    qc_valid = np.isin(var_qc, [0,1,2,3,4,5,6,7,8])

    if exclude_variable_value:
        indexes = np.argwhere(qc_valid)
        not_nan_list = [(var_qc[tuple(i)], i[1], deph_qc[tuple(i)], deph_var[tuple(i)]) for i in indexes] #tuple on form qc value, index(2nd dim)
    else:
        var_not_nan = ~np.isnan(var_data)
        indexes = np.argwhere(var_not_nan | qc_valid)
        not_nan_list = [(var_data[tuple(i)] ,var_qc[tuple(i)], i[1], deph_qc[tuple(i)], deph_var[tuple(i)]) for i in indexes] #triple on form datavalue, qc value, index(2nd dim)
    return not_nan_list

#Returns the count on qc, index pairs found in the column in dataset ds
def count_indexes(ds, column):
    not_nan_list = not_nan_index_ds(ds, column,exclude_variable_value=True)
    if len(not_nan_list) == 0:
        return None
    
    return pd.Series(not_nan_list).value_counts()

In [4]:
#To debug the functions

data = np.array([
    [np.nan,np.nan,np.nan],
    [17,np.nan,np.nan],
    [np.nan,13,np.nan],
    [np.nan,221,np.nan],
    [np.nan,np.nan,13],
    [1,2,3],
    [np.nan,np.nan,np.nan],
])

qc = np.array([
    [0,1,2],
    [3,np.nan,np.nan],
    [np.nan,4,np.nan],
    [np.nan,5,np.nan],
    [np.nan,np.nan,6],
    [7,8,9],
    [9,np.nan,np.nan],
])

deph_qc = np.array([
    [0,0,0],
    [1,1,1],
    [2,2,2],
    [3,3,3],
    [4,4,4],
    [7,8,9],
])

deph = np.array([
    [0,12,30],
    [0,12,30],
    [0,12,30],
    [0,12,30],
    [0,12,30],
    [0,12,30],
])

result = not_nan_index(data, qc, deph_qc, deph)
print(result)
print(np.isnan([i for t in result for i in t]).sum())

[(nan, 0.0, 0, 0, 0), (nan, 1.0, 1, 0, 12), (nan, 2.0, 2, 0, 30), (17.0, 3.0, 0, 1, 0), (13.0, 4.0, 1, 2, 12), (221.0, 5.0, 1, 3, 12), (13.0, 6.0, 2, 4, 30), (1.0, 7.0, 0, 7, 0), (2.0, 8.0, 1, 8, 12), (3.0, 9.0, 2, 9, 30)]
3


In [5]:
data_dir = '/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO'
all_variables = set(['VHM0', 'VAVH', 'WSPD'])
files = os.listdir(data_dir)

In [6]:
test_ds = xr.open_dataset(os.path.join(data_dir, 'BO_TS_MO_Koster.nc'))
test_ds

In [7]:
#DEPTH is somteimes locad in coords ans sometimes not!
for file in (files[:10]):
    #Load the data from the file
    file_path = os.path.join(data_dir, file)
    ds = xr.open_dataset(file_path)# , engine='scipy')
    if 'DEPH' in ds.data_vars:
        print(file, ' in coords')
    else:
        print(file, ' NOT in coords')

NO_TS_MO_FINO1.nc  NOT in coords
GL_TS_MO_6801028.nc  in coords
GL_TS_MO_46044.nc  in coords
GL_TS_MO_T8N170W.nc  in coords
GL_TS_MO_6200059.nc  in coords
GL_WS_MO_46079.nc  NOT in coords
BS_TS_MO_33835.nc  in coords
MO_TS_MO_Olympiada.nc  in coords
GL_TS_MO_23014.nc  in coords
GL_TS_MO_41110.nc  in coords


In [8]:
#Find the dimensions of depth
max_depth_dim = 0

for file in tqdm(files):
    #Load the data from the file
    file_path = os.path.join(data_dir, file)
    ds = xr.open_dataset(file_path)# , engine='scipy')
    #Skip those files without data of interest,
    if not any([c in ds.data_vars for c in all_variables]):
        continue

    depth_dim = len(ds['DEPTH'].values)
    if depth_dim > max_depth_dim:
        max_depth_dim = depth_dim
        print(file, max_depth_dim)

  0%|          | 4/2488 [00:00<02:18, 17.99it/s]

NO_TS_MO_FINO1.nc 7
GL_TS_MO_T8N170W.nc 13


  0%|          | 12/2488 [00:00<01:22, 29.87it/s]

GL_TS_MO_23014.nc 16


  2%|▏         | 44/2488 [00:03<03:03, 13.31it/s]

GL_TS_MO_23009.nc 18


  2%|▏         | 54/2488 [00:03<02:18, 17.57it/s]

BO_TS_MO_Kristineberg3.nc 21


  5%|▌         | 129/2488 [00:08<03:42, 10.59it/s]

MO_TS_MO_6101629.nc 22


  5%|▌         | 131/2488 [00:09<04:39,  8.43it/s]

GL_TS_MO_T0N165E.nc 23


 20%|██        | 509/2488 [00:41<03:55,  8.40it/s]

MO_TS_MO_61141.nc 28


 21%|██        | 515/2488 [00:42<03:12, 10.26it/s]

AR_TS_MO_C-Sulafjorden.nc 50


 72%|███████▏  | 1803/2488 [02:15<00:52, 12.93it/s]

GL_TS_MO_Mesurho.nc 51


 74%|███████▍  | 1845/2488 [02:19<00:53, 12.00it/s]

IR_TS_MO_6200085.nc 59


 95%|█████████▍| 2357/2488 [02:54<00:05, 22.12it/s]

BO_TS_MO_Koster.nc 70


100%|██████████| 2488/2488 [03:04<00:00, 13.50it/s]


In [9]:
#Analyse if we have nan values combined with non nan for the variable and quality control
result_df = pd.DataFrame({c: pd.Series(dtype=t) for c, t in {
            'file':str,
            'variable':str,
            'not_nan':object,

        }.items()})


for file in tqdm(files):
    #print('\n',file)

    #Load the data from the file
    file_path = os.path.join(data_dir, file)
    ds = xr.open_dataset(file_path)# , engine='scipy')
    common_vars =  all_variables.intersection(list(ds.data_vars))

    for var in common_vars:
        result = not_nan_index_ds(ds, var)

        if np.isnan([i for t in result for i in t]).sum() > 0:
            print(file, var)
            result_df.loc[len(result_df.index)] = [
                file,
                var,
                result,
            ]

  5%|▌         | 128/2488 [01:10<21:01,  1.87it/s]

MO_TS_MO_6101629.nc VHM0


 73%|███████▎  | 1815/2488 [19:29<05:47,  1.94it/s] 

MO_TS_MO_6101628.nc VHM0


100%|██████████| 2488/2488 [26:09<00:00,  1.58it/s]


In [10]:
for i,row in result_df.iterrows():
    print(row)
    print(type(row))
    row_result = pd.Series(row['not_nan'])
    row_not_nan_index =  row_result.apply(lambda t: any([np.isnan(i) for i in t]))
    print(row['file'], row_result[row_not_nan_index])

file                                      MO_TS_MO_6101629.nc
variable                                                 VHM0
not_nan     [(0.0780000037048012, 1.0, 0, 0.0, 0.0), (0.15...
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
MO_TS_MO_6101629.nc 22362      (99999.00474969763, nan, 1, 0.0, 0.5)
22363      (99999.00474969763, nan, 2, 0.0, 2.0)
22364      (99999.00474969763, nan, 3, 0.0, 3.0)
22365      (99999.00474969763, nan, 4, 0.0, 4.0)
22366      (99999.00474969763, nan, 5, 0.0, 5.0)
                          ...                   
53574    (99999.00474969763, nan, 17, 0.0, 17.0)
53575    (99999.00474969763, nan, 18, 0.0, 18.0)
53576    (99999.00474969763, nan, 19, 0.0, 19.0)
53577    (99999.00474969763, nan, 20, 0.0, 20.0)
53578    (99999.00474969763, nan, 21, 0.0, 21.0)
Length: 29799, dtype: object
file                                      MO_TS_MO_6101628.nc
variable                                                 VHM0
not_nan     [(0.0780000037048012, 1.0, 0, 0

In [11]:
#Analyse the occurence of QC value, depth index pairs
result_df = pd.DataFrame({c: pd.Series(dtype=t) for c, t in {
            'file':str,
            'variable':str,
            'qc_index_count':object,
        }.items()})


for file in tqdm(files):
    #print('\n',file)

    #Load the data from the file
    file_path = os.path.join(data_dir, file)
    ds = xr.open_dataset(file_path)# , engine='scipy')
    common_vars =  all_variables.intersection(list(ds.data_vars))

    for var in common_vars:
        result = count_indexes(ds, var)
        if result is not None:
            result_df.loc[len(result_df.index)] = [
                file,
                var,
                result,
            ]

100%|██████████| 2488/2488 [19:28<00:00,  2.13it/s] 


In [12]:
result_df

Unnamed: 0,file,variable,qc_index_count
0,NO_TS_MO_FINO1.nc,VHM0,"(1.0, 0, 7.0, 0.0) 273742 (4.0, 0, 7.0, 0.0..."
1,GL_TS_MO_46044.nc,VHM0,"(1.0, 2, 7.0, 0.0) 17648 dtype: int64"
2,GL_TS_MO_46044.nc,WSPD,"(1.0, 0, 7.0, -5.0) 17657 dtype: int64"
3,GL_TS_MO_T8N170W.nc,WSPD,"(1.0, 0, 1.0, -4.0) 804684 (4.0, 0, 1.0, -4..."
4,GL_TS_MO_6200059.nc,VHM0,"(1.0, 0, 7.0, 0.0) 185739 (4.0, 0, 7.0, 0.0..."
...,...,...,...
2418,GL_TS_MO_44090.nc,VHM0,"(1.0, 0, 7.0, 0.0) 101056 dtype: int64"
2419,GL_TS_MO_44076.nc,VHM0,"(1.0, 3, 7.0, 0.0) 11337 dtype: int64"
2420,GL_TS_MO_44076.nc,WSPD,"(1.0, 0, 7.0, -4.7) 76094 dtype: int64"
2421,GL_TS_MO_41060.nc,VHM0,"(1.0, 0, 7.0, 0.0) 43620 (4.0, 0, 7.0, 0.0)..."


In [13]:
#Accumulate all the result to a dict with entries for each variable
accum_result = {c:defaultdict(int) for c in all_variables}
for i, row in result_df.iterrows():
    for ii,c in row['qc_index_count'].items():
        accum_result[row['variable']][ii] += c

print('The result for each variable type is reported on the format: (variable qality control value, depth index, depth qality control, depth varable value) : count of occurences')
accum_result

The result for each variable type is reported on the format: (variable qality control value, depth index, depth qality control, depth varable value) : count of occurences


{'VHM0': defaultdict(int,
             {(1.0, 0, 7.0, 0.0): 73607128,
              (4.0, 0, 7.0, 0.0): 1312365,
              (3.0, 0, 7.0, 0.0): 19134,
              (1.0, 2, 7.0, 0.0): 5487043,
              (1.0, 3, 7.0, 0.0): 23453197,
              (4.0, 3, 7.0, 0.0): 30700,
              (1.0, 1, 7.0, 0.0): 6711311,
              (4.0, 1, 7.0, 0.0): 106304,
              (3.0, 3, 7.0, 0.0): 12611,
              (1.0, 0, 0.0, 0.0): 165427,
              (4.0, 0, 0.0, 0.0): 25371,
              (3.0, 0, 0.0, 0.0): 812,
              (4.0, 2, 7.0, 0.0): 12401,
              (2.0, 0, 7.0, 0.0): 721,
              (1.0, 1, 1.0, 0.0): 340040,
              (4.0, 1, 1.0, 0.0): 4658,
              (3.0, 1, 1.0, 0.0): 311,
              (1.0, 0, 1.0, 0.0): 54545,
              (4.0, 1, 4.0, 0.0): 4148,
              (1.0, 1, 4.0, 0.0): 153,
              (3.0, 1, 7.0, 0.0): 180,
              (2.0, 1, 7.0, 0.0): 3873,
              (1.0, 1, 0.0, 0.0): 33440,
              (4.0, 1, 0.0, 0