This file plots all the datapoints for a specific folder of data on a KML pinmap only including files with relevant data
that is, filtered in time and variable names

In [11]:
import numpy as np
import pandas as pd
import xarray as xr
import os
import datetime
import pickle
from tqdm import tqdm
import lxml
from pykml.factory import KML_ElementMaker as KML
from functools import reduce
import operator

In [12]:
#Extracts the data from the dataset ds that overlaps with the time filter and contains valid data
def valid_data_extraction(ds, time_filter, colum_names, file_name):
    #Make sure colum_names is a list
    if type(colum_names) is str:
        colum_names = [colum_names]

    #None of the variables found, retrun immediately
    if not any([c in ds.data_vars for c in colum_names]):
        return None

    #Add longitude, latidude and position_qc as variables indexed by time,depth as all other variables
    TIME = ds['TIME'].values
    DEPTH = ds['DEPTH'].values
    n_DEPTHS = len(DEPTH)

    dataset_columns = {
        'LONG':ds['LONGITUDE'],
        'LAT':ds['LATITUDE'],
        'POS_QC':ds['POSITION_QC'],
    }

    ds_pos = xr.Dataset(
        data_vars=
        {k:(
            ["TIME", 'DEPTH'],
            np.repeat(np.reshape(v.values, (-1,1)), n_DEPTHS, axis=1),
            v.attrs,
        )for (k,v) in dataset_columns.items()},
        coords=dict(
            TIME=TIME,
            DEPTH=DEPTH,
        )
    ).drop_vars('DEPTH')
    ds = xr.merge([ds.drop_dims(['LATITUDE', 'LONGITUDE', 'POSITION']), ds_pos])

    try:
        #Filter for time of interest and depth 0
        if type(time_filter) is tuple:
            ds = ds.sel(TIME=slice(time_filter[0], time_filter[1]), DEPTH=0)
        else:
            ds = ds.sel(TIME=time_filter, DEPTH=0)
    except Exception as e:
        print('Failed to timeslice for ' + file_name)
    
    #Filter only avalible columns
    colum_names = list(set(colum_names).intersection(list(ds.data_vars)))
    colum_names_qc = [c + '_QC' for c in colum_names]
    
    #Add fixed columns
    colum_names.extend(['LONG', 'LAT'])
    colum_names_qc.extend(['TIME_QC', 'POS_QC'])

    #Filter for columns of interest
    ds = ds[colum_names + colum_names_qc]
    
    df = ds.to_dataframe()
    QC_good = [1.0, 7.0]
    
    #Filter for good data only
    filter_qc = [df[c_qc].isin(QC_good) for c_qc in colum_names_qc]
    #Element-wise AND the qc filters
    filter_qc = reduce(operator.and_, filter_qc)
    df = df[filter_qc][colum_names]
    
    #Never return empty result, return none instead
    if df.empty:
        return None
        
    return df

In [13]:
data_dir = '/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO'
start_date = '2021-01-01'
end_date = '2021-12-31'
variables = ['VHM0', 'VAVH', 'WSPD']

write_folder = './all_data_KML'
save_var_fn = 'result' #to save data_dir, df_files tuple
kml_pinmap_fn = 'kml_pinmap'

files = os.listdir(data_dir)

df_files = pd.DataFrame({
    'file':pd.Series(dtype=object),
    'LONG':pd.Series(dtype=float),
    'LAT':pd.Series(dtype=float),
    'VARS':object,
    'LEN':int,
    'MOVING':bool,
}).rename_axis('file')

for file in tqdm(files):
    #print(file)
    #Open dataset
    ds = xr.open_dataset(os.path.join(data_dir, file))#, engine='scipy')

    #Filter in time and for quality data
    df_data = valid_data_extraction(ds, (start_date, end_date), variables, file)

    if df_data is not None:
        #Take only average location since some bouys are non-stationary
        ave_long = df_data['LONG'].mean()
        ave_lat = df_data['LAT'].mean()
        data_vars = set(df_data.columns).intersection(variables)
        data_len = df_data.shape[0]

        #if it moves too much print warning and skip
        min_long, max_long = df_data['LONG'].min(), df_data['LONG'].max()
        long_diff = max_long - min_long
        min_lat, max_lat = df_data['LAT'].min(), df_data['LAT'].max()
        lat_diff = max_lat - min_lat

        if max(long_diff, lat_diff) > 0.01: #TODO set better limit?
            print(file, ' Is moving too much')
            df_files.loc[len(df_files.index)] = [file, ave_long, ave_lat, data_vars, data_len, True]
            continue

        df_files.loc[len(df_files.index)] = [file, ave_long, ave_lat, data_vars, data_len, False]

  0%|          | 11/2488 [00:01<04:22,  9.42it/s]

GL_TS_MO_23014.nc  Is moving too much


  1%|          | 15/2488 [00:01<03:19, 12.37it/s]

GL_TS_MO_13010.nc  Is moving too much


  1%|          | 21/2488 [00:02<04:38,  8.85it/s]

IR_TS_MO_Tenerife-coast-buoy.nc  Is moving too much


  2%|▏         | 46/2488 [00:03<02:38, 15.38it/s]

GL_TS_MO_23009.nc  Is moving too much


  indexer = index.slice_indexer(


Failed to timeslice for MO_TS_MO_Molo-Bandiera.nc


  4%|▍         | 108/2488 [00:07<02:09, 18.37it/s]

IR_TS_MO_Malaga-coast-buoy.nc  Is moving too much


  6%|▌         | 142/2488 [00:11<04:54,  7.97it/s]

GL_TS_MO_55018.nc  Is moving too much


 14%|█▍        | 355/2488 [00:29<02:40, 13.29it/s]

GL_TS_MO_55036.nc  Is moving too much


 16%|█▋        | 407/2488 [00:34<03:11, 10.89it/s]

GL_TS_MO_1500008.nc  Is moving too much


 21%|██        | 511/2488 [00:41<02:09, 15.25it/s]

GL_TS_MO_4100300.nc  Is moving too much


 21%|██        | 522/2488 [00:42<02:58, 11.02it/s]

GL_TS_MO_55020.nc  Is moving too much


  indexer = index.slice_indexer(
 22%|██▏       | 543/2488 [00:44<02:52, 11.29it/s]

Failed to timeslice for MO_TS_MO_NADR-S1.nc
MO_TS_MO_NADR-S1.nc  Is moving too much


 23%|██▎       | 565/2488 [00:45<02:26, 13.10it/s]

GL_TS_MO_55032.nc  Is moving too much


 23%|██▎       | 580/2488 [00:46<01:40, 19.06it/s]

GL_TS_MO_55029.nc  Is moving too much


 23%|██▎       | 583/2488 [00:47<03:33,  8.93it/s]

BO_TS_MO_Norrbyn.nc  Is moving too much


 24%|██▍       | 599/2488 [00:48<02:52, 10.95it/s]

GL_TS_MO_55022.nc  Is moving too much


 29%|██▉       | 720/2488 [00:57<03:01,  9.73it/s]

GL_TS_MO_55033.nc  Is moving too much


 32%|███▏      | 788/2488 [01:02<02:15, 12.56it/s]

GL_TS_MO_55046.nc  Is moving too much


  indexer = index.slice_indexer(


Failed to timeslice for MO_TS_MO_VIDA.nc


 35%|███▍      | 859/2488 [01:08<01:20, 20.21it/s]

GL_TS_MO_55017.nc  Is moving too much


 35%|███▌      | 883/2488 [01:09<01:05, 24.59it/s]

GL_TS_MO_41121.nc  Is moving too much


 36%|███▌      | 894/2488 [01:09<01:06, 24.05it/s]

GL_TS_MO_31004.nc  Is moving too much


 36%|███▌      | 900/2488 [01:10<01:27, 18.17it/s]

GL_TS_MO_55027.nc  Is moving too much
GL_TS_MO_55045.nc  Is moving too much


 38%|███▊      | 935/2488 [01:13<02:31, 10.25it/s]

GL_TS_MO_1500009.nc  Is moving too much


 39%|███▊      | 964/2488 [01:15<01:27, 17.40it/s]

GL_TS_MO_2300453.nc  Is moving too much


 39%|███▉      | 970/2488 [01:16<02:33,  9.91it/s]

GL_TS_MO_55028.nc  Is moving too much


 40%|███▉      | 985/2488 [01:17<02:10, 11.54it/s]

GL_TS_MO_55037.nc  Is moving too much


 45%|████▍     | 1110/2488 [01:26<01:27, 15.76it/s]

GL_TS_MO_55030.nc  Is moving too much


 50%|████▉     | 1243/2488 [01:35<01:44, 11.88it/s]

GL_TS_MO_23016.nc  Is moving too much


 52%|█████▏    | 1306/2488 [01:39<01:09, 17.05it/s]

GL_TS_MO_53040.nc  Is moving too much


 53%|█████▎    | 1324/2488 [01:41<01:17, 14.96it/s]

GL_TS_MO_14041.nc  Is moving too much
GL_TS_MO_2300452.nc  Is moving too much


 55%|█████▌    | 1377/2488 [01:46<01:03, 17.49it/s]

GL_TS_MO_2300091.nc  Is moving too much


 59%|█████▉    | 1478/2488 [01:55<01:21, 12.37it/s]

GL_TS_MO_23013.nc  Is moving too much


 62%|██████▏   | 1540/2488 [02:00<01:03, 14.91it/s]

GL_TS_MO_55038.nc  Is moving too much


  indexer = index.slice_indexer(
 63%|██████▎   | 1559/2488 [02:03<01:54,  8.13it/s]

Failed to timeslice for MO_TS_MO_ESTELLENCS.nc


 65%|██████▍   | 1605/2488 [02:05<00:44, 19.95it/s]

GL_TS_MO_2300454.nc  Is moving too much


 66%|██████▋   | 1651/2488 [02:09<01:00, 13.74it/s]

GL_TS_MO_56005.nc  Is moving too much


 69%|██████▉   | 1715/2488 [02:13<00:49, 15.73it/s]

GL_TS_MO_31003.nc  Is moving too much


 72%|███████▏  | 1794/2488 [02:19<00:44, 15.50it/s]

GL_TS_MO_4800400.nc  Is moving too much


 81%|████████  | 2010/2488 [02:35<00:27, 17.13it/s]

GL_TS_MO_52121.nc  Is moving too much


 85%|████████▍ | 2108/2488 [02:42<00:17, 21.96it/s]

GL_TS_MO_2300451.nc  Is moving too much


 88%|████████▊ | 2193/2488 [02:48<00:20, 14.55it/s]

GL_TS_MO_31005.nc  Is moving too much


 91%|█████████▏| 2271/2488 [02:53<00:17, 12.40it/s]

GL_TS_MO_52087.nc  Is moving too much


 92%|█████████▏| 2296/2488 [02:54<00:08, 23.21it/s]

GL_TS_MO_15002.nc  Is moving too much
Failed to timeslice for BS_TS_MO_VarnaBuoySURF.nc


  indexer = index.slice_indexer(
 95%|█████████▍| 2354/2488 [02:58<00:09, 14.55it/s]

GL_TS_MO_2300497.nc  Is moving too much


 95%|█████████▍| 2356/2488 [02:58<00:09, 13.69it/s]

GL_TS_MO_55024.nc  Is moving too much


  indexer = index.slice_indexer(
 99%|█████████▉| 2460/2488 [03:06<00:01, 18.13it/s]

Failed to timeslice for BS_TS_MO_BurgasBuoySURF.nc


100%|██████████| 2488/2488 [03:09<00:00, 13.13it/s]


In [14]:
#df_data

In [17]:
df_files[df_files['MOVING']]

Unnamed: 0_level_0,file,LONG,LAT,VARS,LEN,MOVING
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,GL_TS_MO_23014.nc,64.956444,1.743718,{WSPD},92,True
5,GL_TS_MO_13010.nc,0.012565,0.013804,{WSPD},46,True
6,IR_TS_MO_Tenerife-coast-buoy.nc,-16.239906,28.459993,"{VAVH, VHM0}",8529,True
10,GL_TS_MO_23009.nc,89.049072,15.038194,{WSPD},150,True
19,IR_TS_MO_Malaga-coast-buoy.nc,-4.440383,36.659534,"{VAVH, VHM0}",8765,True
26,GL_TS_MO_55018.nc,153.278244,-30.367559,{VAVH},5695,True
63,GL_TS_MO_55036.nc,153.510361,-28.111446,{VAVH},12118,True
73,GL_TS_MO_1500008.nc,-10.752615,-19.828106,{WSPD},186,True
86,GL_TS_MO_4100300.nc,-57.478683,15.850313,{WSPD},5473,True
88,GL_TS_MO_55020.nc,150.199234,-37.257584,{VAVH},6574,True


In [16]:
#Conditionally creates the folder for the result
os.makedirs(write_folder, exist_ok=True)

#Save datadir and resulting dataframe
with open(os.path.join(write_folder, save_var_fn),'wb') as f_w:
    pickle.dump((data_dir, df_files), f_w)

#Create kml map
KML_fldr = KML.Folder(
    KML.name("All files"),
    KML.description(
        "Timeinterval: " + start_date + " to " + end_date + '\n' + str(variables)
    )
)

#iterate over all unique variable name, file name pairs
for index,row in df_files.iterrows():
    #extract longitude and latitude
    long = row['LONG']
    lat = row['LAT']
    
    #Create the pin
    pin = KML.Placemark(
        KML.name(row['LEN']),
        KML.description(row['file'] + '\n' + str(row['LEN']) + " datapoints"),
        KML.Point(
            KML.coordinates(str(long) + "," + str(lat))
        )
    )
    KML_fldr.append(pin)
    
with open(os.path.join(write_folder, kml_pinmap_fn + '.kml'), 'w') as f_w:
    f_w.write(lxml.etree.tostring(KML_fldr, pretty_print=True).decode())