This file plots all the datapoints for a specific folder of data on a KML pinmap only including files with relevant data
that is, filtered in time and variable names

In [88]:
import numpy as np
import pandas as pd
import xarray as xr
import os
import datetime
import pickle
from tqdm import tqdm
import lxml
from pykml.factory import KML_ElementMaker as KML
from functools import reduce
import operator

In [89]:
#Extracts the data from the dataset ds that overlaps with the time filter and contains valid data
def valid_data_extraction(ds, time_filter, colum_names, file_name):
    #Make sure colum_names is a list
    if type(colum_names) is str:
        colum_names = [colum_names]

    #None of the variables found, retrun immediately
    if not any([c in ds.data_vars for c in colum_names]):
        return None

    #Add longitude, latidude and position_qc as variables indexed by time,depth as all other variables
    TIME = ds['TIME'].values
    DEPTH = ds['DEPTH'].values
    n_DEPTHS = len(DEPTH)

    dataset_columns = {
        'LONG':ds['LONGITUDE'],
        'LAT':ds['LATITUDE'],
        'POS_QC':ds['POSITION_QC'],
    }

    ds_pos = xr.Dataset(
        data_vars=
        {k:(
            ["TIME", 'DEPTH'],
            np.repeat(np.reshape(v.values, (-1,1)), n_DEPTHS, axis=1),
            v.attrs,
        )for (k,v) in dataset_columns.items()},
        coords=dict(
            TIME=TIME,
            DEPTH=DEPTH,
        )
    ).drop_vars('DEPTH')
    ds = xr.merge([ds.drop_dims(['LATITUDE', 'LONGITUDE', 'POSITION']), ds_pos])

    try:
        #Filter for time of interest and depth 0
        if type(time_filter) is tuple:
            ds = ds.sel(TIME=slice(time_filter[0], time_filter[1]), DEPTH=0)
        else:
            ds = ds.sel(TIME=time_filter, DEPTH=0)
    except Exception as e:
        print('Failed to timeslice for ' + file_name)
    
    #Filter only avalible columns
    colum_names = list(set(colum_names).intersection(list(ds.data_vars)))
    colum_names_qc = [c + '_QC' for c in colum_names]
    
    #Add fixed columns
    colum_names.extend(['LONG', 'LAT'])
    colum_names_qc.extend(['TIME_QC', 'POS_QC'])

    #Filter for columns of interest
    ds = ds[colum_names + colum_names_qc]
    
    df = ds.to_dataframe()
    QC_good = [1.0, 7.0]
    
    #Filter for good data only
    filter_qc = [df[c_qc].isin(QC_good) for c_qc in colum_names_qc]
    #Element-wise AND the qc filters
    filter_qc = reduce(operator.and_, filter_qc)
    df = df[filter_qc][colum_names]
    
    #Never return empty result, return none instead
    if df.empty:
        return None
        
    return df

In [90]:
data_dir = '/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO'
start_date = '2021-01-01'
end_date = '2021-12-31'
variables = ['VHM0', 'VAVH', 'WSPD']

write_folder = './all_data_KML'
save_var_fn = 'result' #to save data_dir, df_files tuple
kml_pinmap_fn = 'kml_pinmap'

files = os.listdir(data_dir)

df_files = pd.DataFrame({
    'file':pd.Series(dtype=object),
    'LONG':pd.Series(dtype=float),
    'LAT':pd.Series(dtype=float),
    'VARS':object,
    'LEN':int,
}).rename_axis('file')

for file in tqdm(files):
    #print(file)
    #Open dataset
    ds = xr.open_dataset(os.path.join(data_dir, file))#, engine='scipy')

    #Filter in time and for quality data
    df_data = valid_data_extraction(ds, (start_date, end_date), variables, file)

    if df_data is not None:
        #if it moves too much print warning and skip
        min_long, max_long = df_data['LONG'].min(), df_data['LONG'].max()
        long_diff = max_long - min_long
        min_lat, max_lat = df_data['LAT'].min(), df_data['LAT'].max()
        lat_diff = max_lat - min_lat

        if max(long_diff, lat_diff) > 0.01: #TODO set better limit?
            print(file, ' Is moving too much')
            continue

        #Take only average location since some bouys are non-stationary
        ave_long = df_data['LONG'].mean()
        ave_lat = df_data['LAT'].mean()
        data_vars = set(df_data.columns).intersection(variables)
        data_len = df_data.shape[0]

        df_files.loc[len(df_files.index)] = [file, ave_long, ave_lat,data_vars,data_len]

  0%|          | 11/2488 [00:01<03:35, 11.50it/s]

GL_TS_MO_23014.nc  Is moving too much


  1%|          | 14/2488 [00:01<02:51, 14.41it/s]

GL_TS_MO_13010.nc  Is moving too much


  1%|          | 21/2488 [00:02<03:40, 11.17it/s]

IR_TS_MO_Tenerife-coast-buoy.nc  Is moving too much


  2%|▏         | 49/2488 [00:03<01:48, 22.40it/s]

GL_TS_MO_23009.nc  Is moving too much


  indexer = index.slice_indexer(


Failed to timeslice for MO_TS_MO_Molo-Bandiera.nc


  4%|▍         | 108/2488 [00:07<02:24, 16.52it/s]

IR_TS_MO_Malaga-coast-buoy.nc  Is moving too much


  6%|▌         | 142/2488 [00:11<05:08,  7.62it/s]

GL_TS_MO_55018.nc  Is moving too much


 14%|█▍        | 355/2488 [00:28<02:36, 13.66it/s]

GL_TS_MO_55036.nc  Is moving too much


 16%|█▋        | 407/2488 [00:34<02:54, 11.90it/s]

GL_TS_MO_1500008.nc  Is moving too much


 21%|██        | 512/2488 [00:41<02:22, 13.89it/s]

GL_TS_MO_4100300.nc  Is moving too much


 21%|██        | 522/2488 [00:42<03:26,  9.53it/s]

GL_TS_MO_55020.nc  Is moving too much


  indexer = index.slice_indexer(
 22%|██▏       | 543/2488 [00:43<02:22, 13.60it/s]

Failed to timeslice for MO_TS_MO_NADR-S1.nc
MO_TS_MO_NADR-S1.nc  Is moving too much


 23%|██▎       | 566/2488 [00:45<01:50, 17.44it/s]

GL_TS_MO_55032.nc  Is moving too much


 23%|██▎       | 581/2488 [00:46<02:05, 15.20it/s]

GL_TS_MO_55029.nc  Is moving too much


 23%|██▎       | 583/2488 [00:46<04:00,  7.92it/s]

BO_TS_MO_Norrbyn.nc  Is moving too much


 24%|██▍       | 599/2488 [00:48<02:51, 11.00it/s]

GL_TS_MO_55022.nc  Is moving too much


 29%|██▉       | 719/2488 [00:57<02:38, 11.15it/s]

GL_TS_MO_55033.nc  Is moving too much


 32%|███▏      | 791/2488 [01:02<02:27, 11.49it/s]

GL_TS_MO_55046.nc  Is moving too much


  indexer = index.slice_indexer(


Failed to timeslice for MO_TS_MO_VIDA.nc


 34%|███▍      | 858/2488 [01:08<01:32, 17.66it/s]

GL_TS_MO_55017.nc  Is moving too much


 35%|███▌      | 882/2488 [01:10<01:05, 24.41it/s]

GL_TS_MO_41121.nc  Is moving too much


 36%|███▌      | 894/2488 [01:10<01:15, 21.06it/s]

GL_TS_MO_31004.nc  Is moving too much


 36%|███▌      | 900/2488 [01:11<01:47, 14.73it/s]

GL_TS_MO_55027.nc  Is moving too much


 36%|███▋      | 904/2488 [01:11<01:50, 14.33it/s]

GL_TS_MO_55045.nc  Is moving too much


 38%|███▊      | 934/2488 [01:14<02:51,  9.05it/s]

GL_TS_MO_1500009.nc  Is moving too much


 39%|███▉      | 965/2488 [01:16<01:24, 18.03it/s]

GL_TS_MO_2300453.nc  Is moving too much


 39%|███▉      | 969/2488 [01:17<02:11, 11.59it/s]

GL_TS_MO_55028.nc  Is moving too much


 39%|███▉      | 982/2488 [01:18<02:38,  9.52it/s]

GL_TS_MO_55037.nc  Is moving too much


 45%|████▍     | 1110/2488 [01:26<01:36, 14.34it/s]

GL_TS_MO_55030.nc  Is moving too much


 50%|████▉     | 1243/2488 [01:36<01:54, 10.90it/s]

GL_TS_MO_23016.nc  Is moving too much


 52%|█████▏    | 1306/2488 [01:41<01:03, 18.50it/s]

GL_TS_MO_53040.nc  Is moving too much


 53%|█████▎    | 1325/2488 [01:42<01:05, 17.89it/s]

GL_TS_MO_14041.nc  Is moving too much
GL_TS_MO_2300452.nc  Is moving too much


 56%|█████▌    | 1382/2488 [01:46<00:59, 18.62it/s]

GL_TS_MO_2300091.nc  Is moving too much


 59%|█████▉    | 1477/2488 [01:55<01:17, 13.11it/s]

GL_TS_MO_23013.nc  Is moving too much


 62%|██████▏   | 1540/2488 [02:00<01:02, 15.14it/s]

GL_TS_MO_55038.nc  Is moving too much


  indexer = index.slice_indexer(
 63%|██████▎   | 1560/2488 [02:03<01:45,  8.78it/s]

Failed to timeslice for MO_TS_MO_ESTELLENCS.nc


 65%|██████▍   | 1605/2488 [02:05<00:42, 20.98it/s]

GL_TS_MO_2300454.nc  Is moving too much


 66%|██████▋   | 1652/2488 [02:09<00:53, 15.50it/s]

GL_TS_MO_56005.nc  Is moving too much


 69%|██████▉   | 1718/2488 [02:14<00:46, 16.73it/s]

GL_TS_MO_31003.nc  Is moving too much


 72%|███████▏  | 1793/2488 [02:19<00:47, 14.71it/s]

GL_TS_MO_4800400.nc  Is moving too much


 81%|████████  | 2010/2488 [02:35<00:28, 16.58it/s]

GL_TS_MO_52121.nc  Is moving too much


 85%|████████▍ | 2108/2488 [02:41<00:18, 20.24it/s]

GL_TS_MO_2300451.nc  Is moving too much


 88%|████████▊ | 2193/2488 [02:47<00:17, 16.41it/s]

GL_TS_MO_31005.nc  Is moving too much


 91%|█████████▏| 2272/2488 [02:53<00:13, 15.93it/s]

GL_TS_MO_52087.nc  Is moving too much


 92%|█████████▏| 2297/2488 [02:54<00:10, 18.95it/s]

GL_TS_MO_15002.nc  Is moving too much
Failed to timeslice for BS_TS_MO_VarnaBuoySURF.nc


  indexer = index.slice_indexer(
 95%|█████████▍| 2353/2488 [02:57<00:08, 16.70it/s]

GL_TS_MO_2300497.nc  Is moving too much


 95%|█████████▍| 2355/2488 [02:58<00:09, 14.77it/s]

GL_TS_MO_55024.nc  Is moving too much


  indexer = index.slice_indexer(


Failed to timeslice for BS_TS_MO_BurgasBuoySURF.nc


100%|██████████| 2488/2488 [03:08<00:00, 13.17it/s]


In [91]:
#df_data

In [94]:
df_files

Unnamed: 0_level_0,file,LONG,LAT,VARS,LEN
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,NO_TS_MO_FINO1.nc,6.583334,54.000000,{VHM0},17468
1,GL_TS_MO_T8N170W.nc,-170.000000,8.000000,{WSPD},20472
2,GL_TS_MO_41110.nc,-77.714981,34.141998,{VHM0},15453
3,GL_TS_MO_6200080.nc,-1.833600,45.916302,"{VAVH, VHM0}",11775
4,GL_TS_MO_46218.nc,-120.780014,34.451996,{VHM0},15593
...,...,...,...,...,...
398,GL_TS_MO_15006.nc,-10.000000,-6.000000,{WSPD},8655
399,NO_TS_MO_AWG.nc,5.950000,53.500000,"{VAVH, VHM0}",49626
400,GL_TS_MO_T0N180W.nc,180.000000,0.000000,{WSPD},20436
401,GL_TS_MO_T5S110W.nc,-110.000000,-5.000000,{WSPD},51018


In [97]:
#Conditionally creates the folder for the result
os.makedirs(write_folder, exist_ok=True)

#Save datadir and resulting dataframe
with open(os.path.join(write_folder, save_var_fn),'wb') as f_w:
    pickle.dump((data_dir, df_files), f_w)

#Create kml map
KML_fldr = KML.Folder(
    KML.name("All files"),
    KML.description(
        "Timeinterval: " + start_date + " to " + end_date + '\n' + str(variables)
    )
)

#iterate over all unique variable name, file name pairs
for index,row in df_files.iterrows():
    #extract longitude and latitude
    long = row['LONG']
    lat = row['LAT']
    
    #Create the pin
    pin = KML.Placemark(
        KML.name(row['LEN']),
        KML.description(row['file'] + '\n' + str(row['LEN']) + " datapoints"),
        KML.Point(
            KML.coordinates(str(long) + "," + str(lat))
        )
    )
    KML_fldr.append(pin)
    
with open(os.path.join(write_folder, kml_pinmap_fn + '.kml'), 'w') as f_w:
    f_w.write(lxml.etree.tostring(KML_fldr, pretty_print=True).decode())