This notebook is used to export a list of bouys lying close to the shoreline.

This method uses shapelys buffer function and euqlidian distance functions and is only a rough approximation.

A better method would be to use geodesic or great-circle distance.

In [86]:
import cartopy
import shapely
from shapely.geometry import MultiPolygon, Polygon, Point, LineString
import geopy
import itertools
import numpy as np
import pandas as pd
import xarray as xr
import os
import pickle
from pykml.factory import KML_ElementMaker as KML
import lxml
from tqdm import tqdm

In [87]:
land_list = list(cartopy.feature.NaturalEarthFeature('physical', 'land', '50m').geometries())
polygon_list = []

for p  in land_list:
    if type(p) == MultiPolygon:
        polygon_list.extend(p.geoms)

    else:
        polygon_list.append(p)

land_multipolygon = MultiPolygon([p for p in polygon_list]).buffer(0.01)
if not land_multipolygon.is_valid:
    raise ValueError('Invalid MultiPolygon')


In [88]:
no_valid_parameters_type = 'A'
time_exception_type = 'B'
no_time_period_data_type = 'C'
no_valid_position_type = 'D'
shape_no_overlap_type = 'E'
shape_complete_overlap_type = 'F'
unique_coords_type = 'G'
bruteforce_type = 'H'

result_type_description = {
    no_valid_parameters_type:'No valid parameters',
    time_exception_type:'Exception during time filtering',
    no_time_period_data_type:'No data for the time period',
    no_valid_position_type: 'No valid positions',
    shape_no_overlap_type:'Simple shape does not overlap with land',
    shape_complete_overlap_type:'Simple shape is completely contained within the land area',
    unique_coords_type:'Unique coordinates overlap',
    bruteforce_type:'Solved with bruteforce',
}
result_type_description

{'A': 'No valid parameters',
 'B': 'Exception during time filtering',
 'C': 'No data for the time period',
 'D': 'No valid positions',
 'E': 'Simple shape does not overlap with land',
 'F': 'Simple shape is completely contained within the land area',
 'G': 'Unique coordinates overlap',
 'H': 'Solved with bruteforce'}

In [89]:
#Filters the dataframe for datapoint within the land_multipolygon
def shore_line_filter(df, land_multipolygon, long_col_name='LONG', lat_col_name='LAT', force_brutefoce=False):
    #distance considered close to shore (in degrees)
    close_dist = 0.00001

    if not force_brutefoce:
        unique_long = np.unique(df[long_col_name])
        n_unique_long = unique_long.shape[0]
        min_unique_long = unique_long.min()
        max_unique_long = unique_long.max()

        unique_lat = np.unique(df[lat_col_name])
        n_unique_lat = unique_lat.shape[0]
        min_unique_lat = unique_lat.min()
        max_unique_lat = unique_lat.max()
        
        #Try for the simple shape (point, line or rectangele) of the limit(s) of the bouy movement
        coord_points = list(itertools.product(set([min_unique_long, max_unique_long]), set([min_unique_lat, max_unique_lat])))
        if len(coord_points) == 1:
            limit_geo_obj = Point(coord_points)
        elif len(coord_points) == 2:
            limit_geo_obj = LineString(coord_points)
        elif len(coord_points) == 4:
            #Untangle the polygon itersection
            coord_points[2:4]=reversed(coord_points[2:4])
            #Form closed polygon by adding adding fist point as last
            coord_points.append(coord_points[0])
            limit_geo_obj = Polygon(coord_points)
        
        #If the limit_geo_obj does not overlap we we do not have to filter any datapoints
        if not limit_geo_obj.distance(land_multipolygon) <= close_dist:
            #print('No overlap with simple shape')
            return shape_no_overlap_type, df
        #if land_multipolygon contains the limit_geo_limit then all points should be removed
        elif land_multipolygon.contains(limit_geo_obj):
            #print('Complete overlap with simple shape')
            return shape_complete_overlap_type, df[0:0] #returning empty dataframe

        #Partial overlap
        else:
            unique_pos = df[[long_col_name, lat_col_name]].rename(columns={'long_col_name':'LONG', 'lat_col_name':'LAT'}).drop_duplicates()

            #if there are many repeated coordinates (more than 50%) we only inspect the distasnce for the unique coordinates
            #and use a lookup table to filter the whole dataframe
            if unique_pos.shape[0] < df.shape[0]*0.5:
                #print("Dist to unique combinatons only")
                unique_pos['keep'] = unique_pos.apply(lambda row: Point([row['LONG'], row['LAT']]).distance(land_multipolygon) > close_dist, axis=1)
                unique_pos = unique_pos.set_index(['LONG', 'LAT'])

                #Create dataframe filter by looking up the value of unique_pos for each row
                dist_filter = df[[long_col_name, lat_col_name]].apply(
                    lambda row: unique_pos.loc[tuple(row)]['keep'],
                    axis=1)
                
                return unique_coords_type, df[dist_filter]

    #Sove by bruteforce
    #print("Solving with bruteforce")
    dist_filter = df.apply(
        lambda row: Point([row[long_col_name], row[lat_col_name]]).distance(land_multipolygon) > close_dist,
        axis=1)

    return bruteforce_type, df[dist_filter]

In [90]:
#Loads the dataset and checks what proportion of the data is located close to shore
#it returns a tuple on the following format:
#return type, datapoints before filtering, proporiton of data close to shore, min and max longitude, min and max latitude values
def load_data(file_path, land_multipolygon):
    #Load the data from the file
    ds = xr.open_dataset(file_path)# , engine='scipy')
    
    if not any([c in ds.data_vars for c in ['VHM0', 'VAVH', 'WSPD']]):
        return tuple([no_valid_parameters_type] + [None]*9)
    
    #Add longitude, latidude and position_qc as variables indexed by time,depth as all other variables
    TIME = ds['TIME'].values
    DEPTH = ds['DEPTH'].values
    n_DEPTHS = len(DEPTH)

    dataset_columns = {
        'LONG':ds['LONGITUDE'],
        'LAT':ds['LATITUDE'],
        'POS_QC':ds['POSITION_QC'],
    }

    ds_pos = xr.Dataset(
        data_vars=
        {k:(
            ["TIME", 'DEPTH'],
            np.repeat(np.reshape(v.values, (-1,1)), n_DEPTHS, axis=1),
            v.attrs,
        )for (k,v) in dataset_columns.items()},
        coords=dict(
            TIME=TIME,
            DEPTH=DEPTH,
        )
    ).drop_vars('DEPTH')
    ds = xr.merge([ds.drop_dims(['LATITUDE', 'LONGITUDE', 'POSITION']), ds_pos])

    try:
        ds = ds.sel(TIME=slice("2021-01-01", '2021-12-31'))
    except Exception as e:
        print(file_path, e)
        return tuple([time_exception_type] + [None]*9)

    if ds['TIME'].size == 0:
        #print('Data for this timeinterval')
        return tuple([no_time_period_data_type, 0, 0] + [None]*7)
    
    #Qualty control filter
    pos_qc = ds['POS_QC'].values
    QC_good_pos = [1, 2, 5, 7, 8]
    pos_qc_filter = np.isin(pos_qc, QC_good_pos)
    datapoints = pos_qc_filter.sum()

    if datapoints == 0:
        #print('No good positions')
        return tuple([no_valid_position_type, 0, 0] + [None]*7)

    if (~pos_qc_filter).sum() != 0:
        pass
        #print("Some bad quality positions")

    #create dataframe of the coordinates
    #This is only done so we can develop the filter function using dataframes since this is what will be used later
    long = ds['LONG'].values[pos_qc_filter]
    lat = ds['LAT'].values[pos_qc_filter]

    pos_df = pd.concat([
        pd.DataFrame(long, columns=['LONG']),
        pd.DataFrame(lat, columns=['LAT']),
        ], axis=1)

    len_before_shore_filter = pos_df.shape[0]
    #Filter rows close to shoreline
    res_type, pos_df_shore = shore_line_filter(pos_df, land_multipolygon)
    len_after_shore_filter = pos_df_shore.shape[0]
    proportion_close_to_shore = (len_before_shore_filter-len_after_shore_filter) /  len_before_shore_filter

    #To have min and max coordinates for buyś that are filtered due to complete overlap
    if pos_df_shore.shape[0] == 0:
        pos_df_shore = pos_df

    min_long, max_long, mean_long = pos_df_shore['LONG'].min(), pos_df_shore['LONG'].max(), pos_df_shore['LONG'].mean()
    min_lat, max_lat, mean_lat = pos_df_shore['LAT'].min(), pos_df_shore['LAT'].max(), pos_df_shore['LAT'].mean()
    return_limits = (min_long, max_long, mean_long, min_lat, max_lat, mean_lat)

    return res_type, len_before_shore_filter, len_after_shore_filter, proportion_close_to_shore, *return_limits
    

In [91]:
data_dir = '/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO'

result_df = pd.DataFrame({c: pd.Series(dtype=t) for c, t in {
    'file_name':str,
    'result_type':str,
    'data_before_landfilter':int,
    'data_after_landfilter':int,
    'proportion_close_to_shore':float,
    'min_long':float,
    'max_long':float,
    'mean_long':float,
    'min_lat':float,
    'max_lat':float,
    'mean_lat':float,
}.items()})

files = set(os.listdir(data_dir))
#files=['BO_TS_MO_HuvudskarOstWR.nc']


file_filter = [
    'GL_TS_MO_41121.nc', #Flips longitude sign in the middle of the data, from 66 to -66???! resutlts in asf search with over 7000 matches.
]
files = set(files).difference(file_filter)

for file in tqdm(list(files)):
    #print(file)
    file_path = os.path.join(data_dir, file)
    result = load_data(file_path, land_multipolygon)
    #pos_df_bf,pos_df = load_data(file_path, land_multipolygon)#TODO REMOVE
    #Append results to dataframe
    result_df = pd.concat([
        result_df,
        pd.DataFrame([[file,*result]], columns=result_df.columns)
    ])

  indexer = index.slice_indexer(
 11%|█         | 277/2487 [00:19<02:07, 17.27it/s]

/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO/MO_TS_MO_ESTELLENCS.nc "cannot represent labeled-based slice indexer for coordinate 'TIME' with a slice over integer positions; the index is unsorted or non-unique"


  indexer = index.slice_indexer(
 18%|█▊        | 457/2487 [00:29<01:38, 20.60it/s]

/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO/MO_TS_MO_VIDA.nc "cannot represent labeled-based slice indexer for coordinate 'TIME' with a slice over integer positions; the index is unsorted or non-unique"


  indexer = index.slice_indexer(
 23%|██▎       | 582/2487 [00:40<02:12, 14.37it/s]

/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO/BS_TS_MO_VarnaBuoySURF.nc "cannot represent labeled-based slice indexer for coordinate 'TIME' with a slice over integer positions; the index is unsorted or non-unique"


  indexer = index.slice_indexer(
 24%|██▍       | 598/2487 [00:40<01:46, 17.77it/s]

/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO/MO_TS_MO_NADR-S1.nc "cannot represent labeled-based slice indexer for coordinate 'TIME' with a slice over integer positions; the index is unsorted or non-unique"


  indexer = index.slice_indexer(
 63%|██████▎   | 1570/2487 [01:55<00:53, 17.06it/s]

/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO/BS_TS_MO_BurgasBuoySURF.nc "cannot represent labeled-based slice indexer for coordinate 'TIME' with a slice over integer positions; the index is unsorted or non-unique"


  indexer = index.slice_indexer(
 67%|██████▋   | 1678/2487 [02:00<00:27, 29.01it/s]

/data/exjobb/sarssw/bouy/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/MO/MO_TS_MO_Molo-Bandiera.nc "cannot represent labeled-based slice indexer for coordinate 'TIME' with a slice over integer positions; the index is unsorted or non-unique"


100%|██████████| 2487/2487 [03:02<00:00, 13.63it/s]


In [92]:
result_df

Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,GL_TS_MO_56014.nc,E,594,594,0.0,115.57,115.57,115.57,-32.450001,-32.450001,-32.450001
0,GL_TS_MO_45172.nc,C,0,0,,,,,,,
0,GL_TS_MO_3301541.nc,A,,,,,,,,,
0,GL_TS_MO_MEDS073.nc,C,0,0,,,,,,,
0,GL_TS_MO_GoodrichBank01.nc,E,8046,8046,0.0,129.999924,130.001877,130.001068,-10.41755,-10.41552,-10.416465
...,...,...,...,...,...,...,...,...,...,...,...
0,GL_TS_MO_45144.nc,C,0,0,,,,,,,
0,GL_TS_MO_55481.nc,E,1245,1245,0.0,154.449997,154.449997,154.449997,-27.65,-27.65,-27.649998
0,GL_TS_MO_46086.nc,E,234985,234985,0.0,-118.052002,-118.052002,-118.052002,32.499001,32.499001,32.499008
0,GL_TS_MO_44174.nc,C,0,0,,,,,,,


In [93]:
for t, description in result_type_description.items():
    print(t, ':', description)
    display(result_df[result_df['result_type'] == t])

A : No valid parameters


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,GL_TS_MO_3301541.nc,A,,,,,,,,,
0,GL_WS_MO_45137.nc,A,,,,,,,,,
0,GL_TS_MO_IF000619.nc,A,,,,,,,,,
0,GL_TS_MO_7100061.nc,A,,,,,,,,,
0,AR_TS_MO_AmundsenGulf-AT2.nc,A,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
0,GL_WS_MO_46146.nc,A,,,,,,,,,
0,GL_WS_MO_46023.nc,A,,,,,,,,,
0,GL_TS_MO_53960.nc,A,,,,,,,,,
0,AR_TS_MO_BSO2B.nc,A,,,,,,,,,


B : Exception during time filtering


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,MO_TS_MO_ESTELLENCS.nc,B,,,,,,,,,
0,MO_TS_MO_VIDA.nc,B,,,,,,,,,
0,BS_TS_MO_VarnaBuoySURF.nc,B,,,,,,,,,
0,MO_TS_MO_NADR-S1.nc,B,,,,,,,,,
0,BS_TS_MO_BurgasBuoySURF.nc,B,,,,,,,,,
0,MO_TS_MO_Molo-Bandiera.nc,B,,,,,,,,,


C : No data for the time period


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,GL_TS_MO_45172.nc,C,0,0,,,,,,,
0,GL_TS_MO_MEDS073.nc,C,0,0,,,,,,,
0,GL_TS_MO_41026.nc,C,0,0,,,,,,,
0,GL_TS_MO_45018.nc,C,0,0,,,,,,,
0,GL_TS_MO_MEDS091.nc,C,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
0,GL_TS_MO_14042.nc,C,0,0,,,,,,,
0,GL_TS_MO_51005.nc,C,0,0,,,,,,,
0,GL_TS_MO_45140.nc,C,0,0,,,,,,,
0,GL_TS_MO_45144.nc,C,0,0,,,,,,,


D : No valid positions


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat


E : Simple shape does not overlap with land


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,GL_TS_MO_56014.nc,E,594,594,0.0,115.57,115.57,115.57,-32.450001,-32.450001,-32.450001
0,GL_TS_MO_GoodrichBank01.nc,E,8046,8046,0.0,129.999924,130.001877,130.001068,-10.41755,-10.41552,-10.416465
0,GL_TS_MO_46001.nc,E,34345,34345,0.0,-147.949005,-147.919998,-147.948975,56.231998,56.304001,56.232044
0,NO_TS_MO_AWG.nc,E,49754,49754,0.0,5.95,5.95,5.95,53.5,53.5,53.5
0,BO_TS_MO_Koster.nc,E,355600,355600,0.0,11.0964,11.1005,11.097997,58.882099,58.8839,58.883144
...,...,...,...,...,...,...,...,...,...,...,...
0,AR_TS_MO_Kogurdufl.nc,E,8743,8743,0.0,-13.6265,-13.6265,-13.626499,65.64817,65.64817,65.648178
0,GL_TS_MO_51211.nc,E,30974,30974,0.0,-157.959,-157.959,-157.959,21.297001,21.297001,21.296999
0,GL_TS_MO_2300492.nc,E,17208,17208,0.0,72.193001,72.209999,72.209816,10.867,10.882,10.873935
0,GL_TS_MO_55481.nc,E,1245,1245,0.0,154.449997,154.449997,154.449997,-27.65,-27.65,-27.649998


F : Simple shape is completely contained within the land area


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,GL_TS_MO_45195.nc,F,15164,0,1.0,-73.338997,-73.338997,-73.338997,44.487999,44.487999,44.487999
0,IR_TS_MO_Tenerife-coast-buoy.nc,F,17346,0,1.0,-16.24,-16.23,-16.23978,28.459999,28.459999,28.459993
0,NO_TS_MO_SchiermonnikoogWaggen.nc,F,14822,0,1.0,6.203462,6.203462,6.203461,53.4701,53.4701,53.470097
0,GL_TS_MO_45141.nc,F,3299,0,1.0,-115.315002,-115.315002,-115.31498,61.18,61.18,61.179996
0,GL_TS_MO_45029.nc,F,73206,0,1.0,-86.272003,-86.272003,-86.27198,42.900002,42.900002,42.900002
...,...,...,...,...,...,...,...,...,...,...,...
0,GL_TS_MO_45028.nc,F,72740,0,1.0,-91.829002,-91.829002,-91.828987,46.813999,46.813999,46.814003
0,GL_TS_MO_45187.nc,F,106295,0,1.0,-87.778999,-87.778999,-87.778999,42.491001,42.491001,42.491001
0,NO_TS_MO_NieuwpoortWind.nc,F,51223,0,1.0,2.716667,2.716667,2.716667,51.155556,51.155556,51.15556
0,GL_TS_MO_ljpc1.nc,F,17080,0,1.0,-117.257004,-117.257004,-117.256989,32.867001,32.867001,32.867001


G : Unique coordinates overlap


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat
0,GL_TS_MO_23015.nc,G,4176,4176,0.0,-149.177994,67.046997,66.182121,-36.778,0.475,0.174621
0,GL_TS_MO_55038.nc,G,30160,22598,0.250729,151.5,151.50238,151.501221,-23.9,-23.895611,-23.897774
0,GL_TS_MO_55037.nc,G,26340,26340,0.0,153.580002,153.949997,153.758026,-28.280001,-28.18,-28.228113
0,GL_TS_MO_55017.nc,G,5961,5961,0.0,153.470001,153.699997,153.679733,-29.120001,-28.870001,-28.870674
0,GL_TS_MO_55045.nc,G,27812,6188,0.777506,153.25,153.25,153.25,-27.4,-27.4,-27.399996
0,GL_TS_MO_55018.nc,G,5720,5720,0.0,153.229996,153.5,153.278259,-30.370001,-29.82,-30.367571
0,GL_TS_MO_55022.nc,G,3051,3051,0.0,151.020004,151.399994,151.020264,-34.48,-33.779999,-34.471169
0,GL_TS_MO_55046.nc,G,19682,19682,0.0,152.5,152.820007,152.640457,-25.1,-24.67,-24.858723
0,IR_TS_MO_1300130.nc,G,24780,24771,0.000363,-15.8087,-15.7959,-15.804513,28.187,28.2017,28.192307
0,IR_TS_MO_Melilla-coast-buoy.nc,G,16848,15156,0.100427,-2.944,-2.94,-2.940188,35.32,35.327,35.320335


H : Solved with bruteforce


Unnamed: 0,file_name,result_type,data_before_landfilter,data_after_landfilter,proportion_close_to_shore,min_long,max_long,mean_long,min_lat,max_lat,mean_lat


In [94]:
write_folder = './shoreline_filter'
result_df_fn = 'result_df'
#Conditionally creates the folder for the result
os.makedirs(write_folder, exist_ok=True)

#Save datadir and resulting dataframe
with open(os.path.join(write_folder, result_df_fn),'wb') as f_w:
    pickle.dump((result_df, result_type_description), f_w)

In [97]:
kml_pinmap_fn = 'kml_pinmap'

#Create 2 kml pinmaps
#1: for bouys with valid data in valid positions
#2: bouys with only data from invalid positions

df_files_keep = result_df[result_df['data_after_landfilter'] > 0]
df_files_remove = result_df[result_df['proportion_close_to_shore'] == 1]


for df_files, name in [(df_files_keep, 'kept'),(df_files_remove, 'removed')]:
    #Create kml map
    KML_fldr = KML.Folder(
        KML.name('All bouys ' + name),
        KML.description(str(df_files.shape[0]) + " files\n" + 
                        "For files with data for 2021\nFiltered by distance to shore"),
    )

    for index,row in df_files.iterrows():
        #Create the pin
        pin = KML.Placemark(
            KML.name(row['file_name']),
            KML.description(
                '\n data before landfilter ' + str(row['data_before_landfilter']) +
                "\n data before after landfilter" + str(row['data_after_landfilter']) +
                "\n proportion close to shore " + str(row['proportion_close_to_shore'])),
            KML.Point(
                KML.coordinates(str(row['mean_long']) + "," + str(row['mean_lat']))
            )
        )
        KML_fldr.append(pin)
        
    with open(os.path.join(write_folder, kml_pinmap_fn + '_' + name + '.kml'), 'w') as f_w:
        f_w.write(lxml.etree.tostring(KML_fldr, pretty_print=True).decode())
