### Import packages and set paths

In [None]:
import os

import dask.dataframe as dd
import dask_gateway
import dask.distributed

import dotenv
import warnings
warnings.filterwarnings('ignore')

import geopandas as gpd
import os
from shapely.geometry import Polygon, MultiPolygon
from shapely.ops import transform
import pandas as pd
import pickle
import pyarrow as pa
import pyproj

Set the paths to the AIS data and to the local input data

In [None]:
# Location of the pre-processed AIS data in the Azure blob storage
folder_name = '2022_PoR'
path_name = 'abfs://ais/parquet/' + folder_name

# Root folder of the local project data: everything in the current working directory
# that precedes the notebook sub-folder "01_Data_Analysis/02_AIS_data".
# The sub-folder marker is built with os.sep, so the split works with Windows as well
# as POSIX path separators; `path` stays a plain string because later cells extend it.
current_directory = os.getcwd()
path = current_directory.split(os.sep.join(["", "01_Data_Analysis", "02_AIS_data"]))[0]

### Load the access token (the data is protected with a SAS token)

In [None]:
# Secrets are provided through environment variables defined in a .env file (needs python-dotenv).
# You can copy the .env.example file and rename it to .env (one directory up from the notebooks).
# The two lines starting with % are IPython magics, so this cell only runs inside a notebook/IPython session.
%load_ext dotenv
# Load environment variables from the .env file 1 directory up
# (-v is presumably the verbose flag of the extension - confirm in the python-dotenv documentation)
%dotenv -v

In [None]:
# Read the SAS token for the Azure storage from the key/value pairs of the .env file.
# A missing or empty entry is reported right away, instead of surfacing later as an
# authentication failure when the storage is accessed.
sas_token = dotenv.dotenv_values().get('AZURE_BLOB_SAS_TOKEN')
if not sas_token:
    raise KeyError("AZURE_BLOB_SAS_TOKEN is missing or empty in the .env file")

In [None]:
# Credentials passed on to every read and write of the Azure blob storage
storage_options = dict(account_name="rwsais", sas_token=sas_token)

### Creation of the cluster

In [None]:
#Creates the calculation cluster through the Dask Gateway
gateway = dask_gateway.Gateway()
# The cluster options are used as offered by the gateway; none of them is overridden here
cluster_options = gateway.cluster_options()
cluster = gateway.new_cluster(cluster_options)
# Adaptive scaling with a lower bound of 1 and an upper bound of 100
cluster.adapt(minimum=1, maximum=100)
# Bare expression, so that the notebook displays the cluster
cluster

In [None]:
#Connects a client to the calculation cluster that was created above
client = dask.distributed.Client(cluster)
# Bare expression, so that the notebook displays the client
client

In [None]:
#Installs additional packages on the workers of the calculation cluster
def worker_setup(dask_worker: dask.distributed.Worker):
    """
    Function that installs the movingpandas package on a worker of the calculation cluster

    Parameters
    ----------
    dask_worker: worker on which the callback is run (not used by the function itself)

    :returns: None
    """

    # Imports are local, because the function is shipped to and executed on the workers
    import logging
    import subprocess
    import sys

    # "python -m pip" installs into the interpreter that runs the worker, which is not
    # necessarily the one that belongs to the first "pip" executable on the PATH
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "movingpandas"],
        check=False,
    )
    if result.returncode != 0:
        # The installation stays best effort, but a failure is no longer silent
        logging.getLogger("worker_setup").warning(
            "pip install movingpandas failed with exit code %s", result.returncode
        )

# NOTE(review): newer versions of distributed may favour worker plugins over
# register_worker_callbacks - confirm against the installed version
client.register_worker_callbacks(worker_setup)

### Import geospatial data and create the areas of interest and vessel types

In [None]:
#Creates transformation functions between spatial reference systems
# NOTE(review): EPSG:28992 appears to be the Dutch national grid (RD New) rather than a UTM zone,
# so the name `utm` is a misnomer - confirm before renaming, since later cells use these names
utm = pyproj.CRS('EPSG:28992')
wgs84 = pyproj.CRS('EPSG:4326')
# always_xy=True fixes the coordinate order to (x, y), i.e. (longitude, latitude) for WGS84;
# only the bound .transform methods are kept, which is the callable form passed to shapely.ops.transform
wgs_to_utm = pyproj.Transformer.from_crs(wgs84,utm,always_xy=True).transform
utm_to_wgs = pyproj.Transformer.from_crs(utm,wgs84,always_xy=True).transform

In [None]:
#Creates a dictionary with the geospatial areas of interest (names as keys, shapely geometries as values)
areas_of_interest = {}

# Port entrance: first geometry of the GeoJSON file. The file locations are built with
# os.path.join, so that they do not depend on the path separator of the operating system.
areas_of_interest['port_entrance'] = gpd.read_file(
    os.path.join(path, "00_Input_data", "01_Geospatial_data", "Port_Entrance.geojson")
)['geometry'][0]

# Berths: all geometries of the pickled table merged into one multipolygon, which is
# transformed with utm_to_wgs (EPSG:28992 -> EPSG:4326).
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle, which was previously left open.
with open(os.path.join(path, "00_Input_data", "01_Geospatial_data", "berths_PoR.pickle"), 'rb') as berths_file:
    berths = pickle.load(berths_file)
areas_of_interest['berths'] = transform(utm_to_wgs, MultiPolygon(berths['geometry'].to_list()))

In [None]:
#Creates a dictionary with the vessel type codes as in AIS data
# Keys are vessel type names, values are lists of the AIS ship type codes of that type.
# Every code from 1 up to and including 99 belongs to exactly one vessel type: code 30
# (fishing) used to be listed under 'towing' as well, which made a selection of towing
# vessels include the fishing vessels.
vessel_types = {
    'unknown': list(range(1, 20)),
    'wig_in_ground': list(range(20, 30)),
    'fishing': [30],
    'towing': [31, 32],
    'dredging': [33],
    'diving': [34],
    'military': [35],
    'sailing': [36],
    'pleasure_craft': [37],
    'reserved': [38, 39],
    'high_speed_craft': list(range(40, 50)),
    'pilot': [50],
    'search_and_rescue': [51],
    'tug': [52],
    'port_tender': [53],
    'anti-pollution': [54],
    'law_enforcement': [55],
    'spare': [56, 57],
    'medical': [58],
    'noncombatant': [59],
    'passenger': list(range(60, 70)),
    'cargo': list(range(70, 80)),
    'tanker': list(range(80, 90)),
    'other': list(range(90, 100)),
}

### Selection of vessels of interest based on vessel types and addition of areas of interest booleans to AIS data

In [None]:
#Functions
def to_datetime(df):
    """ 
    Function that transforms the timestamps of AIS data in a pandas dataframe into standard pandas
    datetimes (UTC). The input dataframe is left untouched: a new dataframe is returned, unless the
    column already has a datetime type, in which case the input itself is returned.

    Parameters
    ----------
    df: pandas dataframe containing AIS data with timestamps in a ['timestamplast'] column, either as
        datetimes, as date strings, or as numbers (seconds since the epoch)

    :returns: pandas dataframe containing AIS data
    """

    raw_timestamps = df['timestamplast']
    if str(raw_timestamps.dtype).startswith('datetime'):
        return df

    # First attempt: parse the values as dates; everything that fails becomes NaT
    t = pd.to_datetime(raw_timestamps, errors='coerce', utc=True)
    # NOTE(review): for a column with a numeric dtype this first attempt interprets the
    # numbers as nanoseconds, so the seconds-based fallback below is never reached -
    # confirm that the raw data does not contain such columns

    # Second attempt: values that are not dates but that are numbers are taken as epoch seconds
    numbers = pd.to_numeric(raw_timestamps, errors='coerce')
    idx = t.isna() & numbers.notna()
    t[idx] = pd.to_datetime(
        numbers[idx].astype('double'),
        unit='s',
        errors='coerce',
        utc=True
    )

    # assign creates a new dataframe, so the partition that was passed in is not mutated
    return df.assign(timestamplast=t)

def select_vessel_types(df,vessel_types,selected_types):
    """ 
    Function that keeps the AIS data messages of the vessels that belong to the selected vessel types

    Parameters
    ----------
    df: pandas dataframe containing AIS data with vesseltype codes in a ['vesseltype'] column
    vessel_types: dictionary of vessel types with vessel type names as keys and lists of vessel type
        codes as values
    selected_types: list of vessel type names (each name should be a key of vessel_types)

    :returns: pandas dataframe containing the AIS data of the selected vessel types
    """

    # Flatten the code lists of all selected vessel types into a single list
    codes_of_interest = [
        code
        for type_name in selected_types
        for code in vessel_types[type_name]
    ]

    is_selected = df['vesseltype'].isin(codes_of_interest)
    return df[is_selected]

def create_gdf(df):
    """ 
    Function that creates a geopandas dataframe from a pandas dataframe with ['longitude'] and ['latitude']
    columns with values in WGS84 (EPSG:4326). Each row gets a point geometry at its longitude and latitude.

    Parameters
    ----------
    df: pandas dataframe

    :returns:  geopandas dataframe
    """

    # geopandas is imported as gpd; referring to it as `geopandas` raised a NameError
    points = gpd.points_from_xy(df.longitude, df.latitude)
    gdf = gpd.GeoDataFrame(df,columns=df.columns,crs="EPSG:4326",geometry=points)
    return gdf
    
def vessel_in_areas_of_interest(df,areas_of_interest):
    """ 
    Function that determines for every AIS data message whether its location intersects with each of the
    areas of interest. One boolean column per area of interest is added, and only the columns that are
    used in the further analysis are kept.

    Parameters
    ----------
    df: pandas dataframe containing AIS data with longitudes and latitudes in WGS84 in ['longitude'] and 
        ['latitude'] columns, respectively, and vessel names in a ['name'] column
    areas_of_interest: dictionary with areas of interest names as keys and shapely polygons as values

    :returns: pandas dataframe containing AIS data
    """

    gdf = create_gdf(df)
    for area, shape in areas_of_interest.items():
        # Vectorized test against the single shape, instead of a Python call for every row
        gdf[area] = gdf['geometry'].intersects(shape)

    selected_columns = ['name','vesseltype','hazardouscargo','length','width','draughtMarine','timestamplast','longitude','latitude','sog','heading']
    selected_columns.extend(areas_of_interest)

    # The geometry column is not part of the selection
    df_selected = gdf[selected_columns]

    return df_selected

In [None]:
#Reads the pre-processed AIS data from the Azure storage as a dask dataframe
ddf = dd.read_parquet(path_name, storage_options=storage_options)

In [None]:
#Performs the functions on the AIS data partitions and saves the data as parquet files in the Azure storage
# ddf.partitions[:] selects all partitions; narrow the slice to try the pipeline on a subset first
ddf_i = ddf.partitions[:]
ddf_i = ddf_i.map_partitions(to_datetime)
ddf_i = ddf_i.map_partitions(select_vessel_types,vessel_types,selected_types=['cargo','tanker'])
ddf_i = ddf_i.map_partitions(vessel_in_areas_of_interest,areas_of_interest=areas_of_interest)
ddf_i = ddf_i.repartition(partition_size='10MB')
# The shared storage_options are reused, instead of a second copy of the account credentials.
# NOTE(review): the output folder lies inside the folder that the input is read from -
# confirm that a re-run of the read step does not pick up these output files
ddf_i.to_parquet(
    path_name+'/selected_vessel_types_with_geometry',
    storage_options=storage_options,
    write_index=True,
    write_metadata_file=True,
)

### Selection of vessels of interest based on areas of interest

In [None]:
def vessels_through_area(df,areas_of_interest):
    """ 
    Function that lists, for each area of interest, the names of the vessels that have at least one AIS
    message within that area

    Parameters
    ----------
    df: pandas dataframe containing AIS data with a boolean column for each area of interest, with a
        column name that corresponds with the name of the area of interest, and vessel names in a
        ['name'] column
    areas_of_interest: dictionary with areas of interest names as keys and shapely polygons as values
    
    :returns: pandas dataframe with a single row and one column per area of interest, in which each cell
        holds the list of unique vessel names for that area
    """

    # Unique names of the vessels with a message inside each area
    names_per_area = {
        area: list(set(df[df[area]]['name']))
        for area in areas_of_interest
    }

    # One row, in which every cell is the complete list of names for that area
    return pd.DataFrame([names_per_area.values()],columns = areas_of_interest.keys())

def create_list_of_selected_vessels(df,areas_of_interest):   
    """ 
    Function that selects the names of vessels that have passed all the areas of interest 

    Parameters
    ----------
    df: pandas dataframe with one column per area of interest, with a column name that corresponds with
        the name of the area of interest, in which each cell holds a list of vessel names (one row per
        AIS data partition)
    areas_of_interest: dictionary with areas of interest names as keys and shapely polygons as values

    :returns: list of vessel names of interest (always a list, in no particular order; an empty list
        if there are no areas of interest)
    """

    # Per area: the union of the vessel name lists of all rows
    vessels_per_area = [set().union(*df[area]) for area in areas_of_interest]

    # Without any area there is nothing to intersect (this used to raise an UnboundLocalError)
    if not vessels_per_area:
        return []

    # Only the vessels that occur in every area are kept; the result is a list for any
    # number of areas (a single area used to return a set)
    return list(set.intersection(*vessels_per_area))

def select_vessels_of_interest(df,vessel_selection,areas_of_interest):
    """ 
    Function that keeps the AIS data messages of the vessels of interest and removes the columns of the
    areas of interest

    Parameters
    ----------
    df: pandas dataframe containing AIS data with each area of interest as columns with a column name that
        corresponds with the names of the areas of interest, and vessel names in a ['name'] column
    vessel_selection: list of the names of vessels of interest
    areas_of_interest: dictionary with areas of interest names as keys and shapely polygons as values

    :returns: pandas dataframe containing AIS data
    """

    is_vessel_of_interest = df['name'].isin(vessel_selection)
    area_columns = list(areas_of_interest)

    # The area columns were only needed to make the vessel selection
    return df[is_vessel_of_interest].drop(columns=area_columns)

In [None]:
#Reads the AIS data of the selected vessel types, including the booleans for the areas of interest
# The shared storage_options are reused, instead of a second copy of the account credentials
ddf = dd.read_parquet(path_name+'/selected_vessel_types_with_geometry',storage_options=storage_options)

In [None]:
#Performs the functions on the AIS data partitions and saves the data as parquet files in the Azure storage
# ddf.partitions[:] selects all partitions; narrow the slice to try the pipeline on a subset first
ddf_i = ddf.partitions[:]

# First pass: collect the vessel names per area for each partition, and reduce these on
# the client to the names of the vessels that have been in every area of interest
ddf_ii = ddf_i.map_partitions(vessels_through_area,areas_of_interest=areas_of_interest)
df_list_of_vessels = ddf_ii.compute()
vessel_selection = create_list_of_selected_vessels(df_list_of_vessels,areas_of_interest)

# Second pass: keep only the messages of the selected vessels
ddf_i = ddf_i.map_partitions(select_vessels_of_interest,vessel_selection,areas_of_interest)

# Column types of the output; the columns are cast to the matching pandas types first,
# and the same mapping is passed as the schema for the parquet files
scheme_information = {'vesseltype': pa.int64(),
                      'hazardouscargo': pa.int64(),
                      'length': pa.float64(),
                      'width': pa.float64(),
                      'draughtMarine': pa.float64(),
                      'timestamplast': pa.timestamp('ns', tz='UTC'),
                      'longitude': pa.float64(),
                      'latitude': pa.float64(),
                      'heading': pa.float64()}
for datacolumn,datatype in scheme_information.items(): 
    ddf_i[datacolumn] = ddf_i[datacolumn].astype(datatype.to_pandas_dtype())
ddf_i = ddf_i.repartition(partition_size="10MB")

# The shared storage_options are reused, instead of a second copy of the account credentials
ddf_i.to_parquet(
    path_name+'/selected_vessels_for_further_analysis',
    storage_options=storage_options,
    write_index=False,
    schema=scheme_information,
    write_metadata_file=True,
)