### Import packages and set paths

In [None]:
import os

import dask.dataframe as dd
import dask_gateway
import dask.distributed

import dotenv
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
import geopandas
from shapely.geometry import Polygon, LineString, Point, MultiPolygon
from shapely.ops import transform, cascaded_union
import numpy as np
import movingpandas as mpd
import datetime
import pandas as pd
import geopandas as gpd
import os
import pyproj
import scipy
import pyarrow as pa
import pickle

In [None]:
# Blob-storage location of the pre-processed AIS data
folder_name = '2022_PoR'
path_name = f"{'abfs://ais/parquet/'}{folder_name}"

# Local project root: the part of the working directory that precedes the
# notebook sub-folder (Windows-style path separators)
current_directory = os.getcwd()
path = current_directory.partition("\\01_Data_Analysis\\02_AIS_data")[0]

### Loads the access token (we use a SAS-token to protect the data)

In [None]:
# Secrets are provided through environment variables stored in a .env file (needs python-dotenv).
# Copy the .env.example file and rename it to .env (one directory up from the notebooks).
# 
%load_ext dotenv
# Load environment variables from the .env file 1 directory up (-v: verbose)
%dotenv -v

In [None]:
# Read the SAS token from the values of the .env file; the lookup raises a
# KeyError when AZURE_BLOB_SAS_TOKEN is not defined there
sas_token = dotenv.dotenv_values()['AZURE_BLOB_SAS_TOKEN']

### Creation of the cluster with high worker memory

In [None]:
# Start a Dask Gateway cluster with an explicit worker memory setting
gateway = dask_gateway.Gateway()
cluster_options = gateway.cluster_options()
# NOTE(review): the unit of worker_memory is defined by the gateway configuration (presumably GiB) - confirm
cluster_options.worker_memory = 32
cluster = gateway.new_cluster(cluster_options)
# Request a fixed pool of 5 workers
cluster.scale(n=5)
# Last expression of the cell: the notebook shows the cluster representation
cluster

In [None]:
# Connect a distributed client to the cluster created above
client = dask.distributed.Client(cluster)
# Last expression of the cell: the notebook shows the client representation
client

In [None]:
def worker_setup(dask_worker: dask.distributed.Worker):
    """ 
    Function that installs the extra python packages on a dask worker

    Parameters
    ----------
    dask_worker: dask worker on which the callback runs (not used by the function itself)

    :returns: None
    """
    import subprocess
    import sys

    for package in ("movingpandas", "more-itertools", "dask"):
        # `sys.executable -m pip` installs into the interpreter that runs the
        # worker; a bare `pip` could belong to another environment on the PATH
        result = subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], check=False)
        if result.returncode != 0:
            # stay best-effort (the worker keeps running) but make the failure visible
            print(f"worker_setup: pip install {package} exited with code {result.returncode}", file=sys.stderr)

# Register the setup function as a worker callback on the client
client.register_worker_callbacks(worker_setup)

### Geospatial and vessel data

In [None]:
# Projected and geographic coordinate reference systems used for the geospatial input data
# NOTE(review): the projected CRS is stored as `utm`, but EPSG:28992 is to my knowledge the
# Dutch RD New grid rather than a UTM zone - confirm that the name is only historical
utm = pyproj.CRS('EPSG:28992')
wgs84 = pyproj.CRS('EPSG:4326')
# Transformation functions in both directions; always_xy=True fixes the axis order to (x, y)
wgs_to_utm = pyproj.Transformer.from_crs(wgs84,utm,always_xy=True).transform
utm_to_wgs = pyproj.Transformer.from_crs(utm,wgs84,always_xy=True).transform

In [None]:
# Anchorage areas: every geometry is rebuilt as a shapely Polygon
anchorage_areas = gpd.read_file(path+"\\00_Input_data\\01_Geospatial_data\\anchorage_areas.geojson")
anchorage_areas['geometry'] = [Polygon(geom) for geom in anchorage_areas['geometry']] 

# Berths: load the pickle once and close the file handle again.
# SECURITY NOTE: unpickling executes arbitrary code, only load this file from a trusted source.
with open(path+"\\00_Input_data\\01_Geospatial_data\\berths_PoR.pickle",'rb') as berths_file:
    berths = pickle.load(berths_file)

# Single (multi)geometry per area type, used to flag origins and destinations
areas_of_interest = {}
areas_of_interest['port_entrance'] = gpd.read_file(path+"\\00_Input_data\\01_Geospatial_data\\Port_Entrance.geojson")['geometry'][0]
# the berth geometries are stored in the projected CRS and are converted with utm_to_wgs
areas_of_interest['berths'] = transform(utm_to_wgs,MultiPolygon(berths['geometry'].to_list()))
# NOTE(review): cascaded_union is deprecated in recent Shapely releases in favour of unary_union
areas_of_interest['anchorage_areas'] = cascaded_union(anchorage_areas['geometry'])

# Named anchorage areas: index on the seamark name and keep the geometry only
anchorage_areas['name'] = anchorage_areas['seamark:name']
anchorage_areas = anchorage_areas.set_index('name')
anchorage_areas = anchorage_areas[['geometry']]

# Named berths (drop returns a new frame, areas_of_interest['berths'] is not affected)
berths = berths.drop(columns='ZZHVAFKNAAM')

# Enlarge the individual areas before matching: berths by 10 and anchorage areas by 250
# units of the projected CRS; the results are stored via utm_to_wgs
for loc,berth_info in berths.iterrows():
    berths.loc[loc,'geometry'] = transform(utm_to_wgs,berth_info.geometry.buffer(10))
for loc,anchorage_info in anchorage_areas.iterrows():
    anchorage_areas.loc[loc,'geometry'] = transform(utm_to_wgs,transform(wgs_to_utm,anchorage_info.geometry).buffer(250))

In [None]:
# Load the ship dataframe from blob storage and materialise it as a pandas dataframe
ddf = dd.read_parquet(path_name+'/ship_dataframe',storage_options={"account_name": "rwsais", "sas_token": sas_token})
ship_dataframe = ddf.compute()

### Divide partitions based on ship dataframe

In [None]:
# Read the merged trajectories, index them on the vessel name and repartition them
# with the sorted index values of the ship dataframe as partition divisions
ddf = dd.read_parquet(path_name+'/all_merged_sorted_trajectories_comprised', storage_options={"account_name": "rwsais", "sas_token": sas_token})
ddf_i = ddf.partitions[:] #slice that selects every partition
ddf_i = ddf_i.set_index('name').repartition(divisions=sorted(list(ship_dataframe.index)))
# Explicit pyarrow schema of the parquet output (geometries are stored as strings)
scheme_information = {'name': pa.string(),
                      'departure': pa.timestamp('ns', tz='UTC'),
                      'arrival': pa.timestamp('ns', tz='UTC'),
                      'origin': pa.string(),
                      'destination': pa.string(),
                      'distance': pa.float64(),
                      'duration': pa.duration('ns'),
                      'draught': pa.float64(),
                      'geometry': pa.string(),
                      'times': pa.list_(pa.timestamp('us', tz='UTC')),
                      'coordinates': pa.list_(pa.string()),
                      'sog': pa.list_(pa.float64()),
                      'cog': pa.list_(pa.float64()),
                      'speed': pa.list_(pa.float64()),
                      'direction': pa.list_(pa.float64()),
                      'acceleration': pa.list_(pa.float64()),
                      'anchorage_areas': pa.bool_(),
                      'port_entrance': pa.bool_(),
                      'berths': pa.bool_()}
# Write the indexed trajectories back to blob storage
ddf_i.to_parquet(path_name+'/all_merged_sorted_trajectories_indexed', storage_options={"account_name": "rwsais", "sas_token": sas_token}, schema=scheme_information, engine='pyarrow')

### Closure of old cluster and creation of the cluster with default worker memory

In [None]:
# Shut down the current cluster and start a new one with the default cluster options
cluster.close()
gateway = dask_gateway.Gateway()
cluster_options = gateway.cluster_options()
cluster = gateway.new_cluster(cluster_options)
# Adaptive scaling between 1 and 100 workers
cluster.adapt(minimum=1, maximum=100)
# Last expression of the cell: the notebook shows the cluster representation
cluster

In [None]:
# Connect a distributed client to the new cluster
client = dask.distributed.Client(cluster)
# Last expression of the cell: the notebook shows the client representation
client

In [None]:
def worker_setup(dask_worker: dask.distributed.Worker):
    """ 
    Function that installs the extra python packages on a dask worker

    Parameters
    ----------
    dask_worker: dask worker on which the callback runs (not used by the function itself)

    :returns: None
    """
    import subprocess
    import sys

    for package in ("movingpandas", "more-itertools", "dask"):
        # `sys.executable -m pip` installs into the interpreter that runs the
        # worker; a bare `pip` could belong to another environment on the PATH
        result = subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], check=False)
        if result.returncode != 0:
            # stay best-effort (the worker keeps running) but make the failure visible
            print(f"worker_setup: pip install {package} exited with code {result.returncode}", file=sys.stderr)

# Register the setup function as a worker callback on the client
client.register_worker_callbacks(worker_setup)

### Repartition of data in equally sized dataframes

In [None]:
# Lazily open the name-indexed trajectories written in the previous step
ddf = dd.read_parquet(path_name+'/all_merged_sorted_trajectories_indexed',storage_options={"account_name": "rwsais", "sas_token": sas_token})

In [None]:
# Repartition the indexed trajectories with a requested partition size of 20MB
ddf_i = ddf.partitions[:] #slice that selects every partition
ddf_i = ddf_i.repartition(partition_size='20MB')
# Explicit pyarrow schema of the parquet output (geometries are stored as strings)
scheme_information = {'name': pa.string(),
                      'departure': pa.timestamp('ns', tz='UTC'),
                      'arrival': pa.timestamp('ns', tz='UTC'),
                      'origin': pa.string(),
                      'destination': pa.string(),
                      'distance': pa.float64(),
                      'duration': pa.duration('ns'),
                      'draught': pa.float64(),
                      'geometry': pa.string(),
                      'times': pa.list_(pa.timestamp('us', tz='UTC')),
                      'coordinates': pa.list_(pa.string()),
                      'sog': pa.list_(pa.float64()),
                      'cog': pa.list_(pa.float64()),
                      'speed': pa.list_(pa.float64()),
                      'direction': pa.list_(pa.float64()),
                      'acceleration': pa.list_(pa.float64()),
                      'anchorage_areas': pa.bool_(),
                      'port_entrance': pa.bool_(),
                      'berths': pa.bool_()}
# Write the repartitioned trajectories to blob storage
ddf_i.to_parquet(path_name+'/all_merged_sorted_trajectories_indexed_repartitioned',storage_options={"account_name": "rwsais", "sas_token": sas_token},schema=scheme_information,engine='pyarrow')

### Create trip dataframe

In [None]:
from shapely import wkt

def drop_index(df):
    """ 
    Function that moves the current index into the columns and replaces it
    with a default integer index

    Parameters
    ----------
    df: pandas dataframe with AIS data

    :returns: pandas dataframe
    """

    flattened = df.reset_index()
    return flattened
    
def renumber_index(df):
    """ 
    Function that discards the current index and replaces it with a default
    integer index

    Parameters
    ----------
    df: pandas dataframe with AIS data

    :returns: pandas dataframe
    """

    renumbered = df.reset_index(drop=True)
    return renumbered

def convert_string_geometry_to_shapely_geometry(df,geometry_columns):
    """ 
    Function that parses the WKT strings of the given columns with wkt.loads

    The dataframe is returned untouched when it is empty or when its first
    origin equals 'a' (presumably the placeholder partition that dask uses to
    infer the output metadata - confirm).

    Parameters
    ----------
    df: pandas dataframe with AIS data (modified in place)
    geometry_columns: columns with geometry types as string data

    :returns: pandas dataframe
    """

    if df.empty:
        return df
    if df['origin'].iloc[0] == 'a':
        return df
    for column in geometry_columns:
        parsed = df[column].apply(wkt.loads)
        df[column] = parsed
    return df

def correct_ais_signal(df,max_sog,min_lon,max_lon,min_lat,max_lat):
    """ 
    Function that removes AIS signal that falls outside the area of interest or has an incorrect sog

    Every row holds one track of which the samples are stored as parallel
    sequences in the columns times, coordinates, sog, cog, speed, direction and
    acceleration. A sample is kept when its sog is lower than max_sog and its
    position lies strictly within the bounding box. The kept samples are
    written back as lists. A track without a single valid sample keeps its
    original (unfiltered) sequences.

    The dataframe is returned untouched when it is empty or when its first
    origin equals 'a' (presumably the placeholder partition that dask uses to
    infer the output metadata - confirm).

    Parameters
    ----------
    df: pandas dataframe with AIS data (modified in place)
    max_sog: maximum allowable speed over ground
    min_lon: minimum required longitude
    max_lon: maximum allowable longitude
    min_lat: minimum required latitude
    max_lat: maximum allowable latitude

    :returns: pandas dataframe
    """

    if df.empty or df['origin'].iloc[0] == 'a':
        return df

    #order matters: the filter reads the point at position 1 and the sog at position 2
    sample_columns = ['times','coordinates','sog','cog','speed','direction','acceleration']

    #one object array per column, so that every cell can hold a whole list
    filtered = {column: np.empty(len(df),dtype=object) for column in sample_columns}
    for position,track in enumerate(df[sample_columns].itertuples(index=False,name=None)):
        kept = [sample for sample in zip(*track)
                if sample[2] < max_sog
                and min_lon < sample[1].x < max_lon
                and min_lat < sample[1].y < max_lat]
        if kept:
            new_values = [list(values) for values in zip(*kept)]
        else:
            #no valid sample at all: leave the track as it is
            new_values = track
        for column,values in zip(sample_columns,new_values):
            filtered[column][position] = values

    for column in sample_columns:
        df[column] = filtered[column]
    return df

def correct_origin_destination_areas(df,areas_of_interest):
    """ 
    Function that determines if an origin or destination is within an area of interest

    For every area a boolean column with the name of the area is (over)written:
    True when the origin or the destination of the row intersects the area.

    The dataframe is returned untouched when it is empty or when its first
    origin equals 'a' (presumably the placeholder partition that dask uses to
    infer the output metadata - confirm).

    Parameters
    ----------
    df: pandas dataframe with AIS data (modified in place)
    areas_of_interest: dictionary with areas of interest names as names and shapely polygons as values

    :returns: pandas dataframe
    """

    if df.empty or df['origin'].iloc[0] == 'a':
        return df
    for area,geometry in areas_of_interest.items():
        #one positional column assignment per area instead of a label-based write per row
        df[area] = [bool(origin.intersects(geometry) or destination.intersects(geometry))
                    for origin,destination in zip(df['origin'],df['destination'])]
    return df

def define_origins_and_destinations(df,areas_of_interest):  
    """ 
    Function that identifies false positives/negative of the area of interest of origins and destination and
    replaces it with a correct one

    Parameters
    ----------
    df: pandas dataframe with AIS data
    areas_of_interest: dictionary with areas of interest names as names and shapely polygons as values

    :returns: pandas dataframe
    """
    
    if df['origin'].iloc[0] == 'a':
        return df
    origins = df.origin
    destinations = df.destination
    for iloc,(loc,row_info) in enumerate(df.iterrows()): 
        for _,geometries in areas_of_interest.items():
            coord_area = []    
            for name,geometry in zip(geometries.index,geometries.geometry):
                if row_info.origin.intersects(geometry):  
                    df.loc[loc,'origin'] = name
                if row_info.destination.intersects(geometry): 
                    df.loc[loc,'destination'] = name
    return df

def correct_origins_and_destinations_new(df,max_velocity,max_time,max_distance): 
    """ 
    Function that copies a named origin or destination to the neighbouring track of the same vessel

    The consecutive tracks of every vessel name are compared in pairs. When one
    side of the gap (destination of the previous track / origin of the next
    track) is a string and the other side is not, the string is copied to the
    other side, unless the gap exceeds one of the three limits.

    Parameters
    ----------
    df: pandas dataframe with AIS data (modified in place)
    max_velocity: maximum allowable velocity over the gap (distance divided by the time in seconds)
    max_time: maximum allowable time between the arrival of the previous and the departure of the next track
    max_distance: maximum allowable distance over the gap

    :returns: pandas dataframe
    """
    #unique vessel names in order of first appearance
    for index,name in enumerate(list(dict.fromkeys(df.name))):
        df_name = df[df.name == name]
    
        #pairs of consecutive tracks of the vessel
        for (loc1,previous_track_info),(loc2,next_track_info) in zip(df_name.iloc[:-1].iterrows(),df_name.iloc[1:].iterrows()):
            if previous_track_info.destination == previous_track_info.origin:
                continue

            #Origin != Destination
            #case 1: previous destination is a string, next origin is not
            if not isinstance(next_track_info.origin,str) and isinstance(previous_track_info.destination,str):
                #NOTE(review): next_track_info.origin is used as is, while the other point is
                #transformed with the module-level wgs_to_utm - confirm that both are in the same CRS
                point_origin = next_track_info.origin
                point_destination = transform(wgs_to_utm,previous_track_info.coordinates[-1])
                distance = point_origin.distance(point_destination)
                time = next_track_info.departure-previous_track_info.arrival
                #NOTE(review): a time difference of zero makes this division fail or return inf
                velocity = distance/(time/np.timedelta64(1,'s'))

                if velocity > max_velocity or distance > max_distance or time > max_time:
                    continue

                df.loc[loc2,'origin'] = previous_track_info.destination
                 
            #case 2: next origin is a string, previous destination is not
            elif not isinstance(previous_track_info.destination,str) and isinstance(next_track_info.origin,str):
                point_origin = transform(wgs_to_utm,next_track_info.coordinates[0])
                point_destination = previous_track_info.destination 
                distance = point_origin.distance(point_destination)
                time = next_track_info.departure-previous_track_info.arrival
                velocity = distance/(time/np.timedelta64(1,'s'))

                if velocity > max_velocity or distance > max_distance or time > max_time:
                    continue

                df.loc[loc1,'destination'] = next_track_info.origin

    return df


def correct_origins_and_destinations(df,areas_of_interest):   
    """ 
    Function that cleans the origins and destinations of the consecutive tracks per vessel

    Per vessel name, slow tracks (mean sog of at most 4.5) are matched against
    the named areas. Depending on the share of coordinates inside an area, the
    mean sog, the duration and the distance, a track is marked for removal
    and/or the area name is copied to the destination of the previous track and
    the origin of the next track. After the removal, the origins and
    destinations of neighbouring tracks are aligned based on their sog.

    The dataframe is returned untouched when its first origin equals 'a'
    (presumably the placeholder partition that dask uses to infer the output
    metadata - confirm). NOTE(review): an empty dataframe is not guarded and
    raises an IndexError on that check.

    Parameters
    ----------
    df: pandas dataframe with AIS data
    areas_of_interest: dictionary with area types as names and dataframes as values; the index
                       of each dataframe holds the area names and its geometry column the polygons

    :returns: pandas dataframe (new dataframe, index restarts at 0 for every vessel)
    """
    
    knots = 0.5144444444 #presumably the conversion factor from knots to m/s
    if df['origin'].iloc[0] == 'a':
        return df
    new_df = pd.DataFrame(columns=df.columns)
    #unique vessel names in order of first appearance
    for index,name in enumerate(list(dict.fromkeys(df.name))):
        df_name = df[df.name == name]     
        remove_rows = []
        for iloc,(loc,row_info) in enumerate(df_name.iterrows()): 
            #only slow tracks are examined
            if np.mean(row_info.sog) > 4.5:
                continue
            
            #collect for every coordinate the names of the areas that it intersects
            #NOTE(review): coord_area is reset for every area type, so only the matches of the
            #last dictionary entry survive this loop - confirm that this is intended
            for _,geometries in areas_of_interest.items():
                coord_area = []    
                for coord in row_info.coordinates:
                    #NOTE(review): this loop variable reuses (and overwrites) the vessel `name`
                    for name,geometry in zip(geometries.index,geometries.geometry):
                        if coord.intersects(geometry):
                            coord_area.append(name)

            #most frequent area name among the matches (empty list when there is no match)
            mode_coord_area = []
            if coord_area:
                mode_coord_area = pd.DataFrame(coord_area).mode().iloc[0][0]
            
            #(almost) stationary track of which less than 70% of the coordinates match: remove
            if len(coord_area) < 0.7*len(row_info.coordinates) and np.mean(row_info.sog) <= 0.3 and loc not in remove_rows:
                remove_rows.append(loc)  
            
            #very short track (in time or distance) between different places: remove
            if row_info.origin != row_info.destination and (row_info.duration < pd.Timedelta(minutes = 6) or row_info.distance < 100) and loc not in remove_rows:
                remove_rows.append(loc) 
            
            #more than 70% of the coordinates match an area
            if mode_coord_area and len(coord_area) > 0.7*len(row_info.coordinates):
                area = pd.DataFrame(coord_area)[0].mode()[0]
                #slow track with different origin and destination that is dominated by one area:
                #use the area as destination/origin of this track and of its neighbours
                if row_info.origin != row_info.destination and len([coord for coord in coord_area if coord == mode_coord_area]) > 0.5*len(coord_area) and np.mean(row_info.sog) < 0.5 and np.mean(row_info.speed) < 0.5*knots:      
                    if iloc-1 >= 0:
                        df_name.iloc[iloc-1, df_name.columns.get_loc('destination')] = area
                        df_name.iloc[iloc, df_name.columns.get_loc('destination')] = area
                        row_info.destination = area
                    if iloc+1 <= len(df_name)-1:
                        df_name.iloc[iloc+1, df_name.columns.get_loc('origin')] = area
                        df_name.iloc[iloc, df_name.columns.get_loc('origin')] = area
                        row_info.origin = area
                #track that starts and ends at the same place: hand the area to the neighbours and remove
                elif row_info.origin == row_info.destination:
                    if iloc-1 >= 0:
                        df_name.iloc[iloc-1, df_name.columns.get_loc('destination')] = area
                    if iloc+1 <= len(df_name)-1:
                        df_name.iloc[iloc+1, df_name.columns.get_loc('origin')] = area
                    if loc not in remove_rows:
                        remove_rows.append(loc)    
        
            #any remaining slow track with identical origin and destination is removed
            if row_info.origin == row_info.destination and loc not in remove_rows:
                remove_rows.append(loc)
        
        for index in reversed(sorted(remove_rows)):
            df_name = df_name.drop(index)
        
        #align the origins and destinations of the neighbouring tracks that remain
        for iloc,(loc,row_info) in enumerate(df_name.iterrows()):
            if iloc == 0:
                continue

            #only when this arrival lies more than 5 minutes after the departure of the previous track
            if row_info.arrival-df_name.iloc[iloc-1].departure > pd.Timedelta(minutes=5):
                #named destination differs from the next origin: the side with the lower sog wins
                if iloc+1 <= len(df_name)-1 and type(row_info.destination) == str and row_info.destination != df_name.iloc[iloc+1].origin:
                    if row_info.sog[-1] < df_name.iloc[iloc+1].sog[0]:
                        df_name.iloc[iloc+1, df_name.columns.get_loc('origin')] = row_info.destination
                    else:
                        df_name.iloc[iloc, df_name.columns.get_loc('destination')] = df_name.iloc[iloc+1].origin
                #named origin differs from the previous destination: the side with the lower sog wins
                if iloc-1 >= 0 and type(row_info.origin) == str and row_info.origin != df_name.iloc[iloc-1].destination:
                    if row_info.sog[0] < df_name.iloc[iloc-1].sog[-1]:
                        df_name.iloc[iloc-1, df_name.columns.get_loc('destination')] = row_info.origin    
                    else:
                        df_name.iloc[iloc, df_name.columns.get_loc('origin')] = df_name.iloc[iloc-1].destination  
                
        df_name = df_name.reset_index(drop=True)
        new_df = pd.concat([new_df,df_name])
    return new_df

def create_trips(df,areas_of_interest):
    """ 
    Function that constructs a trip dataframe based on identification of inbound and outbound trips to and from
    an area of interest

    The rows are scanned in order. A row that departs from a berth is stored as
    outbound leg and closes the current trip (the trip counter is raised
    afterwards). A row that arrives at a berth is stored as inbound leg; when it
    departs from an anchorage area and the previous row arrived at an anchorage
    area, the previous row is stored in front of it. The result is indexed on
    (name, trip_id) and does not contain the 'name' column.

    The dataframe is returned untouched when its first origin equals 'a'
    (presumably the placeholder partition that dask uses to infer the output
    metadata - confirm). NOTE(review): an empty dataframe is not guarded and
    raises an IndexError on that check.

    Parameters
    ----------
    df: pandas dataframe with AIS data
    areas_of_interest: dictionary with the keys 'berths' and 'anchorage_areas' and dataframes as
                       values; the index of each dataframe holds the area names

    :returns: pandas dataframe
    """
    
    if df['origin'].iloc[0] == 'a':
        return df
    trip_index = 0
    trip_df = pd.DataFrame()
    inbound_found = False
    for iloc,(loc,row_info) in enumerate(df.iterrows()):
        #NOTE(review): the nested loops below reuse the name `area`; after such a loop has run,
        #the checks further down compare against the last value of the inner loop
        for area in areas_of_interest['berths'].index:    
            #departure from a berth: outbound leg of the current trip
            if row_info.origin == area:
                outbound_trip = row_info
                index = pd.MultiIndex.from_tuples([(outbound_trip['name'],trip_index)],names=['name','trip_id'])
                trip_df = pd.concat([trip_df,pd.DataFrame([outbound_trip[outbound_trip.keys().drop('name')]],index=index)])
                trip_index += 1
                inbound_found = False
                #the same row also arrives at a berth: store it as inbound leg of the next trip
                for area in areas_of_interest['berths'].index:
                    if row_info.destination == area and not inbound_found:
                        inbound_trip = row_info
                        index = pd.MultiIndex.from_tuples([(inbound_trip['name'],trip_index)],names=['name','trip_id'])
                        trip_df = pd.concat([trip_df,pd.DataFrame([inbound_trip[inbound_trip.keys().drop('name')]],index=index)])
                        inbound_found = True
                
            #arrival at a berth while no inbound leg is stored yet for the current trip
            if row_info.destination == area and not inbound_found:
                inbound_trip = row_info
                #coming from an anchorage area: include the previous row if it arrived at an anchorage area
                for area in areas_of_interest['anchorage_areas'].index:
                    if iloc-1 >= 0 and row_info.origin == area:
                        previous_inbound_trip = df.iloc[iloc-1]
                        for area in areas_of_interest['anchorage_areas'].index:
                            if previous_inbound_trip.destination == area:
                                index = pd.MultiIndex.from_tuples([(previous_inbound_trip['name'],trip_index)],names=['name','trip_id'])
                                trip_df = pd.concat([trip_df,pd.DataFrame([previous_inbound_trip[previous_inbound_trip.keys().drop('name')]],index=index)])   
                                break
                index = pd.MultiIndex.from_tuples([(inbound_trip['name'],trip_index)],names=['name','trip_id'])
                trip_df = pd.concat([trip_df,pd.DataFrame([inbound_trip[inbound_trip.keys().drop('name')]],index=index)])
                inbound_found = True

    return trip_df
    
def change_data_format(df,data_columns):
    """ 
    Function that transforms the geometry data into string data in order to save it in dask

    A cell that holds a sequence (list, tuple or numpy array) becomes a list
    with the string of every element; any other cell becomes a single string.
    The rows are rebuilt one by one and concatenated, so the index labels of the
    rows are kept.

    Parameters
    ----------
    df: pandas dataframe with geometries
    data_columns: column names of geometries

    :returns: pandas dataframe (new dataframe)
    """

    new_df = pd.DataFrame(columns=df.columns)
    for loc,info in df.iterrows():
        row_df = pd.DataFrame([info])
        for data_column in data_columns:
            value = info[data_column]
            #numpy arrays are sequences as well; without this check a whole array would end up
            #as one '[...]' string instead of a list of strings
            if isinstance(value,(list,tuple,np.ndarray)):
                #the outer list holds the single cell of this one-row dataframe
                row_df[data_column] = [[str(data) for data in value]]
            else:
                row_df[data_column] = str(value)
        new_df = pd.concat([new_df,row_df])
    return new_df

def multi_index_to_index(df,columns):
    """ 
    Function that replaces the (name, trip_id) index of a dataframe with the regular columns 'name' and 'trip_id'

    The index is first moved into a column that the code reads as 'index' (so
    the incoming index is expected to be an unnamed, single-level index). An
    index entry that is a tuple is split into name and trip_id; any other entry
    is copied into both columns. An empty dataframe results in an empty
    dataframe with the requested columns.

    Parameters
    ----------
    df: pandas dataframe with geometries
    columns: list of the column names of the resulting dataframe

    :returns: pandas dataframe
    """
    
    df = df.reset_index(drop=False)
    new_df = pd.DataFrame(columns=columns)
    #rebuild the dataframe row by row
    for loc,info in df.iterrows():
        df_info = pd.DataFrame([info])
        if isinstance(info['index'],tuple):
            df_info['name'] = info['index'][0]
            df_info['trip_id'] = info['index'][1]
        else:
            df_info['name'] = info['index']
            df_info['trip_id'] = info['index']
        new_df = pd.concat([new_df,df_info])
    #the helper column only exists in new_df when at least one row was added
    if not df.empty:
        new_df = new_df.drop(['index'],axis=1)
    new_df['trip_id'] = new_df['trip_id'].astype(int)
    return new_df

def reorder_columns(df,columns):
    """ 
    Function that returns the given columns of a dataframe in the given order

    Parameters
    ----------
    df: pandas dataframe
    columns: list of a selection of ordered column names 

    :returns: pandas dataframe
    """
    reordered = df[columns]
    return reordered

def sort_values(df,columns):
    """ 
    Function that returns a dataframe with its rows sorted on the given columns

    Parameters
    ----------
    df: pandas dataframe
    columns: list of column names

    :returns: pandas dataframe
    """

    return df.sort_values(by=columns)

In [None]:
# Empty dataframe with the columns of the trip dataframe; it is passed as meta to create_trips
trip_df = pd.DataFrame(columns=['departure','arrival','origin','destination','distance','duration','draught','geometry','times','coordinates','sog','cog','speed','direction','acceleration','anchorage_areas','port_entrance','berths'])
# Lazily open the repartitioned trajectories
ddf = dd.read_parquet(path_name+'/all_merged_sorted_trajectories_indexed_repartitioned',storage_options={"account_name": "rwsais", "sas_token": sas_token})

In [None]:
cluster.scale(n=5) #requests 5 workers; NOTE(review): this comment used to say 100 workers - confirm the intended size
ddf_i = ddf.partitions[:] #slice that selects every partition
# Per partition: move the name index into a column, renumber the rows and sort them per vessel on arrival
ddf_i = ddf_i.map_partitions(drop_index)
ddf_i = ddf_i.map_partitions(renumber_index)
ddf_i = ddf_i.map_partitions(sort_values,columns=['name','arrival'])
# Parse the geometry strings and remove the invalid AIS samples
ddf_i = ddf_i.map_partitions(convert_string_geometry_to_shapely_geometry,geometry_columns=['origin','destination','geometry','coordinates'])
ddf_i = ddf_i.map_partitions(correct_ais_signal,max_sog=25,min_lon=2.4,max_lon=4.75,min_lat=51.65,max_lat=52.3)
# Flag and name the origins and destinations, then clean them per vessel
ddf_i = ddf_i.map_partitions(correct_origin_destination_areas,areas_of_interest={'berths': areas_of_interest['berths'],'anchorage_areas': areas_of_interest['anchorage_areas']})
ddf_i = ddf_i.map_partitions(define_origins_and_destinations,areas_of_interest={'anchorage_areas':anchorage_areas,'berths':berths})
ddf_i = ddf_i.map_partitions(correct_origins_and_destinations,areas_of_interest={'anchorage_areas':anchorage_areas,'berths':berths})
# Build the trips (the only step with an explicit meta)
ddf_i = ddf_i.map_partitions(create_trips,areas_of_interest={'anchorage_areas':anchorage_areas,'berths':berths},meta=trip_df)
# Empty dataframe that only provides the final column order
trip_df_new = pd.DataFrame(columns=['name','trip_id','departure','arrival','origin','destination','distance','duration','draught','geometry','times','coordinates','sog','cog','speed','direction','acceleration','anchorage_areas','port_entrance','berths'])
# Geometries back to strings, (name, trip_id) index to columns, final column order
ddf_i = ddf_i.map_partitions(change_data_format,data_columns=['origin','destination','geometry','coordinates'])
ddf_i = ddf_i.map_partitions(multi_index_to_index,columns=trip_df_new.columns)
ddf_i = ddf_i.map_partitions(reorder_columns,columns=trip_df_new.columns)
# Explicit pyarrow schema of the parquet output
# NOTE(review): 'times' has no timezone here, whereas the schemas of the trajectory files use tz='UTC' - confirm
scheme_information = {'name': pa.string(),
                      'trip_id': pa.int64(),
                      'departure': pa.timestamp('ns', tz='UTC'),
                      'arrival': pa.timestamp('ns', tz='UTC'),
                      'origin': pa.string(),
                      'destination': pa.string(),
                      'distance': pa.float64(),
                      'duration': pa.duration('ns'),
                      'draught': pa.float64(),
                      'geometry': pa.string(),
                      'times': pa.list_(pa.timestamp('us')),
                      'coordinates': pa.list_(pa.string()),
                      'sog': pa.list_(pa.float64()),
                      'cog': pa.list_(pa.float64()),
                      'speed': pa.list_(pa.float64()),
                      'direction': pa.list_(pa.float64()),
                      'acceleration': pa.list_(pa.float64()),
                      'anchorage_areas': pa.bool_(),
                      'port_entrance': pa.bool_(),
                      'berths': pa.bool_()}
# Writing the parquet files triggers the computation of the whole chain
ddf_i.to_parquet(path_name+'/trip_dataframe_with_outliers',storage_options={"account_name": "rwsais", "sas_token": sas_token},schema=scheme_information,engine='pyarrow')

In [None]:
# Shut down the cluster
cluster.close()

### Remove outliers

In [None]:
def restore_coordinates(df):
    """ 
    Function that restores errored coordinates

    The coordinates of a row are considered errored when they are stored as one
    single string that starts with '[' instead of as a sequence of strings. They
    are rebuilt from the vertices of the (WKT) geometry of the row as a list
    with one point string per vertex. All other rows, including rows with empty
    coordinates, are left untouched.

    Parameters
    ----------
    df: pandas dataframe with AIS data (modified in place)

    :returns: pandas dataframe
    """

    for loc,trip_info in df.iterrows():
        coordinates = trip_info.coordinates
        #explicit string check: indexing an empty sequence would raise an IndexError
        if not (isinstance(coordinates,str) and coordinates.startswith('[')):
            continue
        geom = wkt.loads(trip_info.geometry)
        df.at[loc,'coordinates'] = [str(Point(x,y)) for x,y in geom.coords]
    return df

def outlier_remover(df,ship_dataframe):
    """ 
    Function that removes outliers of AIS data tracks based on the journal article of Abreu et al. (2021) entitled:
    "A trajectory scoring tool for local anomaly detection in maritime traffic using visual analytics"

    Every sample of a track is compared with its successors. A successor is
    dropped when it fails one of four checks (abnormal stop, acceleration,
    anomalous drift, anomalous turning point); the comparison then continues
    with the next successor. Tracks with fewer than two remaining samples are
    discarded. For the other tracks the geometry, duration, distance, speed,
    direction and acceleration are recomputed with movingpandas.

    The dataframe is returned untouched when it is not empty and its first
    origin equals 'a' (presumably the placeholder partition that dask uses to
    infer the output metadata - confirm).

    Parameters
    ----------
    df: pandas dataframe with AIS data
    ship_dataframe: pandas dataframe with unique vessel names and their static information

    :returns: pandas dataframe (new dataframe)
    """
    
    if len(df) and df['origin'].iloc[0] == 'a':
        return df
    import movingpandas as mpd
    knots = 0.514444444 #presumably the conversion factor from knots to m/s
    wgs84 = pyproj.CRS('EPSG:4326')
    utm = pyproj.CRS('EPSG:32631')
    wgs84_to_utm = pyproj.Transformer.from_crs(wgs84, utm, always_xy=True).transform
    #NOTE(review): utm_to_wgs84 is not used in the remainder of this function
    utm_to_wgs84 = pyproj.Transformer.from_crs(utm, wgs84, always_xy=True).transform
    new_df = pd.DataFrame(columns=df.columns)
    for loc,info in df.iterrows():
        #positions of the samples to remove (this local list shadows the module-level function drop_index)
        drop_index = []
        filtered_df = pd.DataFrame([info])
        for index,coordinates in enumerate(info.coordinates):
            #compare sample `index` with its successors until a successor passes all checks
            index_dropped = True
            new_index = index
            while index_dropped:  
                index_dropped = False
                new_index += 1
                #no successor left: index_dropped is False, so the while loop ends
                if new_index == len(info.coordinates):
                    continue

                #a sample that was dropped itself is not used as reference
                if index in drop_index:
                    continue        

                #Ship parameters
                L = ship_dataframe.loc[info['name']]['length']

                #Previous point
                t_old = info.times[index]
                lon_old = info.coordinates[index].x
                lat_old = info.coordinates[index].y
                sog_old = info.sog[index]*knots
                cog_old = info.cog[index]
                v_old = info.speed[index]
                phi_old = info.direction[index]

                #Next point
                t_new = info.times[new_index]
                lon_new = info.coordinates[new_index].x
                lat_new = info.coordinates[new_index].y
                sog_new = info.sog[new_index]*knots
                cog_new = info.cog[new_index]
                v_new = info.speed[new_index]
                phi_new = info.direction[new_index]

                #Derivations
                dlon = lon_new-lon_old
                dlat = lat_new-lat_old
                dsog = sog_new-sog_old
                #angle differences are wrapped into the interval [-180, 180)
                dcog = (cog_new-cog_old + 180) % 360 - 180
                dv = v_new-v_old
                dphi = (phi_new-phi_old + 180) % 360 - 180
                #time difference in seconds
                dt = (t_new-t_old)/np.timedelta64(1, 's')
                #length of the straight line between both points after projection
                dist = transform(wgs84_to_utm, LineString([Point(lon_new,lat_new),Point(lon_old,lat_old)])).length
                v_mean = np.mean([v_old,v_new])

                #Abnormal stop
                #position, sog and cog are all unchanged although the reference speed is positive
                criteria = [dlon,dlat,dsog,dcog]   
                if v_old > 0 and criteria == list(np.zeros(4)):
                    drop_index.append(new_index)
                    index_dropped = True
                    continue

                #Acceleration criteria
                #NOTE(review): two samples with the same timestamp give dt == 0 in this division - confirm that this cannot occur
                a = (v_new-v_old)/dt
                a_max =  (15*knots)**2/(20*L)
                a_min = -(15*knots)**2/(16*L)    
                #the product is negative when a lies outside the interval [a_min, a_max]
                if (a_max-a)*(a-a_min) < 0:
                    drop_index.append(new_index)
                    index_dropped = True
                    continue

                #Anomalous drift
                #the travelled distance exceeds the largest of the two distance bounds
                s_min = (v_mean-a_min*dt)*dt
                s_max = (v_mean+a_max*dt)*dt
                s_max = np.max([s_min,s_max])
                if dist > s_max:
                    drop_index.append(new_index)
                    index_dropped = True
                    continue

                #Anomalous turning point (only for vessels without tugs)
                #NOTE(review): dphi is signed, so only positive direction changes can exceed the limit - confirm whether abs(dphi) is meant
                dia = 3*L
                if v_old > 4 and dphi/dt > 360*v_old/(np.pi*dia): #use dphi (what is the orientation)
                    drop_index.append(new_index)
                    index_dropped = True
                    continue

        #remove the dropped samples, starting at the end of drop_index so that the earlier positions stay valid
        new_coordinates = list(info.coordinates)
        new_times = list(info.times)
        new_sog = list(info.sog)
        new_cog = list(info.cog)
        for dropped_index in list(reversed(drop_index)):
            new_coordinates.pop(dropped_index)
            new_times.pop(dropped_index) 
            new_sog.pop(dropped_index) 
            new_cog.pop(dropped_index) 
        filtered_df['coordinates'] = [new_coordinates]
        #a track needs at least two samples; shorter tracks are left out of the result
        if len(new_coordinates) < 2:
            continue
        filtered_df['geometry'] = LineString(new_coordinates)
        filtered_df['times'] = [new_times]
        filtered_df['sog'] = [new_sog]
        filtered_df['cog'] = [new_cog]
        #rebuild the trajectory of the remaining samples to recompute the derived quantities
        time_df = pd.DataFrame({'times':filtered_df.times.iloc[0]})
        x_df = pd.DataFrame({'x':[coordinate.x for coordinate in filtered_df.coordinates.iloc[0]]})
        y_df = pd.DataFrame({'y':[coordinate.y for coordinate in filtered_df.coordinates.iloc[0]]})
        trajectory_df = pd.concat([time_df,x_df,y_df],axis=1)
        traj = mpd.Trajectory(trajectory_df,traj_id='index',t='times',x='x',y='y')
        traj.add_speed()
        traj.add_direction()
        traj.add_acceleration()
        filtered_df['duration'] = traj.get_duration()
        filtered_df['distance'] = traj.get_length()
        filtered_df['speed'] = [list(traj.df['speed'])]
        filtered_df['direction'] = [list(traj.df['direction'])]
        filtered_df['acceleration'] = [list(traj.df['acceleration'])]
        new_df = pd.concat([new_df,filtered_df])
    return new_df

In [None]:
# Lazily open the trip dataframe that still contains the outliers
ddf = dd.read_parquet(path_name+'/trip_dataframe_with_outliers',storage_options={"account_name": "rwsais", "sas_token": sas_token})
ddf_i = ddf.partitions[:] #slice that selects every partition
# Per partition: repair the coordinates, parse the geometry strings, remove the outliers
# and convert the geometries back to strings
ddf_i = ddf_i.map_partitions(restore_coordinates)
ddf_i = ddf_i.map_partitions(convert_string_geometry_to_shapely_geometry,geometry_columns=['geometry','coordinates'])
ddf_i = ddf_i.map_partitions(outlier_remover,ship_dataframe=ship_dataframe)
ddf_i = ddf_i.map_partitions(change_data_format,data_columns=['geometry','coordinates'])
# Explicit pyarrow schema of the parquet output
scheme_information = {'name': pa.string(),
                      'trip_id': pa.int64(),
                      'departure': pa.timestamp('ns', tz='UTC'),
                      'arrival': pa.timestamp('ns', tz='UTC'),
                      'origin': pa.string(),
                      'destination': pa.string(),
                      'distance': pa.float64(),
                      'duration': pa.duration('ns'),
                      'draught': pa.float64(),
                      'geometry': pa.string(),
                      'times': pa.list_(pa.timestamp('us')),
                      'coordinates': pa.list_(pa.string()),
                      'sog': pa.list_(pa.float64()),
                      'cog': pa.list_(pa.float64()),
                      'speed': pa.list_(pa.float64()),
                      'direction': pa.list_(pa.float64()),
                      'acceleration': pa.list_(pa.float64()),
                      'anchorage_areas': pa.bool_(),
                      'port_entrance': pa.bool_(),
                      'berths': pa.bool_()}
# Writing the parquet files triggers the computation of the whole chain
ddf_i.to_parquet(path_name+'/trip_dataframe',storage_options={"account_name": "rwsais", "sas_token": sas_token},schema=scheme_information,engine='pyarrow')