### Import packages and set paths

In [None]:
import os

import dask.dataframe as dd
import dask_gateway
import dask.distributed

import dotenv
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
import geopandas
from shapely.geometry import Polygon, LineString, Point, MultiPolygon
from shapely.ops import transform, cascaded_union
from shapely import wkt
import numpy as np
import movingpandas as mpd
import datetime
import pandas as pd
import geopandas as gpd
import os
import pyproj
import scipy
import pyarrow as pa
import pickle

In [None]:
# Location of the pre-processed AIS data in the blob storage
folder_name = '2022_PoR'
path_name = ''.join(('abfs://ais/parquet/', folder_name))

# Root folder for the other (local) data: the part of the working directory
# that precedes the notebook folder
current_directory = os.getcwd()
path = current_directory.partition("\\01_Data_Analysis\\02_AIS_data")[0]

### Load the access token (a SAS token is used to protect the data)

In [None]:
# Secrets are supplied through environment variables (requires the python-dotenv package).
# To set this up, copy the .env.example file and rename it to .env (one directory up from the notebooks).
#
%load_ext dotenv
# Load the environment variables from the .env file one directory up
%dotenv -v

In [None]:
# Read the SAS token for the blob storage from the values defined in the .env file
# (raises a KeyError when AZURE_BLOB_SAS_TOKEN is not defined there)
sas_token = dotenv.dotenv_values()['AZURE_BLOB_SAS_TOKEN']

### Creation of the cluster with high worker memory

In [None]:
# Connect to the dask gateway (no explicit address or authentication is passed)
gateway = dask_gateway.Gateway()
# The cluster options offered by the gateway are passed on unchanged
# NOTE(review): the section title mentions high worker memory, but no option is modified here - confirm that the gateway defaults provide it
cluster_options = gateway.cluster_options()
cluster = gateway.new_cluster(cluster_options)
# Let the cluster scale adaptively between 1 and 100 workers
cluster.adapt(minimum=1, maximum=100)
# Last expression of the cell: displays the cluster in the notebook
cluster

In [None]:
# Connect a client to the cluster; the dask computations below are executed through this client
client = dask.distributed.Client(cluster)
# Last expression of the cell: displays the client in the notebook
client

In [None]:
def worker_setup(dask_worker: dask.distributed.Worker):
    """ 
    Function that installs the additional python packages on a dask worker

    Parameters
    ----------
    dask_worker: the worker on which the callback is executed (not used by the function itself)

    :returns: None
    """
    import subprocess
    import sys

    # pip is called through the interpreter of the worker itself, so that the packages end up in the
    # environment that executes the tasks; the argument list avoids the use of a shell
    for package in ("movingpandas", "more-itertools", "dask"):
        completed = subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], check=False)
        if completed.returncode:
            # Best effort: a failed installation is reported, but does not stop the worker
            print(f"worker_setup: installation of {package} failed (exit code {completed.returncode})", file=sys.stderr)

# Run the setup on the workers of the cluster
client.register_worker_callbacks(worker_setup)

### Geospatial and vessel data

In [None]:
# Projected coordinate system used for the length and offset calculations
# NOTE(review): EPSG:28992 is the Dutch Amersfoort / RD New grid rather than a UTM zone, despite the variable name - confirm
utm = pyproj.CRS('EPSG:28992')
# Geographic coordinate system of the AIS data
wgs84 = pyproj.CRS('EPSG:4326')

# Transformation functions in (x, y) axis order, to be passed to shapely.ops.transform
wgs_to_utm = pyproj.Transformer.from_crs(crs_from=wgs84, crs_to=utm, always_xy=True).transform
utm_to_wgs = pyproj.Transformer.from_crs(crs_from=utm, crs_to=wgs84, always_xy=True).transform

In [None]:
# Anchorage areas: every geometry is rebuilt as a shapely Polygon
anchorage_areas = gpd.read_file(path+"\\00_Input_data\\01_Geospatial_data\\anchorage_areas.geojson")
anchorage_areas['geometry'] = [Polygon(geom) for geom in anchorage_areas['geometry']] 

# The pickle files are opened in a context manager, so that the file handles are closed after loading
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source
with open(path+"\\00_Input_data\\01_Geospatial_data\\turning_basins_PoR.pickle",'rb') as f:
    turning_basins = pickle.load(f)

# Areas of interest: port entrance, the berths (merged into a single multipolygon and transformed
# with utm_to_wgs) and the union of the anchorage areas
areas_of_interest = {}
areas_of_interest['port_entrance'] = gpd.read_file(path+"\\00_Input_data\\01_Geospatial_data\\Port_Entrance.geojson")['geometry'][0]
with open(path+"\\00_Input_data\\01_Geospatial_data\\berths_PoR.pickle",'rb') as f:
    areas_of_interest['berths'] = transform(utm_to_wgs,MultiPolygon(pickle.load(f)['geometry'].to_list()))
areas_of_interest['anchorage_areas'] = cascaded_union(anchorage_areas['geometry'])

In [None]:
# Load the harbour basins and the selected berths
# (the original harbour basin line missed the open() call and did not parse)
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source
with open(path+"\\00_Input_data\\01_Geospatial_data\\harbour_basins_PoR.pickle",'rb') as f:
    harbour_basins = pickle.load(f)
with open(path+"\\00_Input_data\\01_Geospatial_data\\selected_berths_PoR.pickle",'rb') as f:
    berths = pickle.load(f)

In [None]:
# Harmonise the harbour basin names of the berths
# (presumably so that they match the names in harbour_basins - verify against that table)

# Every basin name that contains 'Waalhaven' is reduced to 'Waalhaven'
berths.Harbour_basin = ['Waalhaven' if str(name).find('Waalhaven')+1 else name for name in berths.Harbour_basin]
# Renamed basins
berths.loc[berths[berths.Harbour_basin == 'IJselhaven'].index,'Harbour_basin'] = 'IJsselhaven'
berths.loc[berths[berths.Harbour_basin == 'Scheur'].index,'Harbour_basin'] = 'Scheurkade'
# Berths that are assigned to a basin based on their own name or their terminal
berths.loc[berths[berths.index == 'NIEUWE MAAS HBR HOLLAND AMERIKAKADE'].index,'Harbour_basin'] = 'Holland Amerika Kade'
berths.loc[berths[(berths.Terminal == 'VOPAK')&(berths.Harbour_basin == 'Nieuwe Maas')].index,'Harbour_basin'] = 'VOPAK'
berths.loc[berths[(berths.Terminal == 'NESTE')&(berths.Harbour_basin == 'Nieuwe Maas')].index,'Harbour_basin'] = 'Neste'
# NOTE(review): 'Nieuwe maas' is written with a lowercase 'm' here, unlike the two lines above - confirm that this matches the data
berths.loc[berths[(berths.Terminal == 'KTM')&(berths.Harbour_basin == 'Nieuwe maas')].index,'Harbour_basin'] = 'Koole Kade'
# Basins that are merged into 'Botlek'
berths.loc[berths[berths.Harbour_basin == '3e Petroleumhaven'].index,'Harbour_basin'] = 'Botlek'
berths.loc[berths[berths.Harbour_basin == 'Torontohaven'].index,'Harbour_basin'] = 'Botlek'
berths.loc[berths[berths.Harbour_basin == 'Chemiehaven'].index,'Harbour_basin'] = 'Botlek'
berths.loc[berths[berths.Harbour_basin == '1e Werkhaven'].index,'Harbour_basin'] = 'Botlek'
berths.loc[berths[berths.Harbour_basin == '2e Werkhaven'].index,'Harbour_basin'] = 'Botlek'
berths.loc[berths[berths.Harbour_basin == 'Sint -Laurenshaven'].index,'Harbour_basin'] = 'Botlek'
# Basins that are merged into 'Eemhaven'
berths.loc[berths[berths.Harbour_basin == 'Prins Willem-Alexanderhaven'].index,'Harbour_basin'] = 'Eemhaven'
berths.loc[berths[berths.Harbour_basin == 'Prins Johan Frisohaven'].index,'Harbour_basin'] = 'Eemhaven'
berths.loc[berths[berths.Harbour_basin == 'Prinses Beatrixhaven'].index,'Harbour_basin'] = 'Eemhaven'
# Berths that are removed from the selection
berths = berths[berths.Harbour_basin != 'Koggehaven']
berths = berths[berths.index != 'OUDE MAAS HBR KADE']
berths = berths[berths.Harbour_basin != 'Zevenmanshaven']

In [None]:
# Transform the geometries of the harbour basins, berths and turning basins with utm_to_wgs
harbour_basins.Geometry = harbour_basins.Geometry.apply(lambda x: transform(utm_to_wgs,x))
berths.geometry = berths.geometry.apply(lambda x: transform(utm_to_wgs,x))
turning_basins.geometry = turning_basins.geometry.apply(lambda x: transform(utm_to_wgs,x))
# Use the same lowercase column name 'geometry' for the harbour basins as for the other tables
harbour_basins = harbour_basins.rename(columns={'Geometry':'geometry'})
# The turning basin labels are used as strings
turning_basins.index = turning_basins.index.astype(str)

In [None]:
# Replace every multipolygon harbour basin by its convex hull, so that the basin is a single geometry
# (the exterior of the basin geometry is used later on as harbour entrance transect)
for loc,basin_info in harbour_basins.iterrows():
    if isinstance(basin_info.geometry,MultiPolygon):
        harbour_basins.loc[loc,'geometry'] = basin_info.geometry.convex_hull

In [None]:
# Load the network graph and select the node and edge that are used to locate the port entrance
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source
with open(path+"\\03_Simulation\\01_Input_data\\01_Geospatial_data\\network\\PoR_graph_with_information.pickle", 'rb') as f:
    FG = pickle.load(f)
origin_name = '8866969'
origin_node = FG.nodes[origin_name]['geometry']
origin_edge = FG.edges[origin_name,'8866305',0]['geometry']
# The offsets are calculated in the projected coordinate system
origin_edge = transform(wgs_to_utm,origin_edge)

# Total length of the transect; the edge is offset by half of this length to both sides
cd_length = 1000
left = origin_edge.parallel_offset(cd_length / 2, 'left')
right = origin_edge.parallel_offset(cd_length / 2, 'right')
# The transect connects the second boundary point (end point) of both offset lines
# NOTE(review): whether a right-hand offset keeps or reverses the direction of the line depends on the Shapely/GEOS version,
# so the end point of `right` may correspond to either end of the edge - confirm that the transect is perpendicular to the edge
perp_left = left.boundary.geoms[1]
perp_right = right.boundary.geoms[1]
port_entrance_transect = transform(utm_to_wgs,LineString([perp_left, perp_right]))

In [None]:
# Read the ship dataframe from the blob storage and load it into memory as a pandas dataframe
ddf = dd.read_parquet(path_name+'/ship_dataframe',storage_options={"account_name": "rwsais", "sas_token": sas_token})
ship_dataframe = ddf.compute()

# The stored length of these two ships is divided by 10
for ship_id in ('testschip-30642','testschip-8325'):
    ship_dataframe.loc[ship_id,'length'] = ship_dataframe.loc[ship_id,'length']/10

### Create voyage dataframe

In [None]:
def convert_string_geometry_to_shapely_geometry(df,geometry_columns):
    """ 
    Function that parses the WKT strings in the given columns into shapely geometries

    The dataframe is returned untouched when it is empty or when the first value of the
    'origin' column equals 'a' (presumably the dummy data that dask uses to infer the
    output structure - TODO confirm).

    Parameters
    ----------
    df: pandas dataframe with trips (modified in place)
    geometry_columns: names of the columns that hold geometries as WKT strings

    :returns: pandas dataframe
    """

    nothing_to_convert = df.empty or df['origin'].iloc[0] == 'a'
    if not nothing_to_convert:
        for column_name in geometry_columns:
            df[column_name] = df[column_name].apply(wkt.loads)
    return df

def add_bounds(df):
    """ 
    Determines if the trip is inbound (to anchorage or to terminal) and outbound

    The rows are grouped per ship ('name') and per trip ('trip_id'), in order of first appearance.
    Within a trip the first row is labelled 'to_terminal' and the last row 'from_terminal'. For trips
    with more than two rows the first row is labelled 'to_anchorage' and the second row 'to_terminal';
    any further rows in between keep NaN. A trip of a single row is labelled 'from_terminal', unless
    only its destination (and not its origin) is a known berth, in which case it is 'to_terminal'.
    The known berths are read from the module-level `berths` dataframe.

    Parameters
    ----------
    df: pandas dataframe with trips (not modified)

    :returns: pandas dataframe with the additional column 'bound'
    """

    # np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0. The column gets the
    # object dtype, since the labels written below are strings. The result is a new dataframe, so
    # that the input is left untouched.
    df = df.assign(bound=np.nan).astype({'bound': object})
    bound_position = df.columns.get_loc('bound')

    # The empty dataframe guarantees a result with the right columns when there are no trips
    labelled_trips = [pd.DataFrame(columns=df.columns)]
    for ship_name in dict.fromkeys(df.name):
        ship_df = df[df.name == ship_name]
        for trip_index in dict.fromkeys(ship_df.trip_id):
            trip_df = ship_df[ship_df.trip_id == trip_index].copy()
            trip_df.iloc[0,bound_position] = 'to_terminal'
            if len(trip_df) > 2:
                trip_df.iloc[0,bound_position] = 'to_anchorage'
                trip_df.iloc[1,bound_position] = 'to_terminal'
            trip_df.iloc[-1,bound_position] = 'from_terminal'
            if len(trip_df) == 1:
                # A single row is either the inbound or the outbound leg: decide based on the berths
                berth_names = list(berths.index)
                berth_at_origin = trip_df.origin.iloc[0] in berth_names
                berth_at_departure = trip_df.destination.iloc[0] in berth_names
                if berth_at_origin:
                    trip_df.iloc[0,bound_position] = 'from_terminal'
                elif berth_at_departure:
                    trip_df.iloc[0,bound_position] = 'to_terminal'
            labelled_trips.append(trip_df)

    # A single concatenation instead of one concatenation per trip
    return pd.concat(labelled_trips)

def determine_location_of_turning(inbound_df,outbound_df):
    """ 
    Function that determines over which trip (inbound/outbound), which turning basin, and for what duration 
    a vessel is turning

    For both trips, the longest run of consecutive track points inside each intersected turning basin
    is determined. The basin (and trip) with the longest duration of such a run is regarded as the
    location of turning. The turning basins are read from the module-level `turning_basins` dataframe.

    Parameters
    ----------
    inbound_df: pandas dataframe with the inbound trip (may be empty)
    outbound_df: pandas dataframe with the outbound trip (may be empty)

    Returns
    -------
    turning_basin_at_arrival: if applicable, the turning basin name that was used during arrival 
    turning_basin_at_departure: if applicable, the turning basin name that was used during departure
    arrival_at_turning_basin: the time at which the vessel started turning
    departure_from_turning_basin: the time at which the vessel stopped turning
    sailing_distance_to_turning_basin: the length of the inbound track up to the point at which the turning started
    sailing_distance_from_turning_basin: the length of the outbound track minus the length of the part of the
                                         track that starts at the point at which the turning started
    """

    from itertools import groupby

    turning_basin_at_arrival = ''
    turning_basin_at_departure = ''
    arrival_at_turning_basin = pd.Timestamp('NaT')
    departure_from_turning_basin = pd.Timestamp('NaT')
    # np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
    sailing_distance_to_turning_basin = np.nan
    sailing_distance_from_turning_basin = np.nan
    turning_times_in_turning_basin = {'to_terminal':{},'from_terminal':{}}
    time_bounds_in_turning_basin = {'to_terminal':{},'from_terminal':{}}

    for bound,df in zip(['to_terminal','from_terminal'],[inbound_df,outbound_df]):
        if not len(df):
            continue
        trajectory = df.geometry.iloc[0]
        passed_turning_basins = turning_basins[[trajectory.intersects(basin_info.geometry) for _,basin_info in turning_basins.iterrows()]]
        for name,basin_info in passed_turning_basins.iterrows():
            times = df.times.iloc[0]
            # Positions of the track points that lie inside the turning basin
            indices = [position for position,coord in enumerate(trajectory.coords) if Point(coord).intersects(basin_info.geometry)]
            if not indices:
                continue
            # Consecutive positions share the same difference between value and rank; the first of
            # the longest runs is selected
            runs = [[position for _,position in group] for _,group in groupby(enumerate(indices),key=lambda pair: pair[1]-pair[0])]
            longest_run = max(runs,key=len)
            times_in_basin = [times[position] for position in longest_run if position < len(times)]
            if times_in_basin:
                turning_time = times_in_basin[-1]-times_in_basin[0]
                # NOTE(review): .item() is assumed to return a datetime.timedelta here, which holds for
                # numpy timedeltas of microsecond resolution or coarser - confirm the resolution of 'times'
                turning_times_in_turning_basin[bound][name] = turning_time.item().total_seconds()
                # Stored as [last time, first time]
                time_bounds_in_turning_basin[bound][name] = [times_in_basin[-1],times_in_basin[0]]

    if turning_times_in_turning_basin['to_terminal'] or turning_times_in_turning_basin['from_terminal']:
        # Longest turning duration per bound (-1 if no turning basin was passed on that trip)
        maxima = [max(durations.values()) if durations else -1 for durations in turning_times_in_turning_basin.values()]
        turning_bound = list(time_bounds_in_turning_basin.keys())[np.argmax(maxima)]
        turning_basin = max(turning_times_in_turning_basin[turning_bound], key=turning_times_in_turning_basin[turning_bound].get)
        turning_times = time_bounds_in_turning_basin[turning_bound][turning_basin]
        arrival_at_turning_basin = turning_times[-1]
        departure_from_turning_basin = turning_times[0]
        if turning_bound == 'to_terminal':
            geometry_arrival = inbound_df.geometry.iloc[0]
            turning_basin_at_arrival = turning_basin
            times = list(inbound_df.times.iloc[0])
            track_to_turning_basin = [Point(coord) for coord in geometry_arrival.coords][0:times.index(turning_times[-1])]
            sailing_distance_to_turning_basin = 0
            if len(track_to_turning_basin) > 1:
                track_to_turning_basin = LineString(track_to_turning_basin)
                sailing_distance_to_turning_basin = transform(wgs_to_utm,track_to_turning_basin).length
        elif turning_bound == 'from_terminal':
            geometry_departure = outbound_df.geometry.iloc[0]
            turning_basin_at_departure = turning_basin    
            times = list(outbound_df.times.iloc[0])
            track_from_turning_basin = [Point(coord) for coord in geometry_departure.coords][times.index(turning_times[-1]):]
            sailing_distance_from_turning_basin = 0
            if len(track_from_turning_basin) > 1:
                track_from_turning_basin = LineString(track_from_turning_basin)
                sailing_distance_from_turning_basin = transform(wgs_to_utm,geometry_departure).length - transform(wgs_to_utm,track_from_turning_basin).length

    return turning_basin_at_arrival,turning_basin_at_departure,arrival_at_turning_basin,departure_from_turning_basin,sailing_distance_to_turning_basin,sailing_distance_from_turning_basin

def find_time_and_location_of_crossing(df,area_geometry,mode):
    """ 
    Function that finds the location and timing of a vessel crossing a linestring over its route

    The track is walked segment by segment. The time of crossing is interpolated linearly between
    the times of the two end points of the crossing segment. In 'departure' mode the search stops at
    the first crossing; in any other mode the search continues, so that the last crossing of the
    track is returned.
    NOTE(review): confirm that using the last crossing for arrivals is intended.

    Parameters
    ----------
    df: pandas dataframe with trips (the geometry and times of the first row are used)
    area_geometry: LineString geometry of the geometry of interest
    mode: 'arrival' or 'departure' determining which trip should be considered (in- or outbound)

    Returns
    -------
    arrival_departure_time: time of crossing (NaT if the track does not cross the geometry)
    arrival_departure_distance: distance along the track from the origin of the trip to the crossing point; in
                                'departure' mode the remaining distance from the crossing point to the end of
                                the track (NaN if the track does not cross the geometry)
    """

    arrival_departure_time = pd.Timestamp('NaT')
    # np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
    arrival_departure_distance = np.nan
    times = df.times.iloc[0]
    trajectory_geometry = transform(wgs_to_utm,df.geometry.iloc[0])
    area_geometry = transform(wgs_to_utm,area_geometry)
    track_points = list(trajectory_geometry.coords)
    distance = 0
    for index,(start_point_segment,end_point_segment) in enumerate(zip(track_points[:-1],track_points[1:])):
        line = LineString([start_point_segment,end_point_segment])
        distance += line.length
        if not line.intersects(area_geometry):
            continue
        arrival_departure_point = line.intersection(area_geometry)
        # The end points are taken from the coordinates instead of from line.boundary, since the
        # boundary of a zero-length segment (two identical track points) is empty
        distance_from_start = Point(start_point_segment).distance(arrival_departure_point)
        distance_to_end = Point(end_point_segment).distance(arrival_departure_point)
        segment_span = distance_from_start + distance_to_end
        # A zero-length segment on the geometry has no span: the crossing is at its start
        offset_percentage = distance_from_start/segment_span if segment_span else 0.0
        arrival_departure_time = (times[index+1]-times[index])*offset_percentage+times[index]
        arrival_departure_distance = distance-line.length*(1-offset_percentage)
        if mode == 'departure':
            arrival_departure_distance = trajectory_geometry.length - arrival_departure_distance
            break

    return arrival_departure_time,arrival_departure_distance

def merge_data(df,variable_name):
    """ 
    Function that chains the sequences stored in one column of a dataframe into a single list

    Parameters
    ----------
    df: pandas dataframe with trips
    variable_name: name of the column of which every cell holds a sequence of values

    returns: a list with the values of all rows of the column, in row order
    """

    rows = (df.iloc[position] for position in range(len(df)))
    return [value for row in rows for value in row[variable_name]]
    
def create_vessel_journeys(df,berths,turning_basins,harbour_basins,scheme_information):
    """ 
    Function that creates a dataframe of vessel voyages consisting of an in- and outbound trip

    Every combination of ship ('name') and trip ('trip_id') results in one row, which combines the
    information of the rows labelled 'to_anchorage', 'to_terminal' and 'from_terminal' in the column
    'bound'. The function uses the module-level `port_entrance_transect` and `wgs_to_utm`, and the
    functions merge_data, find_time_and_location_of_crossing and determine_location_of_turning.

    Parameters
    ----------
    df: pandas dataframe with trips, including the column 'bound'
    berths: dataframe with the berths as index, should contain the harbour basin name ('Harbour_basin')
    turning_basins: dataframe with geometries of the turning basins (not used directly by this function;
                    determine_location_of_turning reads the module-level turning basins)
    harbour_basins: dataframe with names ('Name') and geometries of the harbour basins
    scheme_information: information in a dictionary with column names as names and pyarrow datatypes as values

    returns: pandas dataframe with one row per voyage and the columns of scheme_information
    """
    columns = list(scheme_information.keys())
    df_trips = pd.DataFrame(columns=columns)
    for datacolumn,datatype in scheme_information.items(): 
        df_trips[datacolumn] = df_trips[datacolumn].astype(datatype.to_pandas_dtype())

    # Loop-invariant lookups
    berth_names = list(berths.index)
    harbour_basin_names = list(harbour_basins.Name)

    voyages = [df_trips]
    for ship_name in dict.fromkeys(df.name):
        ship_df = df[df.name == ship_name]
        for trip_number in dict.fromkeys(ship_df.trip_id):
            trip_df = ship_df[ship_df.trip_id == trip_number]
            trip_anchorage = trip_df[trip_df.bound == 'to_anchorage']
            trip_arrival = trip_df[trip_df.bound == 'to_terminal']
            trip_departure = trip_df[trip_df.bound == 'from_terminal']

            #Defaults of the values that are only set for an inbound or an outbound trip
            #(np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0)
            arrival_at_harbour_entrance = pd.Timestamp('NaT')
            departure_from_harbour_entrance = pd.Timestamp('NaT')
            sailing_distance_to_harbour_entrance = np.nan
            sailing_distance_from_harbour_entrance = np.nan

            #Geometries (NaN if the voyage does not include the trip)
            geometry_anchorage = np.nan
            geometry_arrival = np.nan
            geometry_departure = np.nan
            if len(trip_anchorage):
                geometry_anchorage = trip_anchorage.geometry.iloc[0]
            if len(trip_arrival):
                geometry_arrival = trip_arrival.geometry.iloc[0]
            if len(trip_departure):
                geometry_departure = trip_departure.geometry.iloc[0]

            #Parameters (the data of all rows of the trip combined)
            coordinates = merge_data(trip_df,'coordinates')
            times = merge_data(trip_df,'times')
            sog = merge_data(trip_df,'sog')
            cog = merge_data(trip_df,'cog')
            speed = merge_data(trip_df,'speed')
            direction = merge_data(trip_df,'direction')
            acceleration = merge_data(trip_df,'acceleration')

            #Origin
            if len(trip_arrival):
                # The voyage starts with the trip to the anchorage, if there is one
                first_trip = trip_anchorage if len(trip_anchorage) else trip_arrival
                origin = first_trip.origin.iloc[0]
                arrival_at_port = first_trip.arrival.iloc[0]
                berth = trip_arrival.destination.iloc[0]
                draught_at_arrival = trip_arrival.draught.iloc[0]
            else:
                # No inbound trip: the voyage starts at the berth of the outbound trip
                origin = berth = trip_departure.origin.iloc[0]
                arrival_at_port = trip_departure.departure.iloc[0]
                draught_at_arrival = trip_departure.draught.iloc[0]

            #Harbour entrance: the exterior of the harbour basin of the berth (empty if unknown)
            harbour_entrance_transect = LineString()
            if berth in berth_names:
                harbour_basin_name = berths.loc[berth].Harbour_basin
                if harbour_basin_name in harbour_basin_names:
                    harbour_entrance_transect = harbour_basins[harbour_basins.Name == harbour_basin_name].iloc[0].geometry.exterior

            #Anchorage at arrival
            anchorage_at_arrival = np.nan
            arrival_at_anchorage_at_arrival = pd.Timestamp('NaT')
            departure_from_anchorage_at_arrival = pd.Timestamp('NaT')
            sailing_distance_to_anchorage = np.nan
            if len(trip_anchorage):
                anchorage_at_arrival = trip_anchorage.destination.iloc[0]
                arrival_at_anchorage_at_arrival = trip_anchorage.arrival.iloc[0]
                departure_from_anchorage_at_arrival = trip_arrival.departure.iloc[0]
                sailing_distance_to_anchorage = trip_anchorage.distance.iloc[0]

            #Entry
            arrival_at_port_entrance = pd.Timestamp('NaT')
            sailing_distance_to_port_entrance = np.nan
            if len(trip_arrival):
                arrival_at_port_entrance,sailing_distance_to_port_entrance = find_time_and_location_of_crossing(trip_arrival,port_entrance_transect,'arrival')
                arrival_at_harbour_entrance,sailing_distance_to_harbour_entrance = find_time_and_location_of_crossing(trip_arrival,harbour_entrance_transect,'arrival')

            #Turning Basin
            turning_basin_at_arrival,turning_basin_at_departure,arrival_at_turning_basin,departure_from_turning_basin,sailing_distance_to_turning_basin,sailing_distance_from_turning_basin = determine_location_of_turning(trip_arrival,trip_departure) 

            #Berth inbound
            berth_of_call = np.nan
            arrival_at_berth = pd.Timestamp('NaT')
            sailing_distance_to_berth = np.nan
            if len(trip_arrival):
                berth_of_call = trip_arrival.destination.iloc[0]
                arrival_at_berth = trip_arrival.arrival.iloc[0]
                sailing_distance_to_berth = transform(wgs_to_utm,geometry_arrival).length

            #Berth outbound (the origin of the outbound trip takes precedence as berth of call)
            departure_from_berth = pd.Timestamp('NaT')
            sailing_distance_from_berth = np.nan
            if len(trip_departure):
                berth_of_call = trip_departure.origin.iloc[0]
                departure_from_berth = trip_departure.departure.iloc[0]
                sailing_distance_from_berth = transform(wgs_to_utm,geometry_departure).length

            #Departure
            departure_from_port_entrance = pd.Timestamp('NaT')
            sailing_distance_from_port_entrance = np.nan
            if len(trip_departure):
                departure_from_port_entrance,sailing_distance_from_port_entrance = find_time_and_location_of_crossing(trip_departure,port_entrance_transect,'departure')
                departure_from_harbour_entrance,sailing_distance_from_harbour_entrance = find_time_and_location_of_crossing(trip_departure,harbour_entrance_transect,'departure')

            #Destination  
            last_trip = trip_departure if len(trip_departure) else trip_arrival
            destination = last_trip.destination.iloc[0]
            draught_at_departure = last_trip.draught.iloc[0]
            departure_from_port = last_trip.arrival.iloc[0]

            #The order of the values has to match the order of the columns in scheme_information
            data = [trip_number,
                    ship_name,
                    origin,
                    anchorage_at_arrival,
                    turning_basin_at_arrival,
                    berth_of_call,
                    turning_basin_at_departure,
                    destination,
                    draught_at_arrival,
                    draught_at_departure,
                    arrival_at_port,
                    arrival_at_anchorage_at_arrival,
                    departure_from_anchorage_at_arrival,
                    arrival_at_port_entrance,
                    arrival_at_harbour_entrance,
                    arrival_at_turning_basin,
                    arrival_at_berth,
                    departure_from_berth,
                    departure_from_turning_basin,
                    departure_from_harbour_entrance,
                    departure_from_port_entrance,
                    departure_from_port,
                    geometry_anchorage,
                    geometry_arrival,
                    geometry_departure,
                    sailing_distance_to_anchorage,
                    sailing_distance_to_port_entrance,
                    sailing_distance_to_harbour_entrance,
                    sailing_distance_to_turning_basin,
                    sailing_distance_to_berth,
                    sailing_distance_from_berth,
                    sailing_distance_from_turning_basin,
                    sailing_distance_from_harbour_entrance,
                    sailing_distance_from_port_entrance,
                    times,
                    coordinates,
                    sog,
                    cog,
                    speed,
                    direction,
                    acceleration]    
            voyages.append(pd.DataFrame(data=[data],columns=columns))

    if len(voyages) == 1:
        # No voyages: return the empty dataframe with the datatypes of the scheme
        return df_trips
    # A single concatenation instead of one concatenation per voyage
    return pd.concat(voyages)

def change_data_format(df,data_columns,scheme_information):
    """ 
    Function that transforms the geometry data into string data in order to save it in dask

    In the given columns, every list or tuple is replaced by a list with the string representation
    of its elements (an empty list or tuple is left as it is), and every other value is replaced by
    its string representation.

    Parameters
    ----------
    df: pandas dataframe with geometries
    data_columns: column names of geometries
    scheme_information: information in a dictionary with column names as names and pyarrow datatypes as values

    :returns: pandas dataframe
    """

    new_df = pd.DataFrame(columns=scheme_information.keys())
    for datacolumn,datatype in scheme_information.items(): 
        new_df[datacolumn] = new_df[datacolumn].astype(datatype.to_pandas_dtype())

    converted_rows = [new_df]
    for loc,info in df.iterrows():
        # The values are converted before the row dataframe is built, so that no lists or strings
        # have to be written into existing cells (which may hold a different datatype)
        record = dict(info.items())
        for data_column in data_columns:
            value = record[data_column]
            if isinstance(value,(list,tuple)):
                if value:
                    record[data_column] = [str(data) for data in value]
            else:
                record[data_column] = str(value)
        converted_rows.append(pd.DataFrame([record],index=[loc]))

    # A single concatenation instead of one concatenation per row
    return pd.concat(converted_rows)

def reset_index(df):
    """ 
    Function that replaces the index of a pandas dataframe by a new default index

    The old index is dropped and not kept as a column.

    Parameters
    ----------
    df: pandas dataframe

    :returns: pandas dataframe
    """
    return df.reset_index(drop=True)

def set_time_UTC(df,time_columns):
    """ 
    Function that converts the given datetime columns into timezone-aware columns in utc

    Timestamps without a time zone are localized as utc, timestamps with a time zone are kept, and
    every value that is not a pandas timestamp becomes NaT. The rows are matched on the index labels
    of the dataframe.

    Parameters
    ----------
    df: pandas dataframe (modified in place)
    time_columns: columns that contain datetime elements

    :returns: pandas dataframe
    """
    utc_frame = pd.DataFrame(columns=time_columns)
    for column_name in time_columns:
        # Start from a column with only NaT values in utc and fill in the timestamps one by one
        number_of_rows = len(df[column_name])
        utc_frame[column_name] = pd.to_datetime([None]*number_of_rows, utc=True)
        for label,row in df[[column_name]].iterrows():
            stamp = row[column_name]
            if not isinstance(stamp,pd.Timestamp):
                continue
            if stamp.tz:
                utc_frame.loc[label,column_name] = stamp
            else:
                utc_frame.loc[label,column_name] = stamp.tz_localize(datetime.timezone.utc)

        df[column_name] = utc_frame[column_name]
    return df

In [None]:
# Columns with the timestamps of the voyage (stored as timezone-aware timestamps in utc)
time_columns = ['arrival_at_port',
                'arrival_at_anchorage_at_arrival',
                'departure_from_anchorage_at_arrival',
                'arrival_at_port_entrance',
                'arrival_at_harbour_entrance',
                'arrival_at_turning_basin',
                'arrival_at_berth',
                'departure_from_berth',
                'departure_from_turning_basin',
                'departure_from_harbour_entrance',
                'departure_from_port_entrance',
                'departure_from_port']

# Column names and pyarrow datatypes of the voyage dataframe; the order of the columns has to match
# the order of the values that create_vessel_journeys writes per voyage
scheme_information = {
    'trip_number': pa.int64(),
    **{column: pa.string() for column in ('name',
                                          'origin',
                                          'anchorage_at_arrival',
                                          'turning_basin_at_arrival',
                                          'berth_of_call',
                                          'turning_basin_at_departure',
                                          'destination')},
    **{column: pa.float64() for column in ('draught_at_arrival',
                                           'draught_at_departure')},
    **{column: pa.timestamp('ns', tz='UTC') for column in time_columns},
    **{column: pa.string() for column in ('geometry_anchorage',
                                          'geometry_entry',
                                          'geometry_departure')},
    **{column: pa.float64() for column in ('sailing_distance_to_anchorage',
                                           'sailing_distance_to_port_entrance',
                                           'sailing_distance_to_harbour_entrance',
                                           'sailing_distance_to_turning_basin',
                                           'sailing_distance_to_berth',
                                           'sailing_distance_from_berth',
                                           'sailing_distance_from_turning_basin',
                                           'sailing_distance_from_harbour_entrance',
                                           'sailing_distance_from_port_entrance')},
    'times': pa.list_(pa.timestamp('us')),
    'coordinates': pa.list_(pa.string()),
    **{column: pa.list_(pa.float64()) for column in ('sog',
                                                     'cog',
                                                     'speed',
                                                     'direction',
                                                     'acceleration')},
}

# Empty dataframe with the columns and datatypes of the scheme (passed to dask as meta)
voyage_df = pd.DataFrame(columns=list(scheme_information.keys()))
for datacolumn,datatype in scheme_information.items(): 
    voyage_df[datacolumn] = voyage_df[datacolumn].astype(datatype.to_pandas_dtype())

In [None]:
# Open the trip dataframe in the blob storage as a dask dataframe (this rebinds ddf, which held the ship dataframe before)
ddf = dd.read_parquet(path_name+'/trip_dataframe',storage_options={"account_name": "rwsais", "sas_token": sas_token})

In [None]:
# Build the processing steps per partition; the slice selects all partitions
# (presumably a subset of the partitions can be selected here for testing - TODO confirm)
ddf_i = ddf.partitions[:]
# 1. parse the geometry strings into shapely geometries
ddf_i = ddf_i.map_partitions(convert_string_geometry_to_shapely_geometry,['coordinates','geometry'])
# 2. label the rows of every trip as inbound or outbound (no meta is passed for this step)
ddf_i = ddf_i.map_partitions(add_bounds)
# 3. combine the rows of every trip into a single voyage
ddf_i = ddf_i.map_partitions(create_vessel_journeys,berths,turning_basins,harbour_basins,scheme_information=scheme_information,meta=voyage_df)
# 4. convert the geometries back to strings, so that the data can be stored
ddf_i = ddf_i.map_partitions(change_data_format,data_columns=['origin',
                                                              'destination',
                                                              'geometry_anchorage', 
                                                              'geometry_entry', 
                                                              'geometry_departure',
                                                              'coordinates'],scheme_information=scheme_information,meta=voyage_df)
# 5. renumber the rows of every partition (set_time_UTC matches the rows on their index labels)
ddf_i = ddf_i.map_partitions(reset_index,meta=voyage_df)
# 6. make the timestamps timezone-aware in utc
ddf_i = ddf_i.map_partitions(set_time_UTC,time_columns=time_columns,meta=voyage_df)

# Cast every column to the pandas datatype that corresponds to the scheme
for datacolumn,datatype in scheme_information.items(): 
   ddf_i[datacolumn] = ddf_i[datacolumn].astype(datatype.to_pandas_dtype())
    
# Write the voyages to the blob storage with the datatypes of the scheme
ddf_i.to_parquet(path_name+'/voyage_dataframe',storage_options={"account_name": "rwsais", "sas_token": sas_token},schema=scheme_information,engine='pyarrow')

In [None]:
# Close the dask cluster once the voyage dataframe has been written
cluster.close()