In [None]:
import pandas as pd
from pandas.core.dtypes.common import is_datetime64_any_dtype

# Load both source tables in one pass; the unpacking order matches the
# order of the paths in the tuple below.
vessel_positions, vessel_characteristics = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/vessel_positions.csv',
        'data/vessel_characteristics.csv',
    )
)

The `to_timestamp` function is used to convert datetime columns stored as string objects to the datetime64[ns, UTC] dtype. This conversion provides accurate comparisons, date arithmetic, and timezone handling, as well as faster operations, lower storage, and cleaner built-in support in pandas and other libraries.
It will be used on the following columns:

- zone_entry_time
- zone_out_time
- port_entry_time
- port_out_time
- operation_location_entry_time
- operation_location_out_time
- waiting_zone_entry_time
- waiting_zone_out_time
- ais_eta
- last_seen

In [149]:
def to_timestamp(dataframe, column_name):
    """Convert a mixed Excel-serial / "DD/MM/YYYY HH:MM" column to UTC timestamps.

    Numeric values (including numeric strings) are interpreted as Excel
    serial day counts; every other value is parsed as a
    "DD/MM/YYYY HH:MM" string. The result is rounded to the nearest second.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column to convert. It is not modified.
    column_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.Series
        The column unchanged if it already has a datetime dtype; otherwise a
        timezone-aware (UTC) datetime Series in which every value that cannot
        be parsed -- including serial numbers outside the representable
        timestamp range -- is NaT.
    """
    s = dataframe[column_name]

    # Already converted (e.g. the cell was re-run): nothing to do.
    if pd.api.types.is_datetime64_any_dtype(s):
        return s

    # Floats where the value is numeric, NaN otherwise.
    numbers = pd.to_numeric(s, errors="coerce")

    # Excel serials count days from 1899-12-30. errors="coerce" turns
    # out-of-range serials into NaT instead of raising, which matches the
    # documented "unparseable rows become NaT" contract.
    dt1 = pd.to_datetime(
        numbers, unit="D", origin="1899-12-30", utc=True, errors="coerce"
    )

    # Only the non-numeric rows are candidates for string parsing.
    dates = s[numbers.isna()]

    # The explicit format already fixes the day/month order, so no
    # dayfirst hint is needed.
    dt2 = pd.to_datetime(
        dates, format="%d/%m/%Y %H:%M", utc=True, errors="coerce"
    )

    # fillna aligns on the index, so each parsed string lands on the row it
    # came from; rows present in neither result stay NaT.
    final_column = dt1.fillna(dt2)

    # Fractional serial days carry sub-second noise; round it away.
    return final_column.dt.round("s")

In [None]:
# Normalise the time-like columns of vessel_positions in place.

# Columns handled by to_timestamp.
datetime_cols = [
    'zone_entry_time',
    'zone_out_time',
    'port_entry_time',
    'port_out_time',
    'operation_location_entry_time',
    'operation_location_out_time',
    'waiting_zone_entry_time',
    'waiting_zone_out_time',
    'ais_eta',
    'last_seen',
]
for col in datetime_cols:
    converted = to_timestamp(vessel_positions, col)
    vessel_positions[col] = converted

# Columns parsed directly with a fixed "%d/%m/%Y" format; values that do not
# match the format become NaT because of errors="coerce".
date_cols = ['status_date_time', 'updated_at']
for col in date_cols:
    vessel_positions[col] = pd.to_datetime(
        vessel_positions[col],
        format="%d/%m/%Y",
        dayfirst=True,
        errors="coerce",
    )