In [None]:
import pandas as pd
from pandas.core.dtypes.common import is_datetime64_any_dtype

# Load the two raw CSV inputs into DataFrames.
# Both paths are relative, so they resolve against the current working directory.
vessel_positions = pd.read_csv('data/vessel_positions.csv')
vessel_characteristics = pd.read_csv('data/vessel_characteristics.csv')

The to_timestamp function is used to convert datetime columns stored as strings to the datetime64[ns, UTC] dtype. This conversion enables accurate comparisons, date arithmetic and timezone handling, and it brings faster operations, lower storage use, and built-in support in pandas and other libraries.

Three additional functions, object_to_string, float_to_Int64 and int01_to_boolean, were implemented to convert columns from the default data types to the data types that suit the contents of each column best.

In [None]:
def to_timestamp(dataframe, column_name):
    """Convert a mixed Excel-serial / "DD/MM/YYYY HH:MM" column to UTC timestamps.

    The column may mix Excel serial day numbers (numeric values or numeric
    strings) with "DD/MM/YYYY HH:MM" text. Both are merged into a single
    timezone-aware (UTC) Series, rounded to the nearest second.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column. It is not modified.
    column_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.Series
        The column itself, unchanged, if it already has a datetime dtype.
        Otherwise a new UTC datetime Series with the same index, in which
        every unparseable value is NaT. This includes numeric values that
        fall outside the representable timestamp range.
    """
    raw = dataframe[column_name]

    # Already a datetime column: hand it back untouched.
    if pd.api.types.is_datetime64_any_dtype(raw):
        return raw

    # Numeric-looking cells are treated as Excel serial day counts;
    # everything else becomes NaN and is handled by the text parser below.
    serials = pd.to_numeric(raw, errors="coerce")

    # 1899-12-30 is the day-zero that lines up with Excel serial numbers.
    # errors="coerce" keeps a single out-of-range number from aborting the
    # whole column; such cells become NaT like any other unparseable value.
    from_serials = pd.to_datetime(
        serials, unit="D", origin="1899-12-30", utc=True, errors="coerce"
    )

    # Only the rows that were not numeric are parsed as day-first text.
    text_rows = raw[serials.isna()]
    from_text = pd.to_datetime(
        text_rows,
        format="%d/%m/%Y %H:%M",
        dayfirst=True,
        utc=True,
        errors="coerce",
    )

    # fillna aligns on the index, so each text result lands on its own row.
    merged = from_serials.fillna(from_text)

    # Fractional Excel days carry float noise; snap to whole seconds.
    return merged.dt.round("s")

In [None]:
def object_to_string(dataframe):
    """Cast purely textual object columns to pandas' ``string`` dtype.

    Every object-dtype column is inspected. If all of its non-missing values
    are ``str`` instances, the column is replaced by its ``string``-dtype
    equivalent. Columns that hold any other kind of object are left alone.

    The dataframe is modified in place and also returned.
    """
    for name in dataframe.select_dtypes(include=["object"]).columns:
        present = dataframe[name].dropna()
        only_text = all(map(lambda value: isinstance(value, str), present))
        if only_text:
            dataframe[name] = dataframe[name].astype("string")

    return dataframe

In [None]:
import numpy as np


def float_to_Int64(dataframe):
    """Convert integer-like float columns to pandas' nullable ``Int64`` dtype.

    A float column is converted when every non-missing value is finite, lies
    within 1e-8 of a whole number (on either side), and fits in a signed
    64-bit integer. Missing values are kept as ``<NA>``. Columns that do not
    meet these conditions are left untouched.

    The dataframe is modified in place and also returned.
    """
    # Same absolute tolerance that np.isclose(x, 0) applies by default.
    tolerance = 1e-8
    int64_limit = 2.0 ** 63

    for column in dataframe.select_dtypes(include=["float"]).columns:
        values = dataframe[column].dropna().to_numpy()

        # inf cannot be represented as an integer; skip before doing arithmetic.
        if not np.isfinite(values).all():
            continue

        # Measure the distance to the nearest integer so that values slightly
        # below a whole number (2.9999999999) count just like values slightly
        # above it (2.0000000001).
        nearest = np.round(values)
        integer_like = (np.abs(values - nearest) <= tolerance).all()
        fits_int64 = (np.abs(nearest) < int64_limit).all()

        if integer_like and fits_int64:
            # Round first: the Int64 cast rejects floats that are not exactly
            # integral, so near-integers would otherwise raise.
            dataframe[column] = dataframe[column].round().astype("Int64")

    return dataframe

In [None]:
def int01_to_boolean(dataframe):
    """Cast integer columns that only hold 0 and 1 to pandas' ``boolean`` dtype.

    Each integer-dtype column is checked. When all of its non-missing values
    are 0 or 1, the column is replaced by its nullable ``boolean`` equivalent.
    Any other integer column is left as is.

    The dataframe is modified in place and also returned.
    """
    candidates = dataframe.select_dtypes(include=["integer"]).columns

    for name in candidates:
        observed = dataframe[name].dropna()
        if not observed.isin([0, 1]).all():
            continue
        dataframe[name] = dataframe[name].astype("boolean")

    return dataframe

In [None]:
def to_date(column):
    """Parse a column of date strings into a datetime Series.

    Two layouts are tried in order: day-first "DD/MM/YYYY", then ISO
    "YYYY-MM-DD". Each pass only touches the rows that are still NaT after the
    previous one. Values that match neither layout stay NaT. No timezone is
    requested during parsing.

    Returns a new Series that shares the index of ``column``.
    """
    # Each entry pairs a layout with the dayfirst flag used while parsing it.
    layouts = (("%d/%m/%Y", True), ("%Y-%m-%d", False))

    result = pd.Series(pd.NaT, index=column.index)

    for layout, day_first in layouts:
        pending = result.isna()
        result.loc[pending] = pd.to_datetime(
            column.loc[pending],
            format=layout,
            dayfirst=day_first,
            errors="coerce",
        )

    return result

In [None]:
# vessel_positions data cleaning
#
# Normalises the dtypes of the vessel_positions frame in place: timestamp
# columns first, then date columns, then the generic boolean / string / Int64
# conversions. The order of the last three steps affects the result (see the
# review note below).

# Convert datetime columns of string objects to datetime64[ns, UTC] dtype

datetime_cols = ['zone_entry_time', 'zone_out_time', 'port_entry_time',
        'port_out_time', 'operation_location_entry_time', 'operation_location_out_time',
        'waiting_zone_entry_time', 'waiting_zone_out_time', 'ais_eta', 'last_seen']

# to_timestamp returns a new Series; assign it back to replace the raw column.
for col in datetime_cols:
    vessel_positions[col] = to_timestamp(vessel_positions, col)

# These columns are parsed as plain day-first dates ("%d/%m/%Y" only).
# Any value that does not match that exact layout becomes NaT (errors="coerce").
# NOTE(review): the column names suggest date-times; confirm that the source
# values really carry no time component, otherwise they are all coerced to NaT.
date_cols = ['status_date_time', 'updated_at']

for col in date_cols:
    vessel_positions[col] = pd.to_datetime(vessel_positions[col], format="%d/%m/%Y", dayfirst=True, errors="coerce")

# Convert boolean in nature columns from integer dtype to boolean dtype
# NOTE(review): this runs before float_to_Int64, so a 0/1 column that is still
# float at this point is only turned into Int64 below and never reaches the
# boolean check. Confirm whether that ordering is intended.
int01_to_boolean(vessel_positions)

# Convert text columns from object dtype to string dtype
object_to_string(vessel_positions)

# Convert integer-like float columns to Int64
float_to_Int64(vessel_positions)

In [None]:
# vessel_characteristics data cleaning
#
# Normalises the dtypes of the vessel_characteristics frame in place:
# the timestamp column, then the date columns, then the generic
# string / Int64 conversions.

# Convert datetime columns of string objects to datetime64[ns, UTC] dtype

datetime_cols = ['update_date']

# to_timestamp returns a new Series; assign it back to replace the raw column.
for col in datetime_cols:
    vessel_characteristics[col] = to_timestamp(vessel_characteristics, col)

# Date-only columns go through to_date, which accepts both "DD/MM/YYYY" and
# "YYYY-MM-DD" and leaves anything else as NaT.
date_cols = ['built', 'demolition_date', 'keel_laying_date', 'launching_date', 'reported_date', 'cancelled_date', 'last_dd_date', 'next_dd_date', 'last_ss_date', 'next_ss_date', 'conversion_date']

for col in date_cols:
    vessel_characteristics[col] = to_date(vessel_characteristics[col])

#todo:

# Convert text columns from object dtype to string dtype
object_to_string(vessel_characteristics)

# Convert integer-like float columns to Int64
float_to_Int64(vessel_characteristics)

# Convert boolean in nature columns from integer dtype to boolean dtype
# NOTE(review): the disabled call below targets vessel_positions, not
# vessel_characteristics, so no boolean conversion is applied to this frame.
# Fix the argument before re-enabling it.
# int01_to_boolean(vessel_positions)