# Machine Learning Method Selection
This Python code imports several libraries used for data analysis, machine learning, and data visualization (like sklearn, pandas, numpy, etc). It also modifies the system path to include the parent directory, which allows for the import of custom modules located elsewhere.

In [1]:
import sklearn
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
# sys.path.append(os.path.abspath(os.path.join('..')))
sys.path.insert(0, '..')

# https://scikit-learn.org/stable/index.html

# https://scikit-learn.org/stable/auto_examples/index.html
# - https://scikit-learn.org/stable/auto_examples/linear_model/index.html

# https://scikit-learn.org/stable/tutorial/index.html
# - https://scikit-learn.org/stable/tutorial/statistical_inference/index.html

# https://scikit-learn.org/stable/user_guide.html
# - https://scikit-learn.org/stable/supervised_learning.html

# https://scikit-learn.org/stable/modules/classes.html#


In [2]:
import predictor.broker as broker
broker.ETTAgent

predictor.broker.ETTAgent

We import the necessary modules, read two CSV files (rotterdam_hamburg.csv and felixstowe_rotterdam.csv) into pandas DataFrames, apply the cleaning function from the predictor.clean module to each DataFrame, and then concatenate the two cleaned DataFrames into one combined DataFrame.

In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split


import predictor.clean as clean


def _load_route(csv_path: str, label: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read one route's CSV, clean it and report how many rows were removed.

    Parameters
    ----------
    csv_path:
        Path of the CSV file to read; single quotes are used as the
        quoting character.
    label:
        Name printed in front of the before/after shape summary.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The raw frame as read from disk and the result of
        ``clean.clean_up`` applied to it.
    """
    raw = pd.read_csv(csv_path, quotechar="'")
    cleaned = clean.clean_up(raw)
    print(
        f"{label}: {raw.shape} to {cleaned.shape} ({raw.shape[0] - cleaned.shape[0]})")
    return raw, cleaned


# Rotterdam -> Hamburg
df_rtm_ham_raw, df_rtm_ham = _load_route(
    '../resources/rotterdam_hamburg/rotterdam_hamburg.csv', "df_rtm_ham")

# Felixstowe -> Rotterdam
df_fxt_rtm_raw, df_fxt_rtm = _load_route(
    '../resources/felixstowe_rotterdam/felixstowe_rotterdam.csv', "df_fxt_rtm")

# Stack both routes (raw and cleaned separately) with a fresh 0..n-1 index.
df_combined_raw = pd.concat(
    [df_rtm_ham_raw, df_fxt_rtm_raw], ignore_index=True)
df_combined = pd.concat([df_rtm_ham, df_fxt_rtm], ignore_index=True)


  df_rtm_ham_raw = pd.read_csv(


df_rtm_ham: (500142, 25) to (351170, 25) (148972)


  df_fxt_rtm_raw = pd.read_csv(


df_fxt_rtm: (527359, 26) to (239546, 25) (287813)


This code selects specific features from the combined DataFrame for further processing. These features are then prepared using a function from the predictor.clean module. Three time-related features are dropped from the data, then the remaining data (x) and target variable (y) are split into training and testing sets using scikit-learn's train_test_split() function.

In [4]:
from sklearn import tree
from sklearn import ensemble
from sklearn import linear_model
from sklearn import model_selection
from sklearn import neighbors
from sklearn import svm
from sklearn import preprocessing

from sklearn import pipeline


In [5]:
import sklearn as sk
from predictor.broker import ETTAgent


def prepare_and_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Turn a cleaned trip frame into train/test features and targets.

    The frame is passed through ``clean.prepare_data`` to obtain the
    feature matrix and target, the ``EndTime``, ``time`` and ``shiptype``
    columns are removed from the features, and the result is split
    80/20 with a fixed seed so repeated runs give the same partition.

    Parameters
    ----------
    df:
        Frame accepted by ``clean.prepare_data``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features, target = clean.prepare_data(df)
    features = features.drop(columns=["EndTime", "time", "shiptype"])

    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.2, random_state=42)
    return train_features, test_features, train_target, test_target

def controlled_test(ett_agent: ETTAgent) -> dict[str, float]:
    """Train an agent, print its R2 score and hand back its result dict.

    Parameters
    ----------
    ett_agent:
        Agent to train; its ``train`` and ``get_result_dict`` methods
        and its ``name`` attribute are used.

    Returns
    -------
    dict
        Whatever ``ett_agent.get_result_dict()`` returns; it must contain
        an ``'R2_Score'`` entry.
    """
    ett_agent.train()
    metrics = ett_agent.get_result_dict()

    agent_label = ett_agent.name
    r2_score = metrics['R2_Score']
    print(f"{agent_label}: {r2_score}")

    return metrics


### Testing Linear Models

In [6]:
# # loading the data
# ett_agents_dict: dict[str, ETTAgent] = {}
# for file in os.listdir('./models/'):
#     if file.endswith('.joblib'):
#         ett_agents_dict[file.replace('.joblib', '')] = ETTAgent.load(f'./models/{file}')
#     else:
#         print(f'File {file} is not a joblib file')

# # checking whether the data is loaded correctly
# results = {}
# for ett_agent_name, ett_agent in ett_agents_dict.items():
#     result = ett_agent.get_result_dict()
#     print(f"{ett_agent_name}: {result['R2_Score']}")

We've imported several machine learning methods into our code to compare results and choose the best one. These methods are:


**Random Forest Regressor**: An ensemble learning method that fits multiple decision trees on various sub-samples of the dataset and averages the predictions.

**Gradient Boosting Regressor**: Sequentially adds predictors to an ensemble, each one correcting its predecessor, to reduce both bias and variance.

**AdaBoost Regressor**: An ensemble learning method that combines multiple weak regressors by assigning weights to the training data points and adjusting the weights based on the performance of each weak regressor.

**Bagging Regressor**: An ensemble learning method that fits multiple instances of a base regressor on random subsets of the training data and averages their predictions.

**K-Nearest Neighbors (KNN) Regressor**: Predicts the target by finding the most similar instances in the training set and averaging their targets.

In [7]:
df_rtm_ham.keys()


Index(['TripID', 'MMSI', 'ID', 'StartLatitude', 'StartLongitude',
       'EndLatitude', 'EndLongitude', 'Latitude', 'Longitude', 'StartTime',
       'EndTime', 'time', 'StartPort', 'EndPort', 'shiptype', 'Length',
       'Breadth', 'Draught', 'SOG', 'COG', 'TH', 'Destination', 'Name',
       'Callsign', 'AisSourcen'],
      dtype='object')

In [8]:
# Smoke test: train a plain linear-regression agent, persist it, and check
# that both the in-memory agent and a copy reloaded from disk can run
# their test step.
smoke_test_features = [
    "EndLatitude", "EndLongitude",
    "Latitude", "Longitude",
    "Length",
    "SOG",
]

test_ett_agent = ETTAgent(
    name='test_ett_agent',
    estimator=linear_model.LinearRegression(),
    data=df_rtm_ham,
    custom_feature_include_list=smoke_test_features,
)
test_ett_agent.train()
# The load call below reads "test/test_ett_agent.joblib", so dump("test")
# presumably writes <directory>/<agent name>.joblib.
test_ett_agent.dump("test")

test_ett_agent.test()

reloaded_agent = ETTAgent.load("test/test_ett_agent.joblib")
reloaded_agent.test()


array([66768.78113536, 43256.99946555, 18947.16998636, ...,
       63746.62053703, 19453.21275856, 12439.26605158])

In [9]:
test_ett_agent.X_test.keys()

Index(['EndLatitude', 'EndLongitude', 'Latitude', 'Longitude', 'Length',
       'SOG'],
      dtype='object')

In [10]:
klll

NameError: name 'klll' is not defined

In [None]:
# Candidate regressors grouped by scikit-learn module. Everything uses the
# library defaults except the random forest, which is limited to 10 trees.
linear_models = {
    'LinearRegression': linear_model.LinearRegression(),
    'Ridge': linear_model.Ridge(),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet(),
    'Lars': linear_model.Lars(),
    'LassoLars': linear_model.LassoLars(),
    'BayesianRidge': linear_model.BayesianRidge(),
}

tree_models = {
    'DecisionTreeRegressor': tree.DecisionTreeRegressor(),
}

ensamble_models = {
    'RandomForestRegressor': ensemble.RandomForestRegressor(n_estimators=10),
    # Currently excluded from the comparison:
    # 'GradientBoostingRegressor': ensemble.GradientBoostingRegressor(),
    # 'AdaBoostRegressor': ensemble.AdaBoostRegressor(),
    'BaggingRegressor': ensemble.BaggingRegressor(),
}

neighbors_models = {
    'KNeighborsRegressor': neighbors.KNeighborsRegressor(),
}

# Combine the dictionaries above into one name -> estimator mapping
# (later groups would override earlier ones on a name clash).
all_models = {
    **linear_models,
    **tree_models,
    **ensamble_models,
    **neighbors_models,
}
# print(all_models)

# Every model is trained on the same Rotterdam -> Hamburg data and the
# same feature subset so that the scores are comparable.
feature_columns = [
    "EndLatitude", "EndLongitude",
    "Latitude", "Longitude",
    "Length",  # "Breadth",
    "SOG",
]

results = {}
ett_agents = []
for model_name, model in all_models.items():
    candidate = ETTAgent(
        name=model_name,
        estimator=model,
        data=df_rtm_ham,
        # Each agent gets its own copy of the feature list.
        custom_feature_include_list=list(feature_columns),
    )
    results[model_name] = controlled_test(candidate)
    # candidate.dump('./models')
    # ett_agents.append(candidate)

# One row per model, best R2 first.
report = pd.DataFrame.from_dict(results, orient='index')
report.sort_values(by=['R2_Score'], ascending=False)


# Best models + scalars
In the cell below, we compare the four best models with each of the candidate scalers (and with no scaler at all).



What we learn is a few things.

- The Random Forest Regressor performs best, followed by the Bagging Regressor.
- The RobustScaler seems to be the most performant in general.
- The Normalizer scaler increases the waiting time by a large margin.
- In general, the "scalers" **Normalizer**, **QuantileTransformer (normal and uniform)** and **PowerTransformer** do not perform well.

From now on, we will only use **RandomForestRegressor** and **BaggingRegressor** with **MinMaxScaler** and **RobustScaler**.

In [12]:
def determine_trip_route(df: pd.DataFrame) -> str:
    """Return the short route code for the trips stored in ``df``.

    Only the first row is inspected: its ``StartPort`` and ``EndPort``
    values decide the route.

    Parameters
    ----------
    df:
        Frame with ``StartPort`` and ``EndPort`` columns and at least
        one row.

    Returns
    -------
    str
        ``"rtm_ham"`` for ROTTERDAM -> HAMBURG, ``"fxt_rtm"`` for
        FELIXSTOWE -> ROTTERDAM.

    Raises
    ------
    ValueError
        If the first row describes any other port pair.
    """
    first_row = df.iloc[0]
    origin = first_row['StartPort']
    destination = first_row['EndPort']

    if origin == "ROTTERDAM" and destination == "HAMBURG":
        return "rtm_ham"
    if origin == "FELIXSTOWE" and destination == "ROTTERDAM":
        return "fxt_rtm"
    raise ValueError("Invalid trip route")

determine_trip_route(df_rtm_ham)

'rtm_ham'

In [None]:
print(sklearn.__version__)

In [16]:
from sklearn.base import clone

# Feature scalers to compare; the '' key stands for "no scaling step".
scalars = {
    '': None,
    'StandardScaler': preprocessing.StandardScaler(),
    'MinMaxScaler': preprocessing.MinMaxScaler(),
    'MaxAbsScaler': preprocessing.MaxAbsScaler(),
    'RobustScaler': preprocessing.RobustScaler(),
    # # NOTE: The scalars below are not that good.
    # 'Normalizer': preprocessing.Normalizer(),
    # 'QuantileTransformer-Normal': preprocessing.QuantileTransformer(output_distribution='normal'),
    # 'QuantileTransformer-Uniform': preprocessing.QuantileTransformer(output_distribution='uniform'),
    # 'PowerTransformer': preprocessing.PowerTransformer(),
}

# Unfitted estimator templates; each pipeline below gets its own clone.
ett_agets_dict = {
    'RandomForestRegressor': ensemble.RandomForestRegressor(n_estimators=10),
    'BaggingRegressor': ensemble.BaggingRegressor(),
    'LinearRegression': linear_model.LinearRegression(),
    'KNeighborsRegressor': neighbors.KNeighborsRegressor(),
}

results = {}
ett_agents = []

feature_columns = [
    "EndLatitude", "EndLongitude",
    "Length",
    "Latitude", "Longitude",
    "SOG",
]

# The target directory depends only on the data set, so compute it once.
dump_dir = f'../predictor/models/{determine_trip_route(df_fxt_rtm)}'

for model_name, model in ett_agets_dict.items():
    for scalar_name, scalar in scalars.items():
        # A scikit-learn Pipeline fits the step objects it is given in
        # place. Reusing the same `model` / `scalar` instance in several
        # pipelines would make every agent kept in `ett_agents` share one
        # estimator that reflects only the most recent fit, so each
        # pipeline gets fresh, unfitted copies instead.
        estimator = pipeline.make_pipeline(
            clone(scalar) if scalar is not None else None,
            clone(model),
        )
        ett_agent = ETTAgent(
            name=f"{model_name}-{scalar_name if scalar_name else 'NoScalar'}",
            estimator=estimator,
            data=df_fxt_rtm,
            custom_feature_include_list=list(feature_columns),
        )
        results[ett_agent.name] = controlled_test(ett_agent)
        ett_agent.dump(dump_dir)
        ett_agents.append(ett_agent)

# One row per model/scaler combination, best R2 first.
report = pd.DataFrame.from_dict(results, orient='index')
report.sort_values(by=['R2_Score'], ascending=False)


RandomForestRegressor-NoScalar: 0.9672350479866193
RandomForestRegressor-StandardScaler: 0.9674937763829506
RandomForestRegressor-MinMaxScaler: 0.9677704490747941
RandomForestRegressor-MaxAbsScaler: 0.9672282442464158
RandomForestRegressor-RobustScaler: 0.9676647624129505
BaggingRegressor-NoScalar: 0.96760328260728
BaggingRegressor-StandardScaler: 0.9674751390321701
BaggingRegressor-MinMaxScaler: 0.9675798191214784
BaggingRegressor-MaxAbsScaler: 0.9677082180033267
BaggingRegressor-RobustScaler: 0.9674625473761835
LinearRegression-NoScalar: 0.8714616606602892
LinearRegression-StandardScaler: 0.8714616606602896
LinearRegression-MinMaxScaler: 0.8714616606602894
LinearRegression-MaxAbsScaler: 0.871461660660289
LinearRegression-RobustScaler: 0.8714616606602896
KNeighborsRegressor-NoScalar: 0.9566785052903908
KNeighborsRegressor-StandardScaler: 0.965343580762632
KNeighborsRegressor-MinMaxScaler: 0.966978450152101
KNeighborsRegressor-MaxAbsScaler: 0.9597605575601368
KNeighborsRegressor-Robust

Unnamed: 0,R2_Score,time,MAE 0,RMSE 0,Predictions
RandomForestRegressor-MinMaxScaler,0.96777,6.477368,"13 minutes, 34 seconds","27 minutes, 24 seconds","[23786.4, 32652.0, 5342.457142857143, 7262.0, ..."
BaggingRegressor-MaxAbsScaler,0.967708,6.611809,"13 minutes, 30 seconds","27 minutes, 26 seconds","[23880.0, 36258.0, 5151.8, 7350.0, 24609.0, 22..."
RandomForestRegressor-RobustScaler,0.967665,6.386238,"13 minutes, 34 seconds","27 minutes, 27 seconds","[23747.5, 32364.0, 5480.2, 7388.4, 24204.0, 23..."
BaggingRegressor-NoScalar,0.967603,6.288185,"13 minutes, 30 seconds","27 minutes, 28 seconds","[23894.0, 27258.0, 5227.214285714285, 7086.0, ..."
BaggingRegressor-MinMaxScaler,0.96758,6.406582,"13 minutes, 32 seconds","27 minutes, 29 seconds","[23902.0, 28590.0, 5078.0, 6947.0, 24270.0, 22..."
RandomForestRegressor-StandardScaler,0.967494,6.392709,"13 minutes, 34 seconds","27 minutes, 31 seconds","[23713.5, 32118.0, 5324.214285714285, 7144.5, ..."
BaggingRegressor-StandardScaler,0.967475,6.331311,"13 minutes, 31 seconds","27 minutes, 31 seconds","[24042.0, 27366.0, 5568.2, 6927.75, 24252.0, 2..."
BaggingRegressor-RobustScaler,0.967463,6.572836,"13 minutes, 32 seconds","27 minutes, 32 seconds","[23974.2, 27948.0, 5066.5, 7536.0, 24654.0, 22..."
RandomForestRegressor-NoScalar,0.967235,6.327099,"13 minutes, 34 seconds","27 minutes, 38 seconds","[23605.0, 30690.0, 5260.3, 6717.5, 24460.0, 23..."
RandomForestRegressor-MaxAbsScaler,0.967228,6.429845,"13 minutes, 35 seconds","27 minutes, 38 seconds","[24144.0, 29382.0, 5197.4, 7228.5, 24684.0, 22..."


# Testing out meta hyperparameters for RandomForest
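Here we test different values of the `max_depth` hyperparameter for the RandomForestRegressor, alongside a default BaggingRegressor, each with and without a scaler.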

In [None]:
from sklearn.base import clone

# Scalers kept after the previous comparison; '' means "no scaling step".
scalars = {
    '': None,
    'RobustScaler': preprocessing.RobustScaler(),
    'MinMaxScaler': preprocessing.MinMaxScaler(),
}

# Random forests that differ only in max_depth, plus a default
# BaggingRegressor. These are unfitted templates; each pipeline below
# gets its own clone.
ett_agets_dict = {
    'RandomForestRegressor max_depth=25': ensemble.RandomForestRegressor(n_estimators=20, n_jobs=-1, max_depth=25),
    'RandomForestRegressor max_depth=50': ensemble.RandomForestRegressor(n_estimators=20, n_jobs=-1, max_depth=50),
    'RandomForestRegressor max_depth=75': ensemble.RandomForestRegressor(n_estimators=20, n_jobs=-1, max_depth=75),
    'RandomForestRegressor max_depth=100': ensemble.RandomForestRegressor(n_estimators=20, n_jobs=-1, max_depth=100),
    'BaggingRegressor': ensemble.BaggingRegressor(),
}

feature_columns = [
    "EndLatitude", "EndLongitude",
    "Length",  # "Breadth",
    "Latitude", "Longitude",
    "SOG",
]

results = {}
ett_agents = []
for name, estimator in ett_agets_dict.items():
    for scalar_name, scalar in scalars.items():
        # A scikit-learn Pipeline fits the step objects it is given in
        # place. Sharing one estimator/scaler instance between pipelines
        # would leave every agent in `ett_agents` pointing at a model that
        # reflects only the last fit, so clone per pipeline.
        agent_pipeline = pipeline.make_pipeline(
            clone(scalar) if scalar is not None else None,
            clone(estimator),
        )
        ett_agent = ETTAgent(
            name=f"{name}-{scalar_name if scalar_name else 'NoScalar'}",
            estimator=agent_pipeline,
            data=df_rtm_ham,
            custom_feature_include_list=list(feature_columns),
        )
        results[ett_agent.name] = controlled_test(ett_agent)
        ett_agent.dump('./models')
        ett_agents.append(ett_agent)

# One row per configuration, best R2 first.
report = pd.DataFrame.from_dict(results, orient='index')
report.sort_values(by=['R2_Score'], ascending=False)

In [None]:
# Reload every persisted agent found in ./models/, keyed by file name
# without the '.joblib' part.
ett_agents_dict: dict[str, ETTAgent] = {}
files = os.listdir('./models/')
for file in files:
    if not file.endswith('.joblib'):
        print(f'File {file} is not a joblib file')
        continue
    print(f'Loading file {file}')
    agent_key = file.replace('.joblib', '')
    ett_agents_dict[agent_key] = ETTAgent.load(f'./models/{file}')
print(ett_agents_dict)

# Collect the stored results of every reloaded agent and echo its R2 score.
results = {}
for ett_agent_name, ett_agent in ett_agents_dict.items():
    agent_result = ett_agent.get_result_dict()
    r2_score = agent_result['R2_Score']
    print(f"{ett_agent_name}: {r2_score}")
    results[ett_agent_name] = agent_result

# One row per agent, best R2 first; the bare name displays the table.
report = pd.DataFrame.from_dict(results, orient='index')
report = report.sort_values(by=['R2_Score'], ascending=False)
report


In [None]:
ett_agents_dict

In [None]:
split_into_distance_groups(df_rtm_ham, 50)


In [None]:
from IPython.display import display


def split_into_distance_groups(df: pd.DataFrame, group_size: int) -> list[pd.DataFrame]:
    """Split ``df`` into consecutive bands of ``distance_remaining_in_km``.

    Band ``k`` holds the rows whose remaining distance lies in the
    half-open interval ``[k * group_size, (k + 1) * group_size)``. Bands
    start at 0 and continue until the band containing the largest distance
    in ``df`` has been emitted. Bands with no rows are still returned, as
    empty frames, so the list position always identifies the band.

    Parameters
    ----------
    df:
        Frame with a numeric ``distance_remaining_in_km`` column.
    group_size:
        Width of each band in kilometres; must be positive.

    Returns
    -------
    list[pd.DataFrame]
        One frame per band, ordered by increasing distance. Empty when
        ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive (the loop could never advance).
    """
    if group_size <= 0:
        raise ValueError("group_size must be positive")

    distance = df["distance_remaining_in_km"]
    # NaN for an empty frame, which makes the loop condition False below.
    max_distance = distance.max()

    groups: list[pd.DataFrame] = []
    lower = 0
    # `<=` rather than `<`: when the maximum is an exact multiple of
    # group_size (including 0) the rows sitting on that boundary belong to
    # the band starting there and must not be dropped.
    while lower <= max_distance:
        upper = lower + group_size
        groups.append(df[(distance >= lower) & (distance < upper)])
        lower = upper
    return groups


def test_fun(df: pd.DataFrame):
    """Compare model/scaler combinations separately per remaining-distance band.

    ``df`` is split into 25 km bands of ``distance_remaining_in_km``. For
    every non-empty band each model is combined with each scaler (and with
    no scaler), trained through an ``ETTAgent`` on that band's rows, and
    the two best combinations by R2 score are displayed.

    Parameters
    ----------
    df:
        Frame with a ``distance_remaining_in_km`` column, in the form
        ``ETTAgent`` accepts as ``data``.
    """
    import math

    scalars = {
        '': None,
        'StandardScaler': preprocessing.StandardScaler(),
        'MinMaxScaler': preprocessing.MinMaxScaler(),
        'MaxAbsScaler': preprocessing.MaxAbsScaler(),
        'RobustScaler': preprocessing.RobustScaler(),
    }

    models = {
        'LinearRegression': linear_model.LinearRegression(),
        'RandomForestRegressor': ensemble.RandomForestRegressor(n_estimators=70, n_jobs=-1),
        'BaggingRegressor': ensemble.BaggingRegressor(
            n_estimators=20, n_jobs=-1
        ),
    }

    for grp in split_into_distance_groups(df, 25):
        # A band without rows has NaN min/max (math.floor would raise) and
        # nothing to train on, so skip it.
        if grp.empty:
            continue

        min_distance = math.floor(grp['distance_remaining_in_km'].min())
        max_distance = math.ceil(grp['distance_remaining_in_km'].max())
        print(f"Group: {min_distance} - {max_distance}")

        # NOTE: an earlier version prepared and split the band by hand
        # here, but that result was never handed to the agent, so it has
        # been removed. NOTE(review): this assumes ETTAgent does its own
        # feature preparation and train/test split from `data` -- confirm.
        results = {}
        for model_name, model in models.items():
            for scalar_name, scalar in scalars.items():
                ett_agent = ETTAgent(
                    name=f"{model_name}-{scalar_name if scalar_name else 'NoScalar'}",
                    estimator=pipeline.make_pipeline(scalar, model),
                    # Train on the current band, not on the global
                    # df_rtm_ham; otherwise every band reports the same
                    # result and the `df` argument is ignored.
                    data=grp,
                )
                results[ett_agent.name] = controlled_test(ett_agent)

        report = pd.DataFrame.from_dict(results, orient='index')
        report = report.sort_values(by=['R2_Score'], ascending=False)
        display(report.head(n=2))


# test_fun(df_fxt_rtm)
test_fun(df_rtm_ham)
