In [1]:
#Libraries used
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

In [2]:
#Load database from https://mrv.emsa.europa.eu/#public/emission-report
data20=pd.read_excel("2020-v25-01082021-EU MRV Publication of information.xlsx",index_col=0,skiprows=2)
data20=data20.rename(columns={"Time spent at sea [hours]": "Total time spent at sea [hours]", "Annual Time spent at sea [hours]": "Annual Total time spent at sea [hours]"})

In [3]:
#Removing ships that didn't travel
data20['Annual average Fuel consumption per distance [kg / n mile]'] = pd.to_numeric(data20['Annual average Fuel consumption per distance [kg / n mile]'], errors='coerce')
data20=data20.dropna(subset=["Annual average Fuel consumption per distance [kg / n mile]"])


In [4]:
# Convert technical efficiency to numeric data
data20["Technical efficiency (gCO₂/t·nm)"]=data20["Technical efficiency"].str.replace(r"\D+",'')
data20['Technical efficiency (gCO₂/t·nm)'] = pd.to_numeric(data20['Technical efficiency (gCO₂/t·nm)'], errors='coerce')
data20=data20.dropna(subset=["Technical efficiency (gCO₂/t·nm)"])
data20['Technical efficiency (gCO₂/t·nm)']=data20['Technical efficiency (gCO₂/t·nm)']/100

In [5]:
#Calculating distance of ships
data20["Distance travelled(n miles)"]=(data20["Total fuel consumption [m tonnes]"]/data20["Annual average Fuel consumption per distance [kg / n mile]"])*1000

In [6]:
#Calculating Speed of ships
data20["Average speed (n miles/hour)"]=data20["Distance travelled(n miles)"]/data20["Annual Total time spent at sea [hours]"]

In [7]:
#Create dataset containing independent variable columns to be used to build the model

data=data20[["Ship type","Technical efficiency (gCO₂/t·nm)","Distance travelled(n miles)","Total time spent at sea [hours]"]]
data.head()

Unnamed: 0_level_0,Ship type,Technical efficiency (gCO₂/t·nm),Distance travelled(n miles),Total time spent at sea [hours]
IMO Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6602898,Passenger ship,31.73,7203.528432,488.87
6703343,Other ship types,57.84,5538.650307,603.0
7037806,Ro-pax ship,1.94,14849.610315,995.0
7043843,Ro-ro ship,48.71,40842.097075,3109.2
7128332,Ro-pax ship,9.29,47931.440443,3619.0


In [8]:
# Standardise the numerical variables
Shipping_num = data.drop("Ship type", axis=1)

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

shipping_num_tr = num_pipeline.fit_transform(Shipping_num)

In [9]:
# One hot encode the catergorical variables and combine with the standardised numeric variables

num_attribs = list(Shipping_num)
cat_attribs = ["Ship type"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

Shipping = full_pipeline.fit_transform(data)
Shipping

<10831x18 sparse matrix of type '<class 'numpy.float64'>'
	with 43324 stored elements in Compressed Sparse Row format>

In [10]:
#Create target variable dataframe

TargetV=data20[["Total CO₂ emissions [m tonnes]"]]

In [11]:
#Model 1 Linear regression

lr=LinearRegression()

kfold = KFold(n_splits=8, shuffle=True, random_state=7)
lr_results = cross_val_score(lr, Shipping, TargetV, cv=kfold, scoring="neg_root_mean_squared_error")
print("Mean root mean squared error is: ",-np.round(lr_results.mean(),2),"Standard deviation is: ", np.round(lr_results.std(),2))


Mean root mean squared error is:  6600.08 Standard deviation is:  123.55


In [12]:
#Model 2 Ridge regression

cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

ridge = linear_model.Ridge()


params ={"alpha":[0.1, 1,10]}


model = GridSearchCV(ridge, params, scoring='neg_root_mean_squared_error', cv=cv_inner, refit=True)

cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

ridge_results = cross_val_score(model, Shipping, TargetV, scoring='neg_root_mean_squared_error', cv=cv_outer)
print("Mean root mean squared error is: ",-np.round(ridge_results.mean(),2),"Standard deviation is: ", np.round(ridge_results.std(),2))

Mean root mean squared error is:  6600.15 Standard deviation is:  121.66


In [13]:
#Model 3 Lasso regression

cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

lasso = linear_model.Lasso()


params ={"alpha":[1, 0.1, 0.01,0.001]}


lasso2 = GridSearchCV(lasso, params, scoring='neg_root_mean_squared_error', cv=cv_inner, refit=True)

cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

lasso_results = cross_val_score(lasso2, Shipping, TargetV, scoring="neg_root_mean_squared_error", cv=kfold)
print("Mean root mean squared error is: ",-np.round(lasso_results.mean(),2),"Standard deviation is: ", np.round(lasso_results.std(),2))

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Mean root mean squared error is:  6600.31 Standard deviation is:  124.43


In [14]:
#Model 4 Decision Tree

cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

DT = DecisionTreeRegressor()


random_grid={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
            "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
            "min_weight_fraction_leaf":[0,0.5],
            "max_features":["auto","log2","sqrt",None],
            "max_leaf_nodes":[None,10,20,30]}


model2 = RandomizedSearchCV(estimator = DT, param_distributions = random_grid, n_iter = 100, cv = 3,random_state=7, n_jobs = -1)

cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

DT_results = cross_val_score(model2, Shipping, TargetV, scoring='neg_root_mean_squared_error', cv=cv_outer)
print("Mean root mean squared error is: ",-np.round(DT_results.mean(),2),"Standard deviation is: ", np.round(DT_results.std(),2))

Mean root mean squared error is:  5669.41 Standard deviation is:  248.71


In [15]:
model2.fit(Shipping,TargetV)
coefficients = model2.best_estimator_.feature_importances_
for i,v in enumerate(coefficients):
     print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.04685
Feature: 1, Score: 0.63765
Feature: 2, Score: 0.08942
Feature: 3, Score: 0.00265
Feature: 4, Score: 0.00398
Feature: 5, Score: 0.00000
Feature: 6, Score: 0.09334
Feature: 7, Score: 0.00136
Feature: 8, Score: 0.00045
Feature: 9, Score: 0.00790
Feature: 10, Score: 0.06771
Feature: 11, Score: 0.00850
Feature: 12, Score: 0.00059
Feature: 13, Score: 0.00519
Feature: 14, Score: 0.00021
Feature: 15, Score: 0.02930
Feature: 16, Score: 0.00476
Feature: 17, Score: 0.00015


In [16]:
#Model 5 Random Forest Regression

cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

RF = RandomForestRegressor()

random_grid={"n_estimators":[50,100,200,400,500,1000],
             "max_features": ["auto", "sqrt"],
             "max_depth":[1,3,5,7,9,11,12],
             "min_samples_split":[2, 5, 10],
             "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
             "bootstrap": [True, False]}

model3 = RandomizedSearchCV(estimator = RF, param_distributions = random_grid, n_iter = 100, cv = 3,random_state=7, n_jobs = -1)

cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

RF_results = cross_val_score(model3, Shipping, TargetV.values.ravel(), scoring="neg_root_mean_squared_error", cv=cv_outer)
print("Mean root mean squared error is: ",-np.round(RF_results.mean(),2),"Standard deviation is: ", np.round(RF_results.std(),2))

Mean root mean squared error is:  4469.59 Standard deviation is:  318.33


In [17]:
# Model 6 K-Nearest Neighbours
cv_inner = KFold(n_splits=3, shuffle=True, random_state=7)

KNN = KNeighborsRegressor()


params={"leaf_size":[1,3],
        "n_neighbors":[2,3,4,5],
        "p":[1,2]}


model4 = GridSearchCV(KNN, params, scoring="neg_root_mean_squared_error", cv=cv_inner, refit=True)

cv_outer = KFold(n_splits=8, shuffle=True, random_state=7)

KNN_results = cross_val_score(model4, Shipping, TargetV, scoring="neg_root_mean_squared_error", cv=cv_outer)
print("Mean root mean squared error is: ",-np.round(KNN_results.mean(),2),"Standard deviation is: ", np.round(KNN_results.std(),2))


Mean root mean squared error is:  4563.67 Standard deviation is:  291.98


In [20]:
ModelResults= pd.DataFrame({'Method':["Linear Regression","Ridge","Lasso","Decision Tree","Random Forest","K-Nearest Neighbour"],
                   'RMSE':[-np.round(lr_results.mean(),2), -np.round(ridge_results.mean(),2),-np.round(lasso_results.mean(),2), -np.round(DT_results.mean(),2), -np.round(RF_results.mean(),2), -np.round(KNN_results.mean(),2)],
                   'Standard deviation':[np.round(lr_results.std(),2), np.round(ridge_results.std(),2),np.round(lasso_results.std(),2) , np.round(DT_results.std(),2), np.round(RF_results.std(),2), np.round(KNN_results.std(),2)]})
ModelResults

Unnamed: 0,Method,RMSE,Standard deviation
0,Linear Regression,6600.08,123.55
1,Ridge,6600.15,121.66
2,Lasso,6600.31,124.43
3,Decision Tree,5669.41,248.71
4,Random Forest,4469.59,318.33
5,K-Nearest Neighbour,4563.67,291.98
