In [1]:
import os
import sys
import logging
import pickle
import mlflow
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDRegressor, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from warnings import filterwarnings
filterwarnings('ignore')

from minio import Minio

# settings
# MinIO connection parameters are read from the process environment; each one
# falls back to an empty string when the variable is not set.
MINIO_HOST = os.environ.get('MINIO_HOST', '')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY', '')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY', '')

# MLflow / S3 settings are written INTO the process environment here, replacing
# any value that was already present for these keys.
# NOTE(review): the values are empty strings in this copy of the notebook --
# presumably redacted before sharing; TODO confirm and supply the real values
# (preferably from the environment rather than hardcoded) before running.
os.environ['MLFLOW_TRACKING_URI'] = ""
os.environ['MLFLOW_S3_ENDPOINT_URL'] = ""
os.environ['AWS_ACCESS_KEY_ID'] = ''
os.environ['AWS_SECRET_ACCESS_KEY'] = ''

In [2]:
# Send INFO-and-above records to stdout so they show up in the cell output,
# prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    stream=sys.stdout,
)
# Module-level logger shared by the classes defined below.
log = logging.getLogger(__name__)

In [3]:
class VesselMEFuelConsumptionRegression(object):
    """Load, scale and split one vessel's sensor data for fuel-consumption regression.

    The target column is ``ME_FO_consumption``. Numerical feature columns and the
    target are min-max scaled with two separate ``MinMaxScaler`` instances, which
    are pickled to disk by ``_data_preparation`` so they can be logged alongside
    the trained model.
    """

    def __init__(self):
        """Set the storage location, the scalers and the column groups to read."""
        self.random_state = 42
        self.bucket = 'uploads'
        self.directory = 'data'
        self.features_scaler = MinMaxScaler()
        self.y_scaler = MinMaxScaler()

        self.date_col = ['measurement_time']

        self.numerical_columns = [
             'cyl_chargeair_press', 'AE_FO_inlet_flow',
             'draught_aft_side', 'AE_FO_inlet_Temp',
             'engine_speed','DG_1_power',
             'DG_2_power','DG_4_power',
             'CAC_CW_HT_pressure', 'CAC_in_Low_Temperature_CW_temp',
             'propeller_shaft_output', 'propeller_shaft_rpm',
             'propeller_shaft_thrust', 'cyl_chargeair_temp',
             'ship_speed_actual', 'Ship_SpeedLOG', 
             'cyl_exh_gas_temp_mean','torque',
             'AE_FO_outlet_flow', 'AE_FO_outlet_Temp',
             'Eng_in_HTCW_press', 'Eng_in_Jacket_HTCW_temp',
             'Eng_out_Jacket_HTCW_temp', 'Eng_Relative_load',
             'FO_Rack_position', 'FO_inlet_press',
             'fueloil_inlet_temperature', 'ME_FO_inlet_flow',
             'ME_FO_outlet_Temp', 'ME_FO_outlet_flow',
             'LO_Filter_P', 'LO_filter_in_press',
             'LO_in_press', 'LO_in_temp',
             'LO_out_temp_TC', 'LO_cooler_CW_out_temp'
        ]
        self.categorical_columns = [
            'DG_1_condition','DG_2_condition',
            'DG_3_condition','DG_4_condition',
            'ship_inclination'
        ]
        self.monitoring_col_name = 'ME_FO_consumption'
        self.monitoring_col = [self.monitoring_col_name]

        # Only these columns are read from the CSV.
        self.columns_used = self.date_col + self.numerical_columns + self.categorical_columns + self.monitoring_col
        # Categorical columns are passed through unscaled.
        self.for_normalization_cols = self.numerical_columns

    def load_dataset_(self, ship_id):
        """Read ``<directory>/<ship_id>.csv`` from the MinIO bucket into a DataFrame.

        Args:
            ship_id: Stem of the CSV object name inside ``self.directory``.

        Returns:
            DataFrame restricted to ``self.columns_used`` with the date column parsed.
        """
        client = Minio(
            MINIO_HOST,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=False
        )
        data_path = f'{self.directory}/{ship_id}.csv'
        obj = client.get_object(self.bucket, data_path)
        try:
            df = pd.read_csv(obj, parse_dates=self.date_col, usecols=self.columns_used)
        finally:
            # The HTTP response must be closed and its connection handed back to
            # the pool, even when parsing fails; pandas does not do this for us.
            obj.close()
            obj.release_conn()
        return df

    def _preprocess(self, df):
        """Drop incomplete rows, min-max scale features and target, drop constant columns.

        The caller's frame is not modified. Both scalers are (re)fitted on the
        rows that remain after ``dropna``.

        Args:
            df: Raw frame containing at least the normalisation and target columns.

        Returns:
            A new scaled DataFrame without columns whose value never changes.

        Raises:
            ValueError: If no rows remain after dropping rows with missing values.
        """
        # Explicit copy: the column assignments below must act on an independent
        # frame rather than on something pandas may treat as a slice of `df`.
        df = df.dropna().copy()
        if df.empty:
            raise ValueError('No rows left after dropping rows with missing values')

        # NOTE(review): the scalers are fitted on every remaining row, i.e. before
        # any train/test split, so min/max of rows later used for testing influence
        # the scaling of the training features -- consider fitting on the training
        # split only.
        df[self.for_normalization_cols] = self.features_scaler.fit_transform(
            df[self.for_normalization_cols]
        )
        df[self.monitoring_col] = self.y_scaler.fit_transform(
            df[self.monitoring_col]
        )
        # remove zero variance data: keep a column only if some row differs from
        # the first row's value
        df = df.loc[:, (df != df.iloc[0]).any()]
        return df

    def _store_scalers(self,
                       features_sc_path='ann_feature_scaler.pkl',
                       y_sc_path='ann_y_scaler.pkl'
                      ):
        """Pickle the feature scaler and the target scaler to the given paths.

        Args:
            features_sc_path: Destination file for ``self.features_scaler``.
            y_sc_path: Destination file for ``self.y_scaler``.
        """
        # Context managers guarantee the files are flushed and closed even if
        # pickling raises.
        with open(features_sc_path, 'wb') as features_file:
            pickle.dump(self.features_scaler, features_file)
        with open(y_sc_path, 'wb') as y_file:
            pickle.dump(self.y_scaler, y_file)

    def _data_preparation(self, normalized_df, test_size=0.20):
        """Split the scaled frame into train/test features and targets.

        As a side effect the two scalers are pickled to their default paths.

        Args:
            normalized_df: Output of ``_preprocess``.
            test_size: Fraction of rows assigned to the test split.

        Returns:
            Tuple ``(X_train, X_test, Y_train, Y_test)``.
        """
        # Neither the target nor the timestamp is used as a model input.
        abort_cols = self.monitoring_col + self.date_col
        Y = normalized_df[self.monitoring_col]
        X = normalized_df[[column for column in normalized_df.columns if column not in abort_cols]]
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=self.random_state
        )
        self._store_scalers()
        return X_train, X_test, Y_train, Y_test

In [4]:
class ANNObject(object):
    """Builds, trains and scores a small fully connected Keras regressor."""

    def _construct(self, shape, activation_function='relu'):
        """Assemble the (uncompiled) network.

        Args:
            shape: Unit count of the first Dense layer.
            activation_function: Activation used by every hidden Dense layer.

        Returns:
            A ``Sequential`` model: five Dense layers (``shape``, 32, 64, 128, 512
            units), a Dropout(0.1) layer and a single-unit linear output layer.
        """
        hidden_units = (shape, 32, 64, 128, 512)

        network = Sequential()
        for units in hidden_units:
            network.add(Dense(units, activation=activation_function))
        network.add(Dropout(0.1))
        network.add(Dense(1))
        return network

    def _fit_model_(self,
                    shape,
                    epochs,
                    X_train,
                    Y_train,
                    X_test,
                    Y_test,
                    batch_size=100,
                    patience=2
                   ):
        """Compile and train a freshly constructed network.

        Training uses the Adam optimizer with MSE loss. ``X_test``/``Y_test`` are
        passed as validation data, and training stops early once ``val_loss`` has
        not improved for ``patience`` epochs.

        Args:
            shape: Forwarded to ``_construct``.
            epochs: Maximum number of training epochs.
            X_train, Y_train: Training features and targets.
            X_test, Y_test: Validation features and targets.
            batch_size: Mini-batch size.
            patience: Early-stopping patience in epochs.

        Returns:
            The trained model.
        """
        network = self._construct(shape=shape)
        network.compile(optimizer='adam', loss='MSE')

        stopper = EarlyStopping(monitor='val_loss', patience=patience)
        fit_options = {
            'epochs': epochs,
            'batch_size': batch_size,
            'validation_data': (X_test, Y_test),
            'callbacks': [stopper],
        }
        network.fit(X_train, Y_train, **fit_options)
        return network

    def _gen_metrics(self, true_value, predicted):
        """Compute MAE, MSE and R2, log each at INFO level and return them.

        Args:
            true_value: Ground-truth target values.
            predicted: Model predictions aligned with ``true_value``.

        Returns:
            Dict with keys ``'mae'``, ``'mse'`` and ``'r2'``.
        """
        mae = mean_absolute_error(true_value, predicted)
        mse = mean_squared_error(true_value, predicted)
        r2 = r2_score(true_value, predicted)

        for message in (f'ANN MAE: {mae}', f'ANN MSE: {mse}', f'ANN R2: {r2}'):
            log.info(message)

        return {'mae': mae, 'mse': mse, 'r2': r2}

In [5]:
class ANNMlFlowHandler(object):
    """Bundles a trained model with its metrics and logs both to MLflow."""

    def __init__(self, experiment_name, model, model_name, metrics_dict):
        """Store what a later ``_log_to_mlflow`` call will record.

        Args:
            experiment_name: MLflow experiment to log under.
            model: Trained Keras model.
            model_name: Artifact path and registered-model name for the model.
            metrics_dict: Mapping of metric name to value.
        """
        self.metrics_dict = metrics_dict
        self.model = model
        self.model_name = model_name
        self.experiment_name = experiment_name

    def _log_to_mlflow(self,
                       scaler_features_path='ann_feature_scaler.pkl',
                       scaler_y_path='ann_y_scaler.pkl'
                      ):
        """Log scaler files, the model and every metric inside one MLflow run.

        Args:
            scaler_features_path: Path of the pickled feature scaler to upload.
            scaler_y_path: Path of the pickled target scaler to upload.
        """
        mlflow.set_experiment(self.experiment_name)
        with mlflow.start_run():
            # Scalers are uploaded as plain artifacts next to the model.
            for artifact_path in (scaler_features_path, scaler_y_path):
                mlflow.log_artifact(artifact_path)

            # The same name serves as artifact path and registry entry.
            mlflow.keras.log_model(
                self.model,
                self.model_name,
                registered_model_name=self.model_name
            )

            for metric_name, metric_value in self.metrics_dict.items():
                mlflow.log_metric(metric_name, metric_value)

In [6]:
# Fetch the CSV for 'ship_1' from object storage.
regr_obj = VesselMEFuelConsumptionRegression()
data = regr_obj.load_dataset_(ship_id='ship_1')

# Drop incomplete rows, min-max scale features/target, remove constant columns.
processed_data = regr_obj._preprocess(df=data)

# Train/test split; this call also pickles the two fitted scalers to disk.
X_train, X_test, Y_train, Y_test = regr_obj._data_preparation(normalized_df=processed_data)

In [7]:
ann_obj = ANNObject()
# Number of feature columns; used as the unit count of the network's first layer.
features_len = X_train.shape[1]
# Number of rows in the test split.
test_rows_len = X_test.shape[0]

# Train for at most 10 epochs; the test split doubles as the validation set
# that drives early stopping inside _fit_model_.
ann_model = ann_obj._fit_model_(
    shape=features_len,
    epochs=10,
    X_train=X_train,
    X_test=X_test,
    Y_train=Y_train,
    Y_test=Y_test
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Score the model on the test split (targets are still in min-max scaled units).
predicted = ann_model.predict(X_test)
# Flatten to 1-D using the array's own size rather than a row count computed in
# another cell, so this cell cannot go stale if the split is redone.
predictions_reshaped = predicted.reshape(-1)
metrics_dict = ann_obj._gen_metrics(Y_test, predictions_reshaped)

2022-11-30 21:04:06,178 - __main__ - INFO - ANN MAE: 0.007132662701780962
2022-11-30 21:04:06,178 - __main__ - INFO - ANN MSE: 0.00010874991288671
2022-11-30 21:04:06,179 - __main__ - INFO - ANN R2: 0.9990043787422427


In [9]:
# Names under which the run and the registered model appear in MLflow.
experiment_name='fuel_consumption_regression_ann'
model_name='ann_regressor'
mlflow_obj = ANNMlFlowHandler(
    experiment_name=experiment_name,
    model=ann_model,
    model_name=model_name,
    metrics_dict=metrics_dict
)
# Uploads the pickled scalers (default file names), the model and the metrics.
mlflow_obj._log_to_mlflow()

2022-11-30 21:04:12,122 - botocore.credentials - INFO - Found credentials in environment variables.
INFO:tensorflow:Assets written to: C:\Users\PKAPSA~1.EPU\AppData\Local\Temp\tmpaltlbqx_\model\data\model\assets
2022-11-30 21:04:13,418 - tensorflow - INFO - Assets written to: C:\Users\PKAPSA~1.EPU\AppData\Local\Temp\tmpaltlbqx_\model\data\model\assets


Registered model 'ann_regressor' already exists. Creating a new version of this model...
2022/11/30 21:04:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ann_regressor, version 3
Created version '3' of model 'ann_regressor'.
