In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import sklearn
from datetime import datetime

In [2]:
#Speichern Systemzeit
now = datetime.now()
now = now.strftime("%d/%m/%Y %H:%M:%S")

In [3]:
#Laden des Datensatzes
data = pd.read_excel(r'masterlist_total_extrakt.xlsx')
data = data.drop([data.columns[0]], axis='columns')

#Entferne Features
data = data.drop(['UNLOADING_mt','SETUP_mt', 'AVG_TEMPERATURE', 'AVG_VISIBILITY','Tiefgang', 'Breite'],axis=1)

In [4]:
#Zeitreihenplot definieren
from plotly.offline import plot 
import plotly.graph_objs as go 

def time_series_plot(y_test, predict, modelname):
    xv = list(range(1,len(y_test))) 
    
    trace_high = go.Scatter( 
        x=xv[1:150], 
        y=predict[1:150], 
        name = "Modellschätzung", 
        line = dict(color = '#008080'),
        opacity = 0.8) 
    
    trace_low = go.Scatter( 
        x=xv[1:150], 
        y=y_test[1:150], 
        name = "Beobachteter Wert", 
        line = dict(color = '#808080'), 
        opacity = 0.8) 
    
    data = [trace_high, trace_low] 
    
    layout = dict( 
        title = modelname + " Modellschätzung vs. beobachtete Werte", 
        yaxis=dict( 
            title='Dauer bis Endladeabschluss (min)'),
        xaxis=dict( 
            title='Anläufe in Validierungsstichprobe'),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01),
    ) 
    
    fig = dict(data=data, layout=layout) 
    plot(fig, filename=modelname)

In [5]:
#Sicherstellen, dass jede Ausführung die gleichen Zufallszahlen generiert.
np.random.seed(42)

## Stratified Sampling

In [6]:
#Schichtkriterium definieren. In diesem Fall sind die Schiffsklassen anhand ihrer TEU-Kapazität
data['size_cat'] = pd.cut(data['TEU'], bins=[0,1000,2500,10000, np.inf], labels=[1,2,3,4])

#Sampling durchführen
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=40)
for train_index, test_index in split.split(data, data["size_cat"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [7]:
#Schichtkriterium entfernen
for set_ in (strat_train_set, strat_test_set):
    set_.drop("size_cat", axis=1, inplace=True)

## Datenbereinigung

In [8]:
#Unabhängige und abhängige Variablen definieren
calls_3ft = strat_train_set.drop(['UNLOADING_TOTAL', 'AVG_WINDSPEED', 'AVG_RAINFALL'], axis=1)
calls_5ft = strat_train_set.drop('UNLOADING_TOTAL', axis=1)
calls_labels = strat_train_set['UNLOADING_TOTAL'].copy()

In [9]:
#Preprocessing Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

calls_num_5ft = calls_5ft.drop("Terminal", axis=1)
calls_num_3ft = calls_3ft.drop("Terminal", axis=1)

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

num_attribs_5ft = list(calls_num_5ft)
num_attribs_3ft = list(calls_num_3ft)
cat_attribs = ['Terminal']

full_pipeline_3ft = ColumnTransformer([
        ("num", num_pipeline, num_attribs_3ft),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

full_pipeline_5ft = ColumnTransformer([
        ("num", num_pipeline, num_attribs_5ft),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

calls_prepared_3ft = full_pipeline_3ft.fit_transform(calls_3ft)
calls_prepared_5ft = full_pipeline_5ft.fit_transform(calls_5ft)


#Export der Pipelines
preprocessors = dict({
                'full_pipeline_3ft': full_pipeline_3ft,
                'full_pipeline_5ft': full_pipeline_5ft
})

joblib.dump(preprocessors, open("../../webapp/full_pipeline.pkl", "wb"))

## Model Training and Evaluation


In [10]:
from sklearn.metrics import mean_squared_error

In [11]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

### Lineare Regression

In [12]:
#Model initialisieren und trainieren
from sklearn import linear_model
lin_reg = linear_model.LinearRegression()
lin_reg.fit(calls_prepared_3ft, calls_labels)

LinearRegression()

In [13]:
#Model evaluieren
lin_scores = cross_val_score(lin_reg, calls_prepared_3ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [402.1191928  340.9517282  377.76845796 427.30968482 456.05173091
 502.2290868  347.18143608 517.06423937 377.3722231  424.74148716]
Mean: 417.27892671961644
Standard deviation: 57.4075027100363


In [14]:
#Model inkl. Metadaten exportieren
lr_model = dict({
    'model': lin_reg,
    'metadata': {
        'type': 'Linear Regression',
        'date': now,
        'metrics': {
            'Cross-val-RSME': lin_rmse_scores.mean()
        }
    }
})
joblib.dump(lr_model, open("../../webapp/total_lin_reg.pkl", "wb"))

### Entscheidungsbaum

In [15]:
#Modelle initialisieren und trainieren
from sklearn.tree import DecisionTreeRegressor

tree_reg_3ft = DecisionTreeRegressor(random_state=42)
tree_reg_3ft.fit(calls_prepared_3ft, calls_labels)
tree_reg_5ft = DecisionTreeRegressor(random_state=42)
tree_reg_5ft.fit(calls_prepared_5ft, calls_labels)

DecisionTreeRegressor(random_state=42)

In [16]:
#Model evaluieren (3ft)
tree_scores = cross_val_score(tree_reg_3ft, calls_prepared_3ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores_3ft = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores_3ft)

Scores: [388.57829465 314.35618055 367.92708738 433.78840536 452.01119901
 464.89443905 317.49520512 490.12148651 353.2522615  420.36215771]
Mean: 400.27867168510926
Standard deviation: 58.378262836088176


In [17]:
#Model evaluieren (5ft)
tree_scores = cross_val_score(tree_reg_5ft, calls_prepared_5ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores_5ft = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores_5ft)

Scores: [589.30994119 564.43116444 546.18807149 578.07902712 574.82265318
 559.44941213 470.9917809  661.19140166 575.67258628 542.62687446]
Mean: 566.2762912845444
Standard deviation: 44.64531623982507


In [18]:
#Model inkl. Metadaten exportieren
tree_model = dict({
    'model': tree_reg_3ft,
    'metadata': {
        'type': 'Entscheidungsbaum',
        'date': now,
        'metrics': {
            'Cross-val-RSME': tree_rmse_scores_3ft.mean()
        }
    }
})
joblib.dump(tree_model, open("../../webapp/total_tree_reg.pkl", "wb"))

### K-Neighbors Regressor

In [19]:
#Model initialisieren und trainieren
from sklearn.neighbors import KNeighborsRegressor
knn_reg_3ft = KNeighborsRegressor(n_neighbors=13, algorithm='brute')
knn_reg_3ft.fit(calls_prepared_3ft, calls_labels)
knn_reg_5ft = KNeighborsRegressor(n_neighbors=13, algorithm='brute')
knn_reg_5ft.fit(calls_prepared_5ft, calls_labels)

KNeighborsRegressor(algorithm='brute', n_neighbors=13)

In [20]:
#Model evaluieren (3ft)
knn_scores = cross_val_score(knn_reg_3ft, calls_prepared_3ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
knn_rmse_scores_3ft = np.sqrt(-knn_scores)
display_scores(knn_rmse_scores_3ft)

Scores: [386.68485281 323.76626463 384.24333535 413.82938291 456.31456346
 466.21905702 310.88639428 488.58118536 376.4894771  426.1280311 ]
Mean: 403.3142544022795
Standard deviation: 55.58608677880107


In [21]:
#Model evaluieren (5ft)
knn_scores = cross_val_score(knn_reg_5ft, calls_prepared_5ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
knn_rmse_scores_5ft = np.sqrt(-knn_scores)
display_scores(knn_rmse_scores_5ft)

Scores: [422.51251416 354.26840662 388.26890009 437.64418509 463.40828096
 511.7129713  347.05164799 528.48378727 387.05850549 440.21653372]
Mean: 428.0625732689249
Standard deviation: 58.20195581313722


In [22]:
#Model inkl. Metadaten exportieren
knn_model = dict({
    'model': knn_reg_3ft,
    'metadata': {
        'type': 'K-Neighbors Regression',
        'date': now,
        'metrics': {
            'Cross-val-RSME': knn_rmse_scores_3ft.mean()
        }
    }
})
joblib.dump(knn_model, "../../webapp/total_knn_reg.joblib", compress=3)

['../../webapp/total_knn_reg.joblib']

### Random Forest Regressor

In [23]:
#Model initialisieren und trainieren
from sklearn.ensemble import RandomForestRegressor

forest_reg_3ft = RandomForestRegressor(random_state=42)
forest_reg_3ft.fit(calls_prepared_3ft, calls_labels)
forest_reg_5ft = RandomForestRegressor(random_state=42)
forest_reg_5ft.fit(calls_prepared_5ft, calls_labels)

RandomForestRegressor(random_state=42)

In [24]:
#Model evaluieren (3ft)
forest_scores = cross_val_score(forest_reg_3ft, calls_prepared_3ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores_3ft = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores_3ft)

Scores: [387.07117914 309.89864765 365.51228811 426.36824338 448.98474222
 466.93335699 314.50978652 487.35037179 351.54086063 414.99598546]
Mean: 397.3165461889447
Standard deviation: 58.727758734771115


In [25]:
#Model evaluieren (5ft)
forest_scores = cross_val_score(forest_reg_5ft, calls_prepared_5ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores_5ft = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores_5ft)

Scores: [385.71041998 382.12349407 403.40963545 431.7208966  477.29565251
 493.14746301 369.52249662 516.34135601 406.48930653 444.20354691]
Mean: 430.9964267694033
Standard deviation: 48.005353328799806


In [26]:
#Model inkl. Metadaten exportieren
forest_model = dict({
    'model': forest_reg_3ft,
    'metadata': {
        'type': 'Random Forest',
        'date': now,
        'metrics': {
            'Cross-val-RSME': forest_rmse_scores_3ft.mean()
        }
    }
})
joblib.dump(forest_model, open("../../webapp/total_forest_reg.pkl", "wb"))

### Gradient Boosting

In [27]:
#Model initialisieren und trainieren
from sklearn.ensemble import GradientBoostingRegressor

boost_reg_3ft = GradientBoostingRegressor()
boost_reg_3ft.fit(calls_prepared_3ft, calls_labels)
boost_reg_5ft = GradientBoostingRegressor()
boost_reg_5ft.fit(calls_prepared_5ft, calls_labels)

GradientBoostingRegressor()

In [28]:
#Model evaluieren (3ft)
boost_scores = cross_val_score(boost_reg_3ft, calls_prepared_3ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
boost_rmse_scores_3ft = np.sqrt(-boost_scores)
display_scores(boost_rmse_scores_3ft)

Scores: [380.22558856 306.88461924 359.13260136 413.17243472 437.33268099
 468.73932697 310.51907691 486.03422999 348.78150519 414.74817047]
Mean: 392.55702343970063
Standard deviation: 58.92331542160704


In [29]:
#Model evaluieren (5ft)
boost_scores = cross_val_score(boost_reg_5ft, calls_prepared_5ft, calls_labels, scoring="neg_mean_squared_error", cv=10)
boost_rmse_scores_5ft = np.sqrt(-boost_scores)
display_scores(boost_rmse_scores_5ft)

Scores: [371.23903629 306.55429502 364.00299241 402.67275048 444.02710578
 472.03141993 324.72702088 480.41394562 354.11617922 417.11797991]
Mean: 393.6902725541212
Standard deviation: 56.67607973588272


In [30]:
#Model inkl. Metadaten exportieren
boost_model = dict({
    'model': boost_reg_3ft,
    'metadata': {
        'type': 'Gradient Boosting',
        'date': now,
        'metrics': {
            'Cross-val-RSME': boost_rmse_scores_3ft.mean()
        }
    }
})
joblib.dump(boost_model, open("../../webapp/total_boost_reg.pkl", "wb"))

### Neural Network

In [31]:
#Model initialisieren und trainieren
from sklearn.neural_network import MLPRegressor

nn_reg_3ft = MLPRegressor(batch_size=128, max_iter = 1500, learning_rate_init=0.001)
nn_reg_3ft.fit(calls_prepared_3ft, calls_labels)

MLPRegressor(batch_size=128, max_iter=1500)

In [32]:
nn_reg_5ft = MLPRegressor(batch_size=128, max_iter = 1500, learning_rate_init=0.001)
nn_reg_5ft.fit(calls_prepared_5ft, calls_labels)

MLPRegressor(batch_size=128, max_iter=1500)

In [33]:
#Model evaluieren (3ft)
nn_scores = cross_val_score(nn_reg_3ft, calls_prepared_3ft, calls_labels, scoring="neg_mean_squared_error", cv=5)
nn_rmse_scores_3ft = np.sqrt(-nn_scores)
display_scores(nn_rmse_scores_3ft)

Scores: [369.39158304 401.78862892 478.28876533 438.38448113 400.20333815]
Mean: 417.6113593145511
Standard deviation: 37.39469216905511


In [34]:
#Model evaluieren (5ft)
nn_scores = cross_val_score(nn_reg_5ft, calls_prepared_5ft, calls_labels, scoring="neg_mean_squared_error", cv=5)
nn_rmse_scores_5ft = np.sqrt(-nn_scores)
display_scores(nn_rmse_scores_5ft)

Scores: [370.51288101 405.28677035 477.87763895 438.59497074 400.67392775]
Mean: 418.58923776015615
Standard deviation: 36.67313435970404


In [35]:
#Model inkl. Metadaten exportieren
nn_model = dict({
    'model': nn_reg_3ft,
    'metadata': {
        'type': 'Neural Network',
        'date': now,
        'metrics': {
            'Cross-val-RSME': nn_rmse_scores_3ft.mean()
        }
    }
})
joblib.dump(nn_model, open("../../webapp/total_nn_reg.pkl", "wb"))

## Model Tuning (Random Forest)

In [36]:
from sklearn.model_selection import GridSearchCV

param_grid ={
    'bootstrap': [True],
    'max_depth': [70,80],
    'max_features': [2, 3],
    'min_samples_leaf': [4, 5],
    'min_samples_split': [12, 14],
    'n_estimators': [900, 1000]
}

forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(calls_prepared_3ft, calls_labels)


GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
             param_grid={'bootstrap': [True], 'max_depth': [70, 80],
                         'max_features': [2, 3], 'min_samples_leaf': [4, 5],
                         'min_samples_split': [12, 14],
                         'n_estimators': [900, 1000]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [37]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 70,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 14,
 'n_estimators': 1000}

In [38]:
final_model1 = grid_search.best_estimator_

X_test = strat_test_set.drop("UNLOADING_TOTAL", axis=1)
y_test = strat_test_set["UNLOADING_TOTAL"].copy()

X_test_prepared = full_pipeline_3ft.transform(X_test)
final_predictions = final_model1.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.



344.26978152976363

In [39]:
#Model inkl. Metadaten exportieren
final_model = dict({
    'model': final_model1,
    'metadata': {
        'type': 'Random Forest',
        'date': now,
        'metrics': {
            'Cross-val-RSME': final_rmse
        }
    }
})
joblib.dump(final_model, open("../../webapp/total_final_reg.pkl", "wb"))

In [40]:
time_series_plot(y_test, final_predictions, "Random_Forest")


Your filename `Random_Forest` didn't end with .html. Adding .html to the end of your file.



## Model Tuning (Gradient Boosting)

In [41]:
param_grid ={
    'max_depth': [80],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4],
    'min_samples_split': [12, 14],
    'n_estimators': [1000]
}

boost_reg = GradientBoostingRegressor()
grid_search = GridSearchCV(boost_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(calls_prepared_3ft, calls_labels)


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(),
             param_grid={'max_depth': [80], 'max_features': [2, 3],
                         'min_samples_leaf': [3, 4],
                         'min_samples_split': [12, 14],
                         'n_estimators': [1000]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [42]:
grid_search.best_params_

{'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 14,
 'n_estimators': 1000}

In [43]:
final_model2 = grid_search.best_estimator_

X_test_prepared = full_pipeline_3ft.transform(X_test)
final_predictions = final_model2.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.



349.3753376578678

In [44]:
time_series_plot(y_test, final_predictions, "Gradient_boosting")


Your filename `Gradient_boosting` didn't end with .html. Adding .html to the end of your file.

