In [78]:
# Import packages that are useful
import hopsworks
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
import joblib
import os
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor


In [79]:
# Log into hopsworks
project = hopsworks.login()
fs = project.get_feature_store()


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/194708
Connected. Call `.close()` to terminate connection gracefully.


In [80]:
# Get the feature store
incidents_fg = fs.get_feature_group(name="sthlm_incidents", version=2)
query = incidents_fg.select_all()
feature_view = fs.get_or_create_feature_view(name="incidents_view", version=2, query=query, description="Incidents view", labels=["duration"])

In [81]:
# Create a train, test and validation set from the feature view
df_train, df_val, df_test, _, _, _ = feature_view.train_validation_test_split(test_size=0.1, validation_size=0.1)

# Drop the id from the dataframes
df_train = df_train.drop(columns=["id", "description", "starttime", "endtime", "type"])
df_val = df_val.drop(columns=["id", "description", "starttime", "endtime", "type"])
df_test = df_test.drop(columns=["id", "description", "starttime", "endtime", "type"])

Finished: Reading data from Hopsworks, using ArrowFlight (1.07s) 




In [82]:
df_train.head()

Unnamed: 0,code,hour,iconcategory,latitude,longitude,magnitudeofdelay,month,duration
1,122,21,6,59.344153,18.023423,1,12,4574.0
2,101,11,6,59.333028,18.07741,3,12,4017.0
3,101,12,6,59.262366,18.099725,3,12,3995.0
4,122,22,6,59.384649,18.005546,1,12,3886.0
5,101,11,6,59.351886,18.106963,3,12,3899.0


In [83]:
y_train = df_train["duration"]
X_train = df_train.drop(["duration"], axis=1)
y_val = df_val["duration"]
X_val = df_val.drop(["duration"], axis=1)
y_test = df_test["duration"]
X_test = df_test.drop(["duration"], axis=1)


# Linear Regression Model

In [84]:
# Try a linear regression model
lr = LinearRegression()

# train and test the model
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_val, y_val))

0.1308020874977297
0.09165995296249863


# Multi Layer Perceptron Model

In [85]:
# Try a Multi-layer Perceptron regressor
mlp = MLPRegressor()
mlp.fit(X_train, y_train)
print(mlp.score(X_train, y_train))
print(mlp.score(X_val, y_val))

-0.17911097270767562
-0.14510071186552453




# Random Forest Model

In [86]:
# Try a Random Forest regressor
rf = RandomForestRegressor()

rf.fit(X_train, y_train)
print(rf.score(X_train, y_train))
print(rf.score(X_val, y_val))

0.939421954184228
0.6995473750117636


# Gradient Boosting Model

In [87]:
# Try a Gradient Boosting regressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
print(gb.score(X_train, y_train))
print(gb.score(X_val, y_val))

0.8414875836384754
0.7421413622661746


# Decision Tree Model

In [88]:
# Try a Decision Tree regressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
print(dt.score(X_train, y_train))
print(dt.score(X_val, y_val))

0.9980388543772414
0.6006664354724507


# K-Nearest Neighbors regressor


In [89]:
# Try a K-Nearest Neighbors regressor
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print(knn.score(X_train, y_train))
print(knn.score(X_val, y_val))

0.7020443316305236
0.7296828673269338


# Support Vector Machine regressor

In [90]:
# Try a Support Vector Machine regressor
svr = SVR()
svr.fit(X_train, y_train)
print(svr.score(X_train, y_train))
print(svr.score(X_val, y_val))

-0.08941954884330694
-0.06965762161338596


# Xgboost regressor


In [91]:
# Try a XGBoost regressor
xgb = XGBRegressor()

xgb.fit(X_train, y_train)
print(xgb.score(X_train, y_train))
print(xgb.score(X_val, y_val))

0.9937241755350347
0.6818003644760795


# We look at the best score on training and validation, making sure that it was not overfitted.

In [92]:
# Evaluate the mlp model on the test set
svr.score(X_test, y_test)

-0.09878066875256253

In [93]:
# We put it in the model registry
model = svr
mr = project.get_model_registry()

# Create a schema for the model
model_dir="sthlm_incidents_model"
if os.path.isdir(model_dir) == False:
    os.mkdir(model_dir)

# Save it on the disk 
joblib.dump(model, model_dir + "/sthlm_model.pkl")

# Specify the schema of the model's input/output using the features (X_train) and labels (y_train)
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Create an entry in the model registry that includes the model's name, desc, metrics
incidents_model = mr.python.create_model(
    name="sthlm_incidents_model", 
    metrics={"MSE" : svr.score(X_test, y_test)},
    model_schema=model_schema,
    description="Stockholm Incident Duration Model"
)

# Upload the model to the model registry, including all files in 'model_dir'
incidents_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


Uploading: 100.000%|██████████| 58932/58932 elapsed<00:01 remaining<00:001,  3.21it/s]
Uploading: 100.000%|██████████| 714/714 elapsed<00:01 remaining<00:00<00:03,  1.08it/s]
Model export complete: 100%|██████████| 6/6 [00:08<00:00,  1.46s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/194708/models/sthlm_incidents_model/3





Model(name: 'sthlm_incidents_model', version: 3)