In [2]:
# Import packages that are useful
import os

import hopsworks
import joblib
import numpy as np
import pandas as pd
from hsml.model_schema import ModelSchema
from hsml.schema import Schema
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Log into Hopsworks and get a handle to the project's feature store.
# `project` is reused further down to reach the model registry.
project = hopsworks.login()
fs = project.get_feature_store()


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/194708
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# Fetch the incidents feature group and build (or reuse) a feature view that
# selects all of its columns.
incidents_fg = fs.get_feature_group(name="sthlm_incidents", version=1)
query = incidents_fg.select_all()

# Declare the label explicitly: if the view has to be created by this call,
# omitting `labels` would leave the y_* frames of the split below without a
# target column.
# NOTE(review): assumes the target column is `duration`, as in the feature-view
# cell further down in this notebook — confirm against the feature group schema.
feature_view = fs.get_or_create_feature_view(
    name="incidents_view",
    version=1,
    query=query,
    labels=["duration"],
    description="Incidents view",
)

In [5]:
# Split the feature view into train / validation / test sets, holding out
# 10% of the rows for validation and 10% for test.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = feature_view.train_validation_test_split(
    validation_size=0.1,
    test_size=0.1,
)

Finished: Reading data from Hopsworks, using ArrowFlight (0.95s) 




# Linear Regression Model

In [6]:
# Baseline: ordinary least-squares linear regression with default settings.
lr = LinearRegression().fit(X_train, y_train)

# Print the score (R^2) on the training split first, then on the validation split.
for features, target in ((X_train, y_train), (X_val, y_val)):
    print(lr.score(features, target))

0.732820871775085
0.8437784080417212


# Multi Layer Perceptron Model

In [16]:
# Try a Multi-layer Perceptron regressor.
# The weights are initialised randomly, so pin the seed to make the printed
# scores reproducible across kernel restarts.
# NOTE(review): MLPs are sensitive to feature scale; the inputs are not scaled
# here, which may explain the low scores — consider adding a scaler.
mlp = MLPRegressor(random_state=42)
mlp.fit(X_train, y_train)

# Score (R^2) on the training split, then on the validation split
print(mlp.score(X_train, y_train))
print(mlp.score(X_val, y_val))



0.07927534095425093
0.27072706950530145


# Random Forest Model

In [17]:
# Try a Random Forest regressor.
# Bootstrapping and feature sampling are random, so pin the seed to make the
# printed scores reproducible across kernel restarts.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Score (R^2) on the training split, then on the validation split
print(rf.score(X_train, y_train))
print(rf.score(X_val, y_val))



0.9760640949627704
0.9558151575290581


# Gradient Boosting Model

In [18]:
# Try a Gradient Boosting regressor.
# Pin the seed so the fitted trees, and therefore the printed scores, are
# reproducible across kernel restarts.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

# Score (R^2) on the training split, then on the validation split
print(gb.score(X_train, y_train))
print(gb.score(X_val, y_val))

0.950046451329132
0.9400185188205564




# Decision Tree Model

In [19]:
# Try a Decision Tree regressor.
# scikit-learn permutes the candidate features at every split, so ties can be
# broken differently between runs; pin the seed for reproducible scores.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Score (R^2) on the training split, then on the validation split
print(dt.score(X_train, y_train))
print(dt.score(X_val, y_val))

0.9987207476706265
0.9259397204019767


# K-Nearest Neighbors regressor


In [20]:
# Fit a k-nearest-neighbours regressor with default hyperparameters.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# Compute the score (R^2) on each split, then print training before validation.
knn_train_score = knn.score(X_train, y_train)
knn_val_score = knn.score(X_val, y_val)
print(knn_train_score)
print(knn_val_score)

0.722523875115411
0.3344656163652052


# Support Vector Machine regressor

In [23]:
# Fit a support vector regressor with default hyperparameters.
svr = SVR()
svr.fit(X_train, y_train)

# One score (R^2) per line: training split first, validation split second.
print(
    svr.score(X_train, y_train),
    svr.score(X_val, y_val),
    sep="\n",
)

0.23694539972620476
0.04752149744597123




# XGBoost regressor


In [22]:
# Fit an XGBoost regressor with default hyperparameters.
xgb = XGBRegressor().fit(X_train, y_train)

# Print the score (R^2) on the training split first, then on the validation split.
for split_X, split_y in [(X_train, y_train), (X_val, y_val)]:
    print(xgb.score(split_X, split_y))

0.9966180591833564
0.9481705023780881


# We compare the training and validation scores of the models above and pick the best one, checking that it is not overfitted.

In [24]:
# Evaluate the SVR model on the held-out test set (score is R^2).
# NOTE(review): of the models fitted above, SVR has the lowest validation score
# and the random forest the highest — confirm SVR is the intended choice here.
svr.score(X_test, y_test)

0.222158355945844

In [27]:
# Register the trained model in the Hopsworks model registry.
# NOTE(review): SVR has the lowest validation score of the models tried above
# (the random forest has the highest) — confirm this is the model that should
# be registered.
model = svr
mr = project.get_model_registry()

# Local directory whose contents are uploaded together with the model
model_dir = "sthlm_incidents_model"
os.makedirs(model_dir, exist_ok=True)

# Serialize the fitted model to disk
joblib.dump(model, os.path.join(model_dir, "sthlm_model.pkl"))

# Specify the schema of the model's input/output using the features (X_train) and labels (y_train)
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Test-set metrics. `.score()` returns R^2, not an error, so it is reported
# under its own key; "MSE" now holds the actual mean squared error.
test_r2 = model.score(X_test, y_test)
test_mse = mean_squared_error(y_test, model.predict(X_test))

# Create an entry in the model registry that includes the model's name, desc, metrics
incidents_model = mr.python.create_model(
    name="sthlm_incidents_model",
    metrics={"MSE": test_mse, "R2": test_r2},
    model_schema=model_schema,
    description="Stockholm Incident Duration Model"
)

# Upload the model to the model registry, including all files in 'model_dir'
incidents_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


Uploading: 100.000%|██████████| 36740/36740 elapsed<00:01 remaining<00:001,  2.92it/s]
Uploading: 100.000%|██████████| 714/714 elapsed<00:01 remaining<00:00<00:03,  1.00it/s]
Model export complete: 100%|██████████| 6/6 [00:08<00:00,  1.48s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/194708/models/sthlm_incidents_model/1





Model(name: 'sthlm_incidents_model', version: 1)

In [None]:
# NOTE(review): `df` is not defined in any cell visible in this notebook; it
# presumably comes from a data-loading cell that is no longer here — confirm
# before relying on Restart & Run All.

# Disabled preprocessing steps, kept for reference:
# use the label encoder to encode the description feature
# le = LabelEncoder()
# le.fit(df["description"])
# df["description"] = le.transform(df["description"])

# Remove the id column
#df = df.drop("id", axis=1)

# Show summary statistics of the frame
df.describe()

Unnamed: 0,code,hour,iconCategory,latitude,longitude,magnitudeOfDelay,month,duration
count,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110
mean,197.289189,13.756757,6.096396,59.319866,18.057985,1.85045,11.905405,11 days 14:13:02.545045045
std,255.011017,5.095333,1.348507,0.036881,0.041765,1.068094,0.808581,114 days 00:42:07.801110800
min,101.0,0.0,1.0,59.198197,17.870475,0.0,1.0,0 days 00:08:00
25%,101.0,11.0,6.0,59.296146,18.025949,1.0,12.0,0 days 01:05:06.250000
50%,108.0,14.0,6.0,59.32093,18.057546,2.0,12.0,0 days 01:08:36.500000
75%,115.0,17.0,6.0,59.34515,18.088704,3.0,12.0,0 days 01:22:46.750000
max,1472.0,23.0,14.0,59.446342,18.287681,4.0,12.0,1617 days 01:00:00


In [None]:
# Express the duration as a whole number of seconds, then replace the column
# with its natural logarithm.
duration_seconds = df["duration"].dt.total_seconds().astype("int64")
df["duration"] = np.log(duration_seconds)

# Target: the log-duration. Features: every other column (df keeps `duration`).
y = df["duration"]
X = df.drop(columns="duration")


In [None]:
# Baseline check: fit a linear regression on a random hold-out split of X / y
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Pin the seed so the split, and therefore the reported score, is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# train and test the model
lr = LinearRegression()
lr.fit(X_train, y_train)

# R^2 on the held-out split
lr.score(X_test, y_test)

0.8720942193041452

In [None]:
import hopsworks

# Log in to Hopsworks and get a handle to the project's feature store.
# `project` is reused further down to reach the model registry.
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/194708
Connected. Call `.close()` to terminate connection gracefully.


In [None]:
# Normalise the column names: lowercase, with spaces turned into underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# The feature group was created once with the call below; afterwards it is
# only fetched and appended to.
# fg = fs.create_feature_group("incidents", version=1, description="Incidents in the city of Stockholm", 
#                              primary_key=df.columns.tolist(),)
fg = fs.get_feature_group("incidents", version=1)

# Write the rows of df into the feature group
fg.insert(df)

Uploading Dataframe: 100.00% |██████████| Rows 1110/1110 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: incidents_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/194708/jobs/named/incidents_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x21680730f50>, None)

In [None]:
# Build (or fetch) a feature view over every column of the feature group,
# with `duration` declared as the label.
query = fg.select_all()
feature_view = fs.get_or_create_feature_view(
    "incidents_view",
    version=1,
    description="Incidents in the city of Stockholm",
    labels=["duration"],
    query=query,
)

# Hold out 10% of the rows for validation and 10% for test.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = feature_view.train_validation_test_split(validation_size=0.1, test_size=0.1)


Finished: Reading data from Hopsworks, using ArrowFlight (0.94s) 




In [None]:
# Fit a linear regression on the training split from the feature view
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)

# Keep the test-split score (R^2) for the model-registry cell
metric = lr.score(X_test, y_test)

In [None]:
# Upload the linear regression model to the Hopsworks model registry
mr = project.get_model_registry()

# Local directory whose contents are uploaded together with the model
model_dir = "stockholm_incidents_model"
os.makedirs(model_dir, exist_ok=True)

# Save the model
joblib.dump(lr, os.path.join(model_dir, "stckhlm_inc_model.pkl"))

# Specify the input and output
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# `metric` comes from lr.score(), which is R^2 rather than an error, so it is
# reported under its own key; "MSE" now holds the actual mean squared error
# on the test split.
test_mse = mean_squared_error(y_test, lr.predict(X_test))

# Create an entry
stockholm_model = mr.python.create_model(
    name="stockholm_incidents_model",
    metrics={"MSE": test_mse, "R2": metric},
    model_schema=model_schema,
    description="Stockholm Incident duration Predictor"
)

# Upload the model to the model registry, including all files in 'model_dir'
stockholm_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


Uploading: 100.000%|██████████| 1071/1071 elapsed<00:01 remaining<00:00:01,  3.27it/s]
Uploading: 100.000%|██████████| 714/714 elapsed<00:01 remaining<00:00<00:04,  1.10s/it]
Model export complete: 100%|██████████| 6/6 [00:09<00:00,  1.55s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/194708/models/stockholm_incidents_model/1





Model(name: 'stockholm_incidents_model', version: 1)