In [1]:
# Import packages that are useful
import hopsworks
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
import joblib
import os
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw incident export and trim it down to the columns used for modelling.

# Columns to discard: the weather features (dropped as too sparse, per the original
# note) and the "Unnamed: 0*" columns (presumably stale index columns from earlier
# CSV round-trips — verify against the export script).
unused_columns = [
    "apparent_temperature", "date", "dew_point_2m", "is_day", "precipitation",
    "et0_fao_evapotranspiration", "rain", "snow_depth", "snowfall",
    "soil_temperature_0_to_7cm", "weather_code", "wind_speed_10m",
    "surface_pressure", "sunshine_duration", "relative_humidity_2m",
    "temperature_2m", "vapour_pressure_deficit",
    "Unnamed: 0.4", "Unnamed: 0.3", "Unnamed: 0.2", "Unnamed: 0.1", "Unnamed: 0",
]

df = (
    pd.read_csv("incidents-v1.csv")
    .drop(columns=unused_columns)
    # dropna() without arguments removes every row that has a missing value in ANY
    # remaining column (this includes, but is not limited to, a missing end time).
    .dropna()
)

# Column overview: non-null counts and dtypes after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1110 entries, 0 to 1137
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                1110 non-null   object 
 1   code              1110 non-null   int64  
 2   description       1110 non-null   object 
 3   endTime           1110 non-null   object 
 4   hour              1110 non-null   int64  
 5   iconCategory      1110 non-null   int64  
 6   latitude          1110 non-null   float64
 7   longitude         1110 non-null   float64
 8   magnitudeOfDelay  1110 non-null   int64  
 9   month             1110 non-null   int64  
 10  startTime         1110 non-null   object 
 11  type              1110 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 112.7+ KB


In [4]:
# Build the label: how long each incident lasted, as a Timedelta (end minus start).
end_times = pd.to_datetime(df["endTime"])
start_times = pd.to_datetime(df["startTime"])
df["duration"] = end_times - start_times

# The raw timestamps are no longer needed once the duration exists; "type", "id"
# and "description" are discarded as well, leaving only numeric columns.
df = df.drop(columns=["startTime", "endTime", "type", "id", "description"])


In [5]:
# No label encoding is needed here: the "description" and "id" columns were already
# dropped in the previous cell, so only numeric columns and the duration label remain.

# Summary statistics for every remaining column, including the Timedelta label.
df.describe()

Unnamed: 0,code,hour,iconCategory,latitude,longitude,magnitudeOfDelay,month,duration
count,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110
mean,197.289189,13.756757,6.096396,59.319866,18.057985,1.85045,11.905405,11 days 14:13:02.545045045
std,255.011017,5.095333,1.348507,0.036881,0.041765,1.068094,0.808581,114 days 00:42:07.801110800
min,101.0,0.0,1.0,59.198197,17.870475,0.0,1.0,0 days 00:08:00
25%,101.0,11.0,6.0,59.296146,18.025949,1.0,12.0,0 days 01:05:06.250000
50%,108.0,14.0,6.0,59.32093,18.057546,2.0,12.0,0 days 01:08:36.500000
75%,115.0,17.0,6.0,59.34515,18.088704,3.0,12.0,0 days 01:22:46.750000
max,1472.0,23.0,14.0,59.446342,18.287681,4.0,12.0,1617 days 01:00:00


In [6]:
# Express the Timedelta label as whole seconds, then move it to a natural-log scale.
# The summary above shows a heavily right-skewed label (median about 1 hour, max
# above 1600 days), which the log compresses.
# NOTE: this cell is not re-runnable on its own — after it executes, "duration" is
# a float column and the `.dt` accessor below would fail on a second run.
duration_seconds = df["duration"].dt.total_seconds().astype("int64")
df["duration"] = np.log(duration_seconds)

# Target vector and feature matrix. `df` itself keeps the duration column because
# the full frame is uploaded to the feature store later.
y = df["duration"]
X = df.drop(columns="duration")


In [7]:
# Quick local baseline: ordinary least squares on the in-memory frame, before the
# data goes through the feature store.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# A fixed seed makes the hold-out split — and therefore the score displayed below —
# reproducible across kernel restarts. The split proportions stay at the library
# default, as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2) on the
# held-out rows; the target is the log-scaled duration.
lr.score(X_test, y_test)

0.8720942193041452

In [8]:
# NOTE(review): hopsworks is already imported in the first cell; this re-import is
# redundant but harmless.
import hopsworks

# Log in to Hopsworks. `project` is reused further down to reach the model registry.
project = hopsworks.login()
# Feature store handle, used below to fetch the feature group and the feature view.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/194708
Connected. Call `.close()` to terminate connection gracefully.


In [10]:
# Normalise the column names: lower-case, with spaces replaced by underscores
# (presumably to satisfy feature-store naming rules — confirm against the Hopsworks docs).
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Fetch the "incidents" feature group, creating it on first use. The previous code
# only looked the group up (creation was commented out), so the notebook could not
# run against a project where the group did not exist yet. The creation parameters
# are the ones from the original, commented-out create call.
fg = fs.get_or_create_feature_group(
    name="incidents",
    version=1,
    description="Incidents in the city of Stockholm",
    primary_key=df.columns.tolist(),
)

# Upload the cleaned frame, label column included.
fg.insert(df)

Uploading Dataframe: 100.00% |██████████| Rows 1110/1110 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: incidents_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/194708/jobs/named/incidents_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x21680730f50>, None)

In [11]:
# Feature view over every column of the feature group, with "duration" as the label.
query = fg.select_all()
feature_view = fs.get_or_create_feature_view(
    "incidents_view",
    version=1,
    description="Incidents in the city of Stockholm",
    labels=["duration"],
    query=query,
)

# Split served by the feature view: 10% validation, 10% test, the rest for training.
# These names replace the splits produced by the local baseline cell.
X_train, X_val, X_test, y_train, y_val, y_test = feature_view.train_validation_test_split(
    validation_size=0.1,
    test_size=0.1,
)


Finished: Reading data from Hopsworks, using ArrowFlight (0.94s) 




In [12]:
# Fit ordinary least squares on the training split served by the feature view.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Held-out score. For LinearRegression this is the coefficient of determination
# (R^2) — higher is better; it is not an error measure.
metric = lr.score(X_test, y_test)

In [13]:
# Register the trained linear regression model in the Hopsworks model registry.
from sklearn.metrics import mean_squared_error

mr = project.get_model_registry()

# Local staging directory; every file in it is uploaded together with the model.
model_dir = "stockholm_incidents_model"
os.makedirs(model_dir, exist_ok=True)

# Serialise the fitted estimator into the staging directory.
joblib.dump(lr, os.path.join(model_dir, "stckhlm_inc_model.pkl"))

# Input/output schemas derived from the training features and labels.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# `metric` comes from lr.score(), which is R^2 — it was previously logged under the
# key "MSE". Compute the real mean squared error on the test split (on the
# log-duration scale) and log R^2 under its own name.
test_mse = float(mean_squared_error(y_test, lr.predict(X_test)))

# Create the registry entry.
# NOTE(review): the variable name `wine_model` is a leftover from another example;
# it is kept unchanged in case later cells refer to it.
wine_model = mr.python.create_model(
    name="stockholm_incidents_model",
    metrics={"MSE": test_mse, "R2": float(metric)},
    model_schema=model_schema,
    description="Stockholm Incident duration Predictor"
)

# Upload the model to the model registry, including all files in 'model_dir'.
wine_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


Uploading: 100.000%|██████████| 1071/1071 elapsed<00:01 remaining<00:00:01,  3.27it/s]
Uploading: 100.000%|██████████| 714/714 elapsed<00:01 remaining<00:00<00:04,  1.10s/it]
Model export complete: 100%|██████████| 6/6 [00:09<00:00,  1.55s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/194708/models/stockholm_incidents_model/1





Model(name: 'stockholm_incidents_model', version: 1)