# Training pipeline

Create a model that predicts Premier League player scores using the previously created feature group.

# Imports

In [1]:
# imports
import os
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import hopsworks

import warnings
warnings.filterwarnings("ignore")

## Connect to Hopsworks feature store

In [2]:
# Use HOPSWORKS_API_KEY from the environment when it is already set; otherwise
# fall back to the local key file so the secret never appears in the notebook.
if not os.environ.get("HOPSWORKS_API_KEY"):
    with open('./hopsworks/hopsworks-api-key.txt', 'r') as file:
        # rstrip() removes the trailing newline most editors append to the file
        os.environ["HOPSWORKS_API_KEY"] = file.read().rstrip()

# Log in to Hopsworks and show the project description as a sanity check
project = hopsworks.login()
print(project.description)

2024-12-28 20:54:40,839 INFO: Initializing external client
2024-12-28 20:54:40,841 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-28 20:54:42,959 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159321
Default project


In [3]:
# Handle to the project's feature store
fs = project.get_feature_store()

# Look up version 1 of the 'player_features' feature group
player_fg = fs.get_feature_group(name='player_features', version=1)

# Create feature view

In [5]:
# Build a query selecting every column of the feature group, then preview ten rows
selected_features = player_fg.select_all()
selected_features.show(n=10)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.36s) 


Unnamed: 0,id,total_points,gameweek,prev_minutes,prev_goals_scored,prev_assists,prev_clean_sheets,prev_goals_conceded,prev_own_goals,prev_penalties_saved,...,prev_influence,prev_creativity,prev_threat,prev_ict_index,prev_starts,prev_expected_goals,prev_expected_assists,prev_expected_goal_involvements,prev_expected_goals_conceded,prev_in_dreamteam
0,74,3,18,90.0,0.0,1.0,1.0,0.0,0.0,0.0,...,37.2,21.6,8.0,6.7,1.0,0.0,0.26,0.26,2.26,False
1,410,1,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,208,0,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,184,1,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,659,0,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,340,0,13,90.0,0.0,0.0,0.0,4.0,0.0,0.0,...,11.4,4.1,11.0,2.7,1.0,0.08,0.03,0.11,2.51,False
6,388,2,4,90.0,0.0,0.0,0.0,3.0,0.0,0.0,...,20.4,30.8,0.0,5.1,1.0,0.0,0.05,0.05,3.0,False
7,327,1,14,90.0,0.0,1.0,1.0,0.0,0.0,0.0,...,2.0,13.1,4.0,1.9,1.0,0.0,0.11,0.11,0.77,False
8,377,2,16,25.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.2,10.5,0.0,1.8,0.0,0.0,0.01,0.01,0.0,False
9,586,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [6]:
# Fetch version 1 of the 'player_score_fv' feature view, creating it from the
# selected-features query when needed; 'total_points' is declared as the label.
feature_view = fs.get_or_create_feature_view(
    name='player_score_fv',
    version=1,
    query=selected_features,
    labels=['total_points'],
    description="player data with score label",
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1159321/fs/1150024/fv/player_score_fv/version/1


# Split data into training and test set

In [7]:
# Split the feature view's data into features/labels, holding out 20% for testing
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.25s) 


In [8]:
# Display the training feature frame returned by the split
X_train

Unnamed: 0,id,gameweek,prev_minutes,prev_goals_scored,prev_assists,prev_clean_sheets,prev_goals_conceded,prev_own_goals,prev_penalties_saved,prev_penalties_missed,...,prev_influence,prev_creativity,prev_threat,prev_ict_index,prev_starts,prev_expected_goals,prev_expected_assists,prev_expected_goal_involvements,prev_expected_goals_conceded,prev_in_dreamteam
0,74,18,90.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,37.2,21.6,8.0,6.7,1.0,0.00,0.26,0.26,2.26,False
1,410,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,False
4,659,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,False
5,340,13,90.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,11.4,4.1,11.0,2.7,1.0,0.08,0.03,0.11,2.51,False
7,327,14,90.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,2.0,13.1,4.0,1.9,1.0,0.00,0.11,0.11,0.77,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11387,629,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,False
11388,472,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,False
11389,50,2,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.2,0.3,0.0,0.5,0.0,0.00,0.00,0.00,1.04,False
11390,590,17,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.2,10.3,0.0,1.2,0.0,0.00,0.02,0.02,0.00,False


In [9]:
# Remove the 'id' and 'gameweek' columns so that only the 'prev_*' columns are
# passed to the model.
# NOTE(review): which of the two is the primary key and which is the event time
# is not visible in this notebook -- confirm against the feature group definition.
train_features = X_train.drop(columns=['id', 'gameweek'])
test_features = X_test.drop(columns=['id', 'gameweek'])

In [10]:
# Display the training labels returned by the split
y_train

Unnamed: 0,total_points
0,3
1,1
4,0
5,0
7,1
...,...
11387,0
11388,0
11389,1
11390,0


# Modeling

In [12]:
# Show the dtypes as loaded so the object-typed columns are visible
print(train_features.dtypes)


def _object_columns_to_float(frame):
    """Return a copy of ``frame`` with every object-dtype column cast to float64.

    Columns of any other dtype are left unchanged.
    """
    object_columns = frame.select_dtypes(include='object').columns
    return frame.astype({column: 'float64' for column in object_columns})


# XGBoost only accepts int, float, bool or category columns, so the
# object-typed columns are cast to float before fitting and predicting.
train_features = _object_columns_to_float(train_features)
test_features = _object_columns_to_float(test_features)

prev_minutes                       float64
prev_goals_scored                  float64
prev_assists                       float64
prev_clean_sheets                  float64
prev_goals_conceded                float64
prev_own_goals                     float64
prev_penalties_saved               float64
prev_penalties_missed              float64
prev_yellow_cards                  float64
prev_red_cards                     float64
prev_saves                         float64
prev_bonus                         float64
prev_bps                           float64
prev_influence                      object
prev_creativity                     object
prev_threat                         object
prev_ict_index                      object
prev_starts                        float64
prev_expected_goals                 object
prev_expected_assists               object
prev_expected_goal_involvements     object
prev_expected_goals_conceded        object
prev_in_dreamteam                     bool
dtype: object

In [11]:
# Build an XGBoost regressor with its default hyperparameters
xgb_regressor = XGBRegressor()

# Train it on the prepared training features against the training labels
xgb_regressor.fit(train_features, y_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:prev_influence: object, prev_creativity: object, prev_threat: object, prev_ict_index: object, prev_expected_goals: object, prev_expected_assists: object, prev_expected_goal_involvements: object, prev_expected_goals_conceded: object

In [None]:
# Score the held-out test features with the trained regressor
y_pred = xgb_regressor.predict(test_features)

# Mean squared error and R squared, both measured against the first column of y_test
mse = mean_squared_error(y_test.iloc[:, 0], y_pred)
r2 = r2_score(y_test.iloc[:, 0], y_pred)

print("MSE:", mse)
print("R squared:", r2)

In [None]:
# Build the plotting frame as a new DataFrame: `df = y_test` would only alias
# y_test, so adding the prediction column would silently modify y_test as well.
df = y_test.assign(predicted_score=y_pred)

In [None]:
# Create the model artifact directory and its images subdirectory.
# makedirs(..., exist_ok=True) creates any missing parent and does nothing when
# the directories already exist, so the cell is safe to re-run.
model_dir = "player_score_model"
images_dir = model_dir + "/images"
os.makedirs(images_dir, exist_ok=True)

In [None]:
from functions import util

file_path = images_dir + "/player_score_hindcast.png"
# Keep the return value under its own name: rebinding `plt` would shadow the
# matplotlib.pyplot module imported at the top of the notebook for later cells.
hindcast_plot = util.plot_player_scores_forecast("FPL Player scores", df, file_path, hindcast=True)
hindcast_plot.show()

In [None]:
# Plot the four most important features; plot_importance returns the Axes it drew on
importance_ax = plot_importance(xgb_regressor, max_num_features=4)
feature_importance_path = images_dir + "/feature_importance.png"
# Save through the Axes' own figure so the file always contains this plot,
# regardless of which figure pyplot currently considers active.
importance_ax.figure.savefig(feature_importance_path)
plt.show()

# Model registry

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model inputs from train_features -- the frame the regressor was
# fitted on -- rather than X_train, which still contains the dropped 'id' and
# 'gameweek' columns. The output schema is taken from the label frame.
input_schema = Schema(train_features)
output_schema = Schema(y_train)

# Combine the input and output schemas into a single model schema
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Dictionary representation of the model schema
schema_dict = model_schema.to_dict()

In [None]:
# Write the trained regressor to model.json inside the model directory
model_json_path = f"{model_dir}/model.json"
xgb_regressor.save_model(model_json_path)

In [None]:
# Evaluation metrics, stringified, keyed by the names used in the model registry
res_dict = {
    metric_name: str(metric_value)
    for metric_name, metric_value in (("MSE", mse), ("R squared", r2))
}

In [None]:
mr = project.get_model_registry()

# Create a Python model named 'player_score_xgboost_model' in the model registry.
# The input example is drawn from test_features (the columns the regressor was
# fitted on) rather than X_test, which still carries 'id' and 'gameweek'.
# NOTE: the variable name `aq_model` is kept unchanged in case other cells use it.
aq_model = mr.python.create_model(
    name="player_score_xgboost_model",
    metrics=res_dict,
    model_schema=model_schema,
    input_example=test_features.sample().values,
    description="Player score predictor",
)

# Save the contents of the local model directory to the model registry
aq_model.save(model_dir)