In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
# Run metadata (appears to be injected by the Hex runtime): decodes the JSON literal `false` to Python False.
# Presumably True when the run was started by a schedule — confirm against Hex docs.
hex_scheduled = _hex_json.loads("false")

In [None]:
# Run metadata: JSON-decoded string; presumably the email of the user running the project — confirm.
hex_user_email = _hex_json.loads("\"example-user@example.com\"")

In [None]:
# Run metadata: JSON-decoded string "logic"; presumably identifies which Hex view triggered the run — confirm.
hex_run_context = _hex_json.loads("\"logic\"")

In [None]:
# Run metadata: JSON-decoded time zone name string.
hex_timezone = _hex_json.loads("\"America/Mexico_City\"")

In [None]:
# Run metadata: JSON-decoded UUID-formatted string; presumably the Hex project identifier — confirm.
hex_project_id = _hex_json.loads("\"8d44a9cb-0e91-4841-ad4b-e2df89a64211\"")

In [None]:
# Run metadata: JSON-decoded project name string.
hex_project_name = _hex_json.loads("\"3_euro2024_modeling\"")

In [None]:
# Run metadata: JSON-decoded status string; empty here.
hex_status = _hex_json.loads("\"\"")

In [None]:
# Run metadata: JSON-decoded list of categories; empty here.
hex_categories = _hex_json.loads("[]")

In [None]:
# Run metadata: JSON-decoded list of ten hex colour strings; presumably the default chart palette — confirm.
hex_color_palette = _hex_json.loads("[\"#4C78A8\",\"#F58518\",\"#E45756\",\"#72B7B2\",\"#54A24B\",\"#EECA3B\",\"#B279A2\",\"#FF9DA6\",\"#9D755D\",\"#BAB0AC\"]")

## Model Training and Evaluation
---
- It's time to delve into the ML side! 
- Let's explore how we can train a model using Snowpark ML

---

🛑 **But hold on!** 🛑

Before we get started, make sure you have added the following packages from the `Packages` drop-down:
- `snowflake-ml-python == 1.5.0`
- `fastparquet == 2023.8.0`



In [None]:
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import Window
from snowflake.snowpark import functions as F   
from snowflake.snowpark.functions import udf, udtf
from snowflake.snowpark.types import IntegerType, FloatType, StringType, StructField, StructType, DateType
    
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Obtain a Snowpark session from the Hex data connection named 'SCS-SIMON-EURO2024'.
# `session` is the handle every later cell uses to talk to Snowflake.
import hextoolkit
hex_snowflake_conn = hextoolkit.get_data_connection('SCS-SIMON-EURO2024')
session = hex_snowflake_conn.get_snowpark_session()

In [None]:
# Add version tracking: tag every query issued through this session so the
# notebook's activity can be identified in Snowflake's query history.
# NOTE(review): "version" is a plain string whose contents are not valid JSON
# (unquoted keys), and the dict is assigned to `query_tag` as-is rather than as a
# JSON string — confirm this is the tag format that downstream tooling expects.
app_tag = {
    "origin": "sf_sit",
    "name": "hol_sport_predict",
    "version": '{major: 1, minor: 0}'
}

session.query_tag = app_tag

In [None]:
# Name of the current Snowflake user (first column of the single returned row).
# It is interpolated into the per-user table name `final_data_<user_name>` when the
# training data is loaded.
# NOTE(review): assumes the user name is a valid unquoted identifier suffix — confirm.
user_name = session.sql('select current_user()').collect()[0][0]

In [None]:
# FUNCTION used to iterate the model version so we can automatically create the next version number

import ast

def get_next_version(reg, model_name) -> str:
    """
    Return the next version name for a model, based on the versions already in the registry.

    The numeric suffix of every existing version name (the text after the last
    underscore, e.g. ``10`` in ``V_10``) is parsed, and the result is one more than
    the largest suffix found.

    Args:
        reg: The registry object; must expose ``show_models()`` returning a pandas
            DataFrame with ``name`` and ``versions`` columns.
        model_name: The name of the model, matched exactly against the ``name`` column.

    Returns:
        str: The next version of the model in the format "V_<version_number>".
        "V_1" is returned when the registry is empty, when the model is not
        registered yet, or when none of its versions has a numeric suffix.

    Raises:
        ValueError, SyntaxError: If the ``versions`` entry is a string that cannot
            be parsed as a Python/JSON literal.
    """
    models = reg.show_models()
    if models.empty:
        return "V_1"

    matching_versions = models.loc[models["name"] == model_name, "versions"]
    if matching_versions.empty:
        return "V_1"

    raw_versions = matching_versions.values[0]
    if isinstance(raw_versions, str):
        # The registry reports versions as a serialized list, e.g. '["V_1","V_2"]'.
        version_names = ast.literal_eval(raw_versions)
    elif isinstance(raw_versions, (list, tuple)):
        # Already deserialized; use as-is.
        version_names = raw_versions
    else:
        # Missing / null entry: treat as "no versions yet".
        version_names = []

    version_numbers = []
    for version in version_names:
        try:
            version_numbers.append(int(str(version).split("_")[-1]))
        except ValueError:
            # Version names without a numeric suffix (e.g. manually named
            # versions) do not take part in the numbering; skip them.
            continue

    # default=0 makes a model without any numbered version start at V_1
    # instead of failing on max() of an empty sequence.
    return f"V_{max(version_numbers, default=0) + 1}"

In [None]:
# Load this user's prepared data set and check how balanced the two outcomes are.
# Rows with a rank difference of 0 (or no rank difference at all) are dropped first;
# there shouldn't be any, but they carry no signal for the model.

rank_difference = F.col('team_1_vs_team_2_rank')
has_rank_difference = (rank_difference != 0) & (rank_difference.is_not_null())

df_training = session.table(f'final_data_{user_name}').filter(has_rank_difference)

# number of games per outcome, ordered by outcome
(
    df_training
    .group_by('game_outcome')
    .agg(F.count('ID'))
    .sort(F.col('game_outcome'))
    .show()
)

--------------------------------
|"GAME_OUTCOME"  |"COUNT(ID)"  |
--------------------------------
|0               |7157         |
|1               |6806         |
--------------------------------



In [None]:
# We want to do some hyperparameter tuning; to speed things up, let's size up our warehouse.
# Note - this is only temporary: a later cell sets the size back to xsmall.
# NOTE(review): if the tuning cell fails, the warehouse stays at xxlarge until the
# scale-down cell is run manually.

session.sql('''
    alter warehouse euro2024_wh set warehouse_size = xxlarge
''').collect()

[Row(status='Statement executed successfully.')]

In [None]:
# Run hyperparameter tuning to find the best parameters.
# This should take about 1 min 20s on a 2XL warehouse.
#
# The grid is 6 x 6 x 7 = 252 combinations; with 5 folds that's 1,260 fits!

from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.model_selection.grid_search_cv import GridSearchCV

# tune on the full filtered data set
train_data = df_training

# every column except the label and the row identifier is a feature
LABEL_COLS = ["GAME_OUTCOME"]
FEATURE_COLS = [col for col in train_data.columns if col not in ("GAME_OUTCOME", "ID")]

hyperparam_grid = dict(
    n_estimators=[50, 100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4],
    max_depth=[3, 4, 5, 6, 7, 8, 9],
)

# step 1: scale the feature columns in place (same input and output names)
scaler_step = (
    "scaler",
    StandardScaler(input_cols=FEATURE_COLS, output_cols=FEATURE_COLS),
)

# step 2: grid search over the XGBoost parameters, scored on accuracy
search_step = (
    "GridSearchCV",
    GridSearchCV(
        estimator=XGBClassifier(random_state=42),
        param_grid=hyperparam_grid,
        scoring='accuracy',
        label_cols=LABEL_COLS,
        input_cols=FEATURE_COLS,
    ),
)

pipeline = Pipeline(steps=[scaler_step, search_step])
pipeline.fit(train_data)

# the fitted search object is the last step of the converted scikit-learn pipeline
sklearn_hp = pipeline.to_sklearn()
fitted_search = sklearn_hp.steps[-1][1]
optimal_params = fitted_search.best_params_
score_dict = {"best_accuracy": fitted_search.best_score_}

print(score_dict)
print(optimal_params)

{'best_accuracy': 0.7159645686053037}
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}


In [None]:
# Now we can scale the warehouse back down to xsmall, in a matter of seconds.

session.sql('''
    alter warehouse euro2024_wh set warehouse_size = xsmall
''').collect()

[Row(status='Statement executed successfully.')]

In [None]:
# Taking our optimal parameters, we're going to build our model.
# NOTE(review): `optimal_params` was tuned on all of `df_training`, which includes the
# rows that end up in `test_data` below, so the eval accuracy may be slightly optimistic.

from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBClassifier
# import only the metric we use; a wildcard import would silently shadow any
# same-named helpers (e.g. from scikit-learn) already in the notebook namespace
from snowflake.ml.modeling.metrics import accuracy_score

# seeded 80/20 train/test split
train_data, test_data = df_training.random_split(weights=[0.8, 0.2], seed=0)

# every column except the label and the row identifier is a feature
FEATURE_COLS = [c for c in train_data.columns if c != "GAME_OUTCOME" and c != "ID"]
LABEL_COLS = ["GAME_OUTCOME"]

pipeline = Pipeline(
    steps = [
        (
            "scaler", 
            StandardScaler(
                input_cols=FEATURE_COLS, 
                output_cols=FEATURE_COLS
            )
        ),
        (
            "model", 
            XGBClassifier(
                input_cols=FEATURE_COLS, 
                label_cols=LABEL_COLS,
                max_depth=optimal_params['max_depth'],
                n_estimators=optimal_params['n_estimators'],
                learning_rate=optimal_params['learning_rate'],
                # same seed as the estimator used during the grid search, so the
                # final model is built under the same conditions and is reproducible
                random_state=42
            )
        )
    ]
)

pipeline.fit(train_data)

# get the model accuracy on both splits; predictions are read from the
# OUTPUT_GAME_OUTCOME column of the frames returned by predict()
predict_on_training_data = pipeline.predict(train_data)
training_accuracy = accuracy_score(
    df=predict_on_training_data,
    y_true_col_names=LABEL_COLS,
    y_pred_col_names=["OUTPUT_GAME_OUTCOME"]
)
predict_on_test_data = pipeline.predict(test_data)
eval_accuracy = accuracy_score(
    df=predict_on_test_data,
    y_true_col_names=LABEL_COLS,
    y_pred_col_names=["OUTPUT_GAME_OUTCOME"]
)

print(f"Training accuracy: {training_accuracy} \nEval accuracy: {eval_accuracy}")

Training accuracy: 0.717315 
Eval accuracy: 0.720536


## Model Registry
---

- Once the model is ready, we'll use it to predict the results of the group stage.
- Save the model using MLOps Model Registry features.

In [None]:
from snowflake.ml.registry import Registry

model_name = "EURO_24_GAME_PREDICT"

# open the registry on the current session and work out the next free version name
reg = Registry(session=session)
model_version = get_next_version(reg, model_name)

# accuracy figures stored alongside the model version
logged_metrics = {
    'training_accuracy': training_accuracy,
    'eval_accuracy': eval_accuracy,
}

# NOTE(review): presumably keeps exact dependency versions and bundles the local
# snowflake-ml library with the model — confirm against the Registry documentation
logging_options = {
    'relax_version': False,
    'embed_local_ml_library': True,
}

reg.log_model(
    model=pipeline,
    model_name=model_name,
    version_name=model_version,
    metrics=logged_metrics,
    options=logging_options,
)

# make the version we just logged the model's default
m = reg.get_model(model_name)
m.default = model_version

In [None]:
# Let's see the versions of our model that are stored in the registry.

reg.get_model(model_name).show_versions()

Unnamed: 0,created_on,name,comment,database_name,schema_name,module_name,is_default_version,functions,metadata,user_data
0,2024-05-13 16:42:59.402000-07:00,V_1,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.725744, ""e...","{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
1,2024-05-13 16:49:10.592000-07:00,V_2,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.725133, ""e...","{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
2,2024-05-14 03:12:23.868000-07:00,V_3,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.724703, ""e...","{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
3,2024-05-14 03:17:23.600000-07:00,V_4,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.722397, ""e...","{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
4,2024-05-21 13:21:28.526000-07:00,V_5,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.721912, ""e...","{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
5,2024-05-21 16:50:37.339000-07:00,V_6,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.720032, ""e...","{""snowpark_ml_data"":{""functions"":[{""name"":""PRE..."
6,2024-05-21 17:29:27.036000-07:00,V_7,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.719614, ""e...",{}
7,2024-05-24 05:48:45.993000-07:00,V_8,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.71778, ""ev...",{}
8,2024-05-29 07:09:14.830000-07:00,V_9,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.723598, ""e...",{}
9,2024-05-29 09:18:35.918000-07:00,V_10,,EURO2024,PUBLIC,EURO_24_GAME_PREDICT,False,"[""PREDICT_PROBA"",""PREDICT""]","{""metrics"": {""training_accuracy"": 0.723504, ""e...",{}


# Summary

We now have a model in our registry that we can call from either Snowpark or SQL; we'll use it in the predictions notebook.