## Model Training and Evaluation
---
- It's time to delve into the ML side! 
- Let's explore how we can train a model using Snowpark ML

---

🛑 **But hold on!** 🛑

Before we get started, make sure you have added the following packages from the `Packages` drop-down:
- `snowflake-ml-python == 1.5.0`
- `fastparquet == 2023.8.0`



In [None]:
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import Window
from snowflake.snowpark import functions as F   
from snowflake.snowpark.functions import udf, udtf
from snowflake.snowpark.types import IntegerType, FloatType, StringType, StructField, StructType, DateType
    
import pandas as pd
import numpy as np
import streamlit as st

import warnings
warnings.filterwarnings('ignore')

In [None]:
from snowflake.snowpark.context import get_active_session
# Attach to the session that is already active for this notebook.
session = get_active_session()

# Query tag identifying this lab and its version; it is set on the session
# so that it accompanies the queries issued from here on.
app_tag = dict(
    origin="sf_sit",
    name="hol_sport_predict",
    version='{major: 1, minor: 0}',
)
session.query_tag = app_tag

In [None]:
# Name of the current Snowflake user; it is used later to build the name of
# the training table.
current_user_rows = session.sql('select current_user()').collect()
user_name = current_user_rows[0][0]

In [None]:
# Helper that increments the model version so the next version name is generated automatically

import ast

def get_next_version(reg, model_name) -> str:
    """
    Work out the version name to use for the next registration of a model.

    Version names are expected to end in "_<number>" (for example "V_3").
    The result is "V_<n + 1>", where n is the highest number among the
    versions already listed for the model, or "V_1" when the registry has
    no models at all or none with this name.

    Args:
        reg: Registry object whose ``show_models()`` returns a DataFrame
            with "name" and "versions" columns.
        model_name: Name of the model, matched exactly against the "name"
            column.

    Returns:
        str: The next version name, in the form "V_<version_number>".

    Raises:
        ValueError: If the model's version list is empty, or a version name
            does not end in an integer after its last underscore.
    """
    registered = reg.show_models()

    # Nothing registered yet, or nothing under this name: start counting at 1.
    if registered.empty or model_name not in registered["name"].to_list():
        return "V_1"

    # The "versions" cell holds the version names as the text of a list
    # literal, so it is parsed into a real list before inspecting it.
    versions_cell = registered.loc[registered["name"] == model_name, "versions"]
    version_names = ast.literal_eval(versions_cell.values[0])

    # The numeric part is whatever follows the final underscore.
    highest = max(int(name.split("_")[-1]) for name in version_names)
    return f"V_{highest + 1}"

In [None]:
# Load the training table for the current user.
df_training = session.table(f'final_data_{user_name}')

# Keep only games that have a rank difference: drop rows where it is zero
# or missing (there should not be any).
rank_diff = F.col('team_1_vs_team_2_rank')
has_rank_diff = (rank_diff != 0) & (rank_diff.is_not_null())
df_training = df_training.filter(has_rank_diff)

# Row count per outcome, to check how balanced our data set is.
(
    df_training
    .group_by('game_outcome')
    .agg(F.count('ID'))
    .sort(F.col('game_outcome'))
)

In [None]:
-- Hyperparameter tuning comes next; to speed it up, let's size up our warehouse.
-- NOTE: this is only temporary.

ALTER WAREHOUSE euro2024_wh SET WAREHOUSE_SIZE = XXLARGE

In [None]:
# and now let's run hyperparameter tuning to get the best parameters
# this should take about 1 min 20s on a 2XL warehouse

# the hyperparameter grid is 6x6x7 combinations; with 5 folds that's 1,260 model fits!

from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.model_selection.grid_search_cv import GridSearchCV

# Tune on the full filtered data set.
# NOTE(review): this includes the rows that the model-building cell later
# holds out as test data, so the evaluation accuracy reported there may be
# optimistic - confirm whether that is acceptable.
train_data = df_training

# Every column except the label and the ID column is used as a feature.
NON_FEATURE_COLS = ("GAME_OUTCOME", "ID")
FEATURE_COLS = [c for c in train_data.columns if c not in NON_FEATURE_COLS]
LABEL_COLS = ["GAME_OUTCOME"]

# Candidate values for each tuned parameter (6 x 6 x 7 combinations).
hyperparam_grid = dict(
    n_estimators=[50, 100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4],
    max_depth=[3, 4, 5, 6, 7, 8, 9],
)

# Step 1: scale the feature columns in place.
scaler_step = (
    "scaler",
    StandardScaler(input_cols=FEATURE_COLS, output_cols=FEATURE_COLS),
)

# Step 2: grid search over the XGBoost classifier, scored on accuracy.
search_step = (
    "GridSearchCV",
    GridSearchCV(
        estimator=XGBClassifier(random_state=42),
        param_grid=hyperparam_grid,
        scoring='accuracy',
        label_cols=LABEL_COLS,
        input_cols=FEATURE_COLS,
    ),
)

pipeline = Pipeline(steps=[scaler_step, search_step])
pipeline.fit(train_data)

# The fitted search object is the last step of the converted pipeline; it
# carries the winning parameter combination and its score.
sklearn_hp = pipeline.to_sklearn()
fitted_search = sklearn_hp.steps[-1][1]
optimal_params = fitted_search.best_params_
score_dict = {"best_accuracy": fitted_search.best_score_}

st.write(score_dict)
st.write(optimal_params)

In [None]:
-- Now we can scale the warehouse back down, in a matter of seconds.

ALTER WAREHOUSE euro2024_wh SET WAREHOUSE_SIZE = XSMALL

In [None]:
# taking our optimal parameters we're going to build our model

from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.metrics import *

# Split the filtered data 80/20 into training and test sets, with a fixed seed.
train_data, test_data = df_training.random_split(weights=[0.8, 0.2], seed=0)

# Every column except the label and the ID column is used as a feature.
FEATURE_COLS = [c for c in train_data.columns if c != "GAME_OUTCOME" and c != "ID"]
LABEL_COLS = ["GAME_OUTCOME"]

# Scale the features in place, then fit an XGBoost classifier with the
# parameters chosen by the grid search (optimal_params).
pipeline = Pipeline(
    steps=[
        (
            "scaler",
            StandardScaler(
                input_cols=FEATURE_COLS,
                output_cols=FEATURE_COLS,
            ),
        ),
        (
            "model",
            XGBClassifier(
                input_cols=FEATURE_COLS,
                label_cols=LABEL_COLS,
                max_depth=optimal_params['max_depth'],
                n_estimators=optimal_params['n_estimators'],
                learning_rate=optimal_params['learning_rate'],
                # Same seed as the estimator used during the grid search, so
                # the final model is configured the way it was tuned.
                random_state=42,
            ),
        ),
    ]
)

pipeline.fit(train_data)

# Accuracy on the data the model was trained on ...
predict_on_training_data = pipeline.predict(train_data)
training_accuracy = accuracy_score(
    df=predict_on_training_data,
    y_true_col_names=LABEL_COLS,
    y_pred_col_names=["OUTPUT_GAME_OUTCOME"],
)

# ... and on the held-out test split.
predict_on_test_data = pipeline.predict(test_data)
eval_accuracy = accuracy_score(
    df=predict_on_test_data,
    y_true_col_names=LABEL_COLS,
    y_pred_col_names=["OUTPUT_GAME_OUTCOME"],
)

# st.write renders strings as Markdown, where a lone newline collapses to a
# space; two trailing spaces before the newline force a real line break.
st.write(f"Training accuracy: {training_accuracy}  \nEval accuracy: {eval_accuracy}")

## Model Registry
---

- Once the model is ready, we'll use it to predict the results of the group stage.
- Save the model using the MLOps Model Registry features.

In [None]:
from snowflake.ml.registry import Registry

# Open the model registry through the current session.
reg = Registry(session=session)

# Register under a fixed model name, using the next free version name.
model_name = "EURO_24_GAME_PREDICT"
model_version = get_next_version(reg, model_name)

# Both accuracy figures are stored with the model version.
model_metrics = {
    'training_accuracy': training_accuracy,
    'eval_accuracy': eval_accuracy,
}

reg.log_model(
    model=pipeline,
    model_name=model_name,
    version_name=model_version,
    metrics=model_metrics,
    options={'relax_version': False},
)

# Make the version that was just logged the model's default.
m = reg.get_model(model_name)
m.default = model_version

In [None]:
# List the versions of our model that are now in the registry.
registered_model = reg.get_model(model_name)
registered_model.show_versions()

# Summary

We now have a model in our registry that we can call from either Snowpark or SQL; we'll use it in the predictions notebook.