In [1]:
import os
import sys
import zipfile

import cachetools
import joblib
import pandas as pd
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import xgboost as xgb
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score, classification_report
from snowflake.snowpark import Session

# Load credentials from a local .env file; override=True lets values in that file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Fail fast with a readable message instead of handing None credentials to the
# connector, which would otherwise surface as an opaque connection error.
_required_env = ("SNOWFLAKE_ACCOUNT", "SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD")
_missing_env = [name for name in _required_env if not os.environ.get(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Connection settings: credentials come from the environment, everything else is fixed
# for this project.
connection_params = {
    "account": os.environ["SNOWFLAKE_ACCOUNT"],
    "user": os.environ["SNOWFLAKE_USER"],
    "password": os.environ["SNOWFLAKE_PASSWORD"],
    "role": "SYSADMIN",
    "database": "MARCH_MADNESS",
    "warehouse": "MARCH_MADNESS_WH",
    "schema": "COMMON",
}

# Snowpark session shared by every cell below.
session = Session.builder.configs(connection_params).create()

### Data Ingestion

Load the contents of the CSV files into raw tables.

In [2]:
# Path of the competition archive. If it is not present we assume the RAW tables were
# already loaded by an earlier run and skip ingestion entirely.
archive_path = "../data/march-machine-learning-mania-2023.zip"

if os.path.exists(archive_path):
    # The context manager closes the archive even if a load fails part-way through.
    with zipfile.ZipFile(archive_path) as zf:
        for file in zf.filelist:
            # Only CSV members become tables; every other member is ignored.
            if not file.filename.endswith(".csv"):
                continue

            # Parse the member straight from the archive using the ISO-8859-1 encoding.
            with zf.open(file.filename) as z:
                df = pd.read_csv(z, encoding="iso-8859-1")

            # Upper-case the headers before the frame is written to Snowflake.
            df.columns = [col.upper() for col in df.columns]

            # RAW.<member name without its directory and ".csv", upper-cased>
            base_name = file.filename.split("/")[-1]
            table_name = f"RAW.{base_name.replace('.csv', '').upper()}"

            # Replace any existing table of the same name with the fresh contents.
            session.create_dataframe(df).write.save_as_table(
                table_name=table_name, mode="overwrite"
            )

### Feature Engineering

Create features from the raw data for model preparation.

These will be defined as stored procedures.

In [3]:
def prepare_season_averages(
    session: Session, source_table: str, target_table: str
) -> str:
    """Average every per-game statistic by season and team and save the result.

    Each row of ``source_table`` describes one game with W-prefixed columns for the
    winner and L-prefixed columns for the loser. The game is split into one row per
    team (prefix stripped), the two halves are stacked, and every column other than
    SEASON and TEAMID is averaged per (SEASON, TEAMID) into an ``AVG_<column>`` column.

    Args:
        session: Active Snowpark session.
        source_table: Name of the game-level results table to read.
        target_table: Name of the table to create or overwrite with the averages.

    Returns:
        A confirmation message naming ``target_table``.
    """
    # DAYNUM and WLOC are not needed for the averages.
    season_results = session.table(source_table).drop("DAYNUM", "WLOC")

    def _team_perspective(own_prefix: str, opponent_prefix: str):
        # Keep the shared columns plus this side's columns, then strip the one-letter
        # prefix so both sides end up with identical column names.
        kept = [
            col for col in season_results.columns if not col.startswith(opponent_prefix)
        ]
        return season_results.select(
            [F.col(col).alias(col[1:]) if col.startswith(own_prefix) else col for col in kept]
        )

    # Winner rows: drop the L-prefixed columns and un-prefix the W-prefixed ones.
    w_season_results = _team_perspective("W", "L")

    # Loser rows: drop the W-prefixed columns and un-prefix the L-prefixed ones.
    l_season_results = _team_perspective("L", "W")

    # Stack both halves. A plain union() would silently discard games whose stat lines
    # happen to be identical, which biases the averages; union_all_by_name keeps every
    # game and matches columns by name rather than by position.
    union_season_results = w_season_results.union_all_by_name(l_season_results)

    # Average all columns besides SEASON and TEAMID.
    avg_union_season_results = union_season_results.group_by("SEASON", "TEAMID").agg(
        *[
            F.avg(F.col(c)).alias(f"AVG_{c}")
            for c in union_season_results.columns
            if c not in ["SEASON", "TEAMID"]
        ]
    )

    avg_union_season_results.write.save_as_table(target_table, mode="overwrite")

    return f"Successfully created {target_table}."

We can test the function locally before we register it as a stored procedure.

In [4]:
# Run the function directly from the notebook (client-side) against the live session
# to check it before it is registered as a stored procedure.
prepare_season_averages(
    session, "RAW.MREGULARSEASONDETAILEDRESULTS", "FEATURES.MAVGSEASONDETAILEDRESULTS"
)

'Successfully created FEATURES.MAVGSEASONDETAILEDRESULTS.'

Register the stored procedure.

In [5]:
# Publish prepare_season_averages as the permanent procedure PREPARE_SEASON_AVERAGES.
# The two StringType inputs are the source and target table names; the Session
# parameter is not listed in input_types.
session.sproc.register(
    name="PREPARE_SEASON_AVERAGES",
    func=prepare_season_averages,
    input_types=[T.StringType(), T.StringType()],
    return_type=T.StringType(),
    packages=["snowflake-snowpark-python"],
    stage_location="@COMMON.PYTHON_CODE",
    is_permanent=True,
    replace=True,
    source_code_display=True,
)

The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment


<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1d801ddfd60>

Call the stored procedure to average the Men's regular season detailed stats. 

In [6]:
# Invoke the registered procedure by name; the remaining arguments are the source and
# target table names expected by prepare_season_averages.
session.call(
    "PREPARE_SEASON_AVERAGES",
    "RAW.MREGULARSEASONDETAILEDRESULTS",
    "FEATURES.MAVGSEASONDETAILEDRESULTS",
)

'Successfully created FEATURES.MAVGSEASONDETAILEDRESULTS.'

Define a function to one-hot encode the conference data. The minimum season in the primary data is 2003, so we'll filter to only look at those years.

In [7]:
def prepare_ohe_conferences(
    session: Session, source_table: str, target_table: str
) -> str:
    """One-hot encode team conferences for seasons 2003 onward and save the result.

    Args:
        session: Active Snowpark session.
        source_table: Table holding a SEASON column and a CONFABBREV column.
        target_table: Table to create or overwrite with the encoded data.

    Returns:
        A confirmation message naming ``target_table``.
    """
    # Restrict to 2003+ and rename CONFABBREV so the dummy columns are prefixed
    # with CONFERENCE_.
    recent_conferences = (
        session.table(source_table)
        .filter(F.col("SEASON") >= F.lit(2003))
        .with_column_renamed("CONFABBREV", "CONFERENCE")
    )

    # The encoding itself is done in pandas, so pull the rows down first.
    conferences_pd = recent_conferences.to_pandas()
    encoded_pd = pd.get_dummies(conferences_pd, columns=["CONFERENCE"])

    # Upper-case every column name, including the generated dummy columns.
    encoded_pd.columns = [name.upper() for name in encoded_pd.columns]

    # Push the encoded frame back to Snowflake, replacing any existing table.
    session.create_dataframe(encoded_pd).write.save_as_table(
        target_table, mode="overwrite"
    )

    return f"Successfully created {target_table}."

Register the stored procedure.

In [8]:
# Publish prepare_ohe_conferences as the permanent procedure PREPARE_OHE_CONFERENCES.
# pandas is declared alongside Snowpark because the encoding is done with
# pd.get_dummies inside the procedure.
session.sproc.register(
    name="PREPARE_OHE_CONFERENCES",
    func=prepare_ohe_conferences,
    input_types=[T.StringType(), T.StringType()],
    return_type=T.StringType(),
    packages=["pandas", "snowflake-snowpark-python"],
    stage_location="@COMMON.PYTHON_CODE",
    is_permanent=True,
    replace=True,
    source_code_display=True,
)

The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment


<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1d805bca0d0>

Call the stored procedure to build a table of one-hot encoded conference data.

In [9]:
# Invoke the registered procedure: read RAW.MTEAMCONFERENCES and write the one-hot
# encoded result to FEATURES.MTEAMCONFERENCESOHE.
session.call(
    "PREPARE_OHE_CONFERENCES", "RAW.MTEAMCONFERENCES", "FEATURES.MTEAMCONFERENCESOHE"
)

'Successfully created FEATURES.MTEAMCONFERENCESOHE.'

### Model Preparation

Bring in the existing tournament results, regular season averages, and conference data, and join them into a single table for model training.

In [10]:
def prepare_tourney_results(session: Session, target_table: str) -> str:
    """Build the model-training table from tournament results and team features.

    Every tournament game from 2003 onward is joined to both teams' regular season
    averages and one-hot encoded conferences. Each game then appears twice: once with
    the winner as team 1 (WIN_INDICATOR = 1) and once with the two teams' columns
    swapped so the loser is team 1 (WIN_INDICATOR = 0).

    Args:
        session: Active Snowpark session.
        target_table: Table to create or overwrite with the joined features.

    Returns:
        A confirmation message naming ``target_table``.
    """
    # Winner becomes team 1 and loser becomes team 2.
    tourney_results = (
        session.table("RAW.MNCAATOURNEYCOMPACTRESULTS")
        .filter(F.col("SEASON") >= F.lit(2003))
        .select("SEASON", "WTEAMID", "LTEAMID")
        .with_column_renamed("WTEAMID", "TEAMID_1")
        .with_column_renamed("LTEAMID", "TEAMID_2")
    )

    # Bring in the regular season averages as a Snowpark DataFrame.
    season_df = session.table("FEATURES.MAVGSEASONDETAILEDRESULTS")

    # Bring in the conference one-hot encoded data as a Snowpark DataFrame.
    conferences_df = session.table("FEATURES.MTEAMCONFERENCESOHE")

    # Regular season averages for team 1 (the winner); SEASON and TEAMID_1 stay as
    # join keys, every other column gets a _REGSEASON_1 suffix.
    wteamseasonavgs = season_df.select(
        [
            F.col(c).alias(f"{c}_REGSEASON_1") if c not in ["SEASON", "TEAMID"] else c
            for c in season_df.columns
        ]
    ).with_column_renamed("TEAMID", "TEAMID_1")

    # Regular season averages for team 2 (the loser), suffixed _REGSEASON_2.
    lteamseasonavgs = season_df.select(
        [
            F.col(c).alias(f"{c}_REGSEASON_2") if c not in ["SEASON", "TEAMID"] else c
            for c in season_df.columns
        ]
    ).with_column_renamed("TEAMID", "TEAMID_2")

    # Conference columns for team 1: everything except SEASON gets a _1 suffix.
    wconferences = conferences_df.select(
        [
            F.col(c).alias(f"{c}_1") if c != "SEASON" else c
            for c in conferences_df.columns
        ]
    )

    # Conference columns for team 2: everything except SEASON gets a _2 suffix.
    lconferences = conferences_df.select(
        [
            F.col(c).alias(f"{c}_2") if c != "SEASON" else c
            for c in conferences_df.columns
        ]
    )

    # Natural joins match on the shared column names (SEASON plus the relevant TEAMID);
    # left joins keep games even when a feature row is missing.
    tourney_teams = (
        tourney_results.natural_join(wteamseasonavgs, how="left")
        .natural_join(lteamseasonavgs, how="left")
        .natural_join(wconferences, how="left")
        .natural_join(lconferences, how="left")
    )

    # Create an indicator column for wins and losses.
    wins = tourney_teams.with_column("WIN_INDICATOR", F.lit(1))
    losses = tourney_teams.with_column("WIN_INDICATOR", F.lit(0))

    # Swap only the trailing _1/_2 suffix. str.replace would also rewrite any "_1" or
    # "_2" that appears earlier in a column name.
    swapped_suffix = {"_1": "_2", "_2": "_1"}
    col_relabels = {
        col: col[:-2] + swapped_suffix[col[-2:]]
        for col in losses.columns
        if col.endswith(("_1", "_2"))
    }

    # Relabel the columns in the losses DataFrame.
    losses = losses.select(
        [
            F.col(c).alias(col_relabels[c]) if c in col_relabels else c
            for c in losses.columns
        ]
    )

    # Columns in `losses` are now in a different order, so align them by name.
    union_df = wins.union_all_by_name(losses)

    # Save the Snowflake DataFrame as a table in Snowflake with the specified table name.
    union_df.write.save_as_table(target_table, mode="overwrite")

    # The signature and the stored procedure registration both promise a string.
    return f"Successfully created {target_table}."

Register the stored procedure.

In [11]:
# Publish prepare_tourney_results as the permanent procedure PREPARE_TOURNEY_RESULTS.
# Its single StringType input is the target table name.
session.sproc.register(
    name="PREPARE_TOURNEY_RESULTS",
    func=prepare_tourney_results,
    input_types=[T.StringType()],
    return_type=T.StringType(),
    packages=["snowflake-snowpark-python"],
    stage_location="@COMMON.PYTHON_CODE",
    is_permanent=True,
    replace=True,
    source_code_display=True,
)

The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment


<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1d877cbcd30>

Call the stored procedure to build a table for model training.

In [12]:
# Invoke the registered procedure; FEATURES.MFEATURESJOINED is the table that
# train_model reads later.
session.call("PREPARE_TOURNEY_RESULTS", "FEATURES.MFEATURESJOINED")

### Model Training

Bring in the existing joined features table, train the model on it, and store the trained model in a Snowflake stage.

In [13]:
def train_model(session: Session, model_stage: str, model_file_name: str) -> dict:
    """Train an XGBoost classifier on the joined features and upload it to a stage.

    Rows with SEASON before 2020 are used for training and rows with SEASON after 2020
    for evaluation; season 2020 itself falls in neither split. Every column except
    WIN_INDICATOR is used as a feature.

    Args:
        session: Active Snowpark session.
        model_stage: Stage location the serialized model is uploaded to.
        model_file_name: File name used for the serialized model under /tmp.

    Returns:
        A dict with a single "Accuracy" entry computed on the evaluation rows.
    """
    # Pull the whole feature table into pandas.
    features_pd = session.table("FEATURES.MFEATURESJOINED").to_pandas()

    # Season-based split.
    train_df = features_pd[features_pd["SEASON"] < 2020]
    test_df = features_pd[features_pd["SEASON"] > 2020]

    # Separate the label from the feature columns for each split.
    X_train, y_train = train_df.drop("WIN_INDICATOR", axis=1), train_df["WIN_INDICATOR"]
    X_test, y_test = test_df.drop("WIN_INDICATOR", axis=1), test_df["WIN_INDICATOR"]

    # Fit the classifier and score the held-out seasons.
    model = xgb.XGBClassifier(n_estimators=2000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # /tmp may not exist when this runs locally, so create it on demand.
    if not os.path.exists("/tmp"):
        os.mkdir("/tmp")

    # Serialize the model to /tmp, then PUT the file to the stage uncompressed,
    # replacing any previous upload of the same name.
    model_file = os.path.join("/tmp", model_file_name)
    joblib.dump(model, model_file)
    session.file.put(model_file, model_stage, auto_compress=False, overwrite=True)

    return {"Accuracy": accuracy_score(y_test, y_pred)}

We can test this locally and see how well it did.

In [14]:
# Run training from the notebook process: the model file is written to the local /tmp
# directory and then uploaded to the @COMMON.MODELS stage.
train_model(session, "@COMMON.MODELS", "xgb_model.pkl")

{'Accuracy': 0.6052631578947368}

We will need to bring down local copies of the data to run the classification report and run the model locally. 

In [15]:
# Pull the joined feature table down and rebuild the same season-based split that
# train_model uses, so the report is computed on the same evaluation rows.
union_df = session.table("FEATURES.MFEATURESJOINED").to_pandas()
train_df = union_df.loc[union_df["SEASON"] < 2020]
test_df = union_df.loc[union_df["SEASON"] > 2020]
X_train, y_train = train_df.drop(columns="WIN_INDICATOR"), train_df["WIN_INDICATOR"]
X_test, y_test = test_df.drop(columns="WIN_INDICATOR"), test_df["WIN_INDICATOR"]

# Reload the model that the local train_model run wrote to /tmp.
model = joblib.load("/tmp/xgb_model.pkl")

# Score the evaluation rows and print per-class precision, recall and F1.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.59      0.60       133
           1       0.60      0.62      0.61       133

    accuracy                           0.61       266
   macro avg       0.61      0.61      0.61       266
weighted avg       0.61      0.61      0.61       266



We can see the model is persisted in Snowflake as the function performs a `PUT` operation. 

In [16]:
# List the files on the model stage to confirm the upload from train_model.
session.sql("LS @COMMON.MODELS").show()

----------------------------------------------------------------------------------------------------
|"name"                |"size"   |"md5"                             |"last_modified"               |
----------------------------------------------------------------------------------------------------
|models/xgb_model.pkl  |2289888  |6d34bf466d225eaca5c829f254a843df  |Thu, 6 Apr 2023 16:52:29 GMT  |
----------------------------------------------------------------------------------------------------



Register the stored procedure.

In [17]:
# Publish train_model as the permanent procedure TRAIN_MODEL.
# train_model imports joblib, pandas (via to_pandas) and scikit-learn (accuracy_score)
# in addition to xgboost, so they are declared explicitly instead of relying on them
# being installed as transitive dependencies.
# Note: train_model returns a dict; with a StringType return the caller receives the
# dict's string form.
session.sproc.register(
    func=train_model,
    return_type=T.StringType(),
    input_types=[T.StringType(), T.StringType()],
    name="TRAIN_MODEL",
    is_permanent=True,
    stage_location="@COMMON.PYTHON_CODE",
    packages=[
        "joblib",
        "pandas",
        "scikit-learn",
        "snowflake-snowpark-python",
        "xgboost",
    ],
    replace=True,
    source_code_display=True,
)

The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment
The version of package xgboost in the local environment is 1.7.5, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1d8001d4f70>

Call the stored procedure to run in Snowflake.

In [18]:
# Invoke the registered procedure so training runs inside Snowflake; the arguments are
# the model stage and the model file name.
session.call("TRAIN_MODEL", "@COMMON.MODELS", "xgb_model.pkl")

"{'Accuracy': 0.6052631578947368}"

### Model Predictions

Now that our model is trained and persisted in Snowflake, let's see what it predicts for the 2023 tournament. We need a prediction for every possible pairing of teams that played in the 2023 season; with 363 teams that is 363 × 362 / 2 = 65,703 game probabilities. We build these pairings from the teams that appear in the 2023 regular season results.

First, we will write a helper function to load the existing model in Snowflake. 

In [19]:
@cachetools.cached(cache={})
def load_model(file_name):
    """Load a serialized model from the Snowflake import directory, cached per file name.

    Args:
        file_name: Name of the model file inside the import directory.

    Returns:
        The object deserialized by ``joblib.load``.

    Raises:
        RuntimeError: If the ``snowflake_import_directory`` option is not set, which is
            the case when this is called outside a Snowflake procedure or function.
    """
    import_dir = sys._xoptions.get("snowflake_import_directory")
    if import_dir is None:
        # Previously this surfaced as an opaque "NoneType + str" TypeError.
        raise RuntimeError(
            "snowflake_import_directory is not set; load_model must run inside "
            "Snowflake with the model file listed in the registration's imports."
        )

    # os.path.join is correct whether or not the directory ends with a separator.
    model_file_path = os.path.join(import_dir, file_name)

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code. Only load
    # model files that come from a trusted stage.
    return joblib.load(model_file_path)

In [20]:
def infer_model(session: Session, target_table: str) -> str:
    """Score every pairing of 2023 teams with the staged model and save the results.

    Builds each unordered pair of teams that appear in the 2023 regular season results
    (TEAMID_1 < TEAMID_2), attaches the same season-average and conference features
    used for training, and stores the model's class-1 probability as PREDICTION.

    Args:
        session: Active Snowpark session.
        target_table: Table to create or overwrite with SEASON, TEAMID_1, TEAMID_2 and
            PREDICTION.

    Returns:
        A confirmation message naming ``target_table``.
    """
    # Winner and loser IDs of every 2023 regular season game (Snowpark DataFrame).
    m_2023_teams = (
        session.table("RAW.MREGULARSEASONCOMPACTRESULTS")
        .filter(F.col("SEASON") == 2023)
        .select("WTEAMID", "LTEAMID")
    )

    # union() removes duplicates, leaving one row per team that played in 2023.
    m_w_teams = m_2023_teams.select(F.col("WTEAMID").alias("TEAMID"))
    m_l_teams = m_2023_teams.select(F.col("LTEAMID").alias("TEAMID"))
    m_2023_distinct_teams = m_w_teams.union(m_l_teams)

    # Pair every team with every other team; the < filter keeps each pair once and
    # drops self-pairings.
    m_2023_teams_cj = (
        m_2023_distinct_teams.cross_join(m_2023_distinct_teams, lsuffix="_1", rsuffix="_2")
        .filter(F.col("TEAMID_1") < F.col("TEAMID_2"))
        .select(F.lit(2023).alias("SEASON"), "TEAMID_1", "TEAMID_2")
    )

    # Bring in the regular season averages as a Snowpark DataFrame.
    season_df = session.table("FEATURES.MAVGSEASONDETAILEDRESULTS")

    # Bring in the conference one-hot encoded data as a Snowpark DataFrame.
    conferences_df = session.table("FEATURES.MTEAMCONFERENCESOHE")

    # Regular season averages for team 1, suffixed _REGSEASON_1.
    wteamseasonavgs = season_df.select(
        [
            F.col(c).alias(f"{c}_REGSEASON_1") if c not in ["SEASON", "TEAMID"] else c
            for c in season_df.columns
        ]
    ).with_column_renamed("TEAMID", "TEAMID_1")

    # Regular season averages for team 2, suffixed _REGSEASON_2.
    lteamseasonavgs = season_df.select(
        [
            F.col(c).alias(f"{c}_REGSEASON_2") if c not in ["SEASON", "TEAMID"] else c
            for c in season_df.columns
        ]
    ).with_column_renamed("TEAMID", "TEAMID_2")

    # Conference columns for team 1: everything except SEASON gets a _1 suffix.
    wconferences = conferences_df.select(
        [F.col(c).alias(f"{c}_1") if c != "SEASON" else c for c in conferences_df.columns]
    )

    # Conference columns for team 2: everything except SEASON gets a _2 suffix.
    lconferences = conferences_df.select(
        [F.col(c).alias(f"{c}_2") if c != "SEASON" else c for c in conferences_df.columns]
    )

    # Join all of our dataframes together.
    m_2023_combos = (
        m_2023_teams_cj.natural_join(wteamseasonavgs, how="left")
        .natural_join(lteamseasonavgs, how="left")
        .natural_join(wconferences, how="left")
        .natural_join(lconferences, how="left")
    )

    m_2023_combos_pd = m_2023_combos.to_pandas()

    # Load the staged model supplied through the registration's imports. The result
    # must be bound locally: previously it was discarded and the code fell back to
    # whatever global `model` happened to exist in the notebook.
    model = load_model("xgb_model.pkl")

    # Probability of class 1 for every pairing; computed before PREDICTION is added so
    # the model only sees feature columns.
    m_2023_combos_pd["PREDICTION"] = model.predict_proba(m_2023_combos_pd)[:, 1]

    # Create a new DataFrame with only the columns we need.
    submission_prep = m_2023_combos_pd.loc[:, ["SEASON", "TEAMID_1", "TEAMID_2", "PREDICTION"]]

    # Convert the DataFrame to a Snowpark DataFrame.
    submission_prep = session.create_dataframe(submission_prep)

    # Save the predictions to a Snowflake table.
    submission_prep.write.save_as_table(target_table, mode="overwrite")

    return f"Successfully created {target_table}."

In [21]:
# Publish infer_model as the permanent procedure INFER_MODEL.
# The staged model file is attached through `imports` so load_model can read it from
# the import directory. joblib (used by load_model) and pandas (used via to_pandas and
# create_dataframe) are declared explicitly instead of relying on them being installed
# as transitive dependencies.
session.sproc.register(
    func=infer_model,
    return_type=T.StringType(),
    input_types=[T.StringType()],
    name="INFER_MODEL",
    is_permanent=True,
    stage_location="@COMMON.PYTHON_CODE",
    imports=["@COMMON.MODELS/xgb_model.pkl"],
    packages=[
        "cachetools",
        "joblib",
        "pandas",
        "snowflake-snowpark-python",
        "xgboost",
    ],
    replace=True,
    source_code_display=True,
)

The version of package cachetools in the local environment is 5.3.0, which does not fit the criteria for the requirement cachetools. Your UDF might not work when the package version is different between the server and your local environment
The version of package snowflake-snowpark-python in the local environment is 1.3.0, which does not fit the criteria for the requirement snowflake-snowpark-python. Your UDF might not work when the package version is different between the server and your local environment
The version of package xgboost in the local environment is 1.7.5, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1d805bcc3a0>

In [22]:
# Invoke the registered procedure; predictions are written to FEATURES.M2023PREDICTIONS.
session.call("INFER_MODEL", "FEATURES.M2023PREDICTIONS")

'Successfully created FEATURES.M2023PREDICTIONS.'

Let's see how it did with Connecticut as the example.

In [23]:
# Resolve both team IDs to names and show the pairings that involve Connecticut.
# PREDICTION is the class-1 probability written by infer_model; the training labels set
# WIN_INDICATOR = 1 when team 1 won, so PREDICTION_PCT is the modeled chance (in
# percent) that TEAMNAME_1 beats TEAMNAME_2.
session.sql("""
SELECT P.SEASON, 
       M1.TEAMNAME AS TEAMNAME_1,
       M2.TEAMNAME AS TEAMNAME_2,
       ROUND(P.PREDICTION * 100, 4) AS PREDICTION_PCT
FROM MARCH_MADNESS.FEATURES.M2023PREDICTIONS AS P
INNER JOIN RAW.MTEAMS AS M1 ON P.TEAMID_1 = M1.TEAMID
INNER JOIN RAW.MTEAMS AS M2 ON P.TEAMID_2 = M2.TEAMID
WHERE TEAMNAME_1 = 'Connecticut' OR TEAMNAME_2 = 'Connecticut'
""").show()

-------------------------------------------------------------
|"SEASON"  |"TEAMNAME_1"  |"TEAMNAME_2"  |"PREDICTION_PCT"  |
-------------------------------------------------------------
|2023      |Abilene Chr   |Connecticut   |0.0079            |
|2023      |Alabama       |Connecticut   |4.0888            |
|2023      |Arizona       |Connecticut   |0.0777            |
|2023      |Arizona St    |Connecticut   |0.6766            |
|2023      |Arkansas      |Connecticut   |3.4728            |
|2023      |Belmont       |Connecticut   |6.146             |
|2023      |Boston Univ   |Connecticut   |0.0269            |
|2023      |Buffalo       |Connecticut   |0.0003            |
|2023      |Butler        |Connecticut   |4.9312            |
|2023      |BYU           |Connecticut   |0.0002            |
-------------------------------------------------------------

