In [None]:
import os
import zipfile

import pandas as pd
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import xgboost as xgb
from dotenv import load_dotenv
from snowflake.snowpark import Session

# Read variables from a local .env file into the process environment before
# the credentials are looked up below.
load_dotenv(override=True)

# Credentials come from SNOWFLAKE_* environment variables (a missing variable
# yields None); role, database, warehouse and schema are fixed for this project.
connection_params = {
    **{
        key: os.getenv(f"SNOWFLAKE_{key.upper()}")
        for key in ("account", "user", "password")
    },
    "role": "SYSADMIN",
    "database": "MARCH_MADNESS",
    "warehouse": "MARCH_MADNESS_WH",
    "schema": "COMMON",
}

# Open the Snowpark session that every later cell uses.
session = Session.builder.configs(connection_params).create()

Data Ingestion.

In [None]:
# Load every CSV inside the archive into a RAW.<FILE_NAME> table.
# If the archive is missing we assume the data has already been loaded and skip.
archive_path = "../data/march-machine-learning-mania-2023.zip"

if os.path.exists(archive_path):
    # The context manager closes the archive even if a load fails part-way.
    with zipfile.ZipFile(archive_path) as archive:
        for member in archive.filelist:
            # Only CSV members are ingested; anything else is skipped.
            if not member.filename.endswith(".csv"):
                continue

            # The CSV files are decoded as ISO-8859-1.
            with archive.open(member.filename) as csv_stream:
                df = pd.read_csv(csv_stream, encoding="iso-8859-1")

            # Table name = file name without folders or the .csv extension, upper-cased.
            base_name = member.filename.split("/")[-1]
            table_name = "RAW." + base_name.replace(".csv", "").upper()

            # Upper-case the column names before writing to Snowflake.
            df.columns = list(map(str.upper, df.columns))

            # Write the frame to Snowflake, replacing the table if it already exists.
            session.create_dataframe(df).write.save_as_table(
                table_name=table_name, mode="overwrite"
            )

Feature Engineering.

In [None]:
def prepare_season_averages(session: Session, source_table: str, target_table: str) -> str:
    """Average every stat column per (SEASON, TEAMID) and save the result.

    The source table holds one set of ``W``-prefixed columns and one set of
    ``L``-prefixed columns per row (presumably the winning and losing team's
    stats for a game). The rows are projected twice, once keeping the W
    columns and once keeping the L columns, with the one-letter prefix
    stripped so both projections share the same column names. The two
    projections are stacked and every column other than SEASON and TEAMID is
    averaged per (SEASON, TEAMID).

    Args:
        session: Active Snowpark session used to read and write tables.
        source_table: Name of the table to read.
        target_table: Name of the table to write; overwritten if it exists.

    Returns:
        A confirmation message naming ``target_table``.
    """
    key_columns = ["SEASON", "TEAMID"]

    # DAYNUM and WLOC aren't needed. Dropping WLOC here also keeps it from
    # being treated as a W-prefixed stat column below.
    season_results = session.table(source_table).drop("DAYNUM", "WLOC")

    def side_view(keep_prefix: str, drop_prefix: str):
        # Discard the other side's columns, then strip the one-letter prefix
        # from this side's columns (e.g. WTEAMID -> TEAMID).
        kept = [c for c in season_results.columns if not c.startswith(drop_prefix)]
        return season_results.select(
            [F.col(c).alias(c[1:]) if c.startswith(keep_prefix) else c for c in kept]
        )

    # Keep the W columns (drop anything starting with L).
    w_season_results = side_view("W", "L")
    # Keep the L columns (drop anything starting with W).
    l_season_results = side_view("L", "W")

    # union_all rather than union: union removes duplicate rows, so two rows
    # with identical values would collapse into one and skew the averages.
    # Both projections list their columns in the same order as long as the
    # source lists the W and L columns in matching order.
    union_season_results = w_season_results.union_all(l_season_results)

    # Average all columns besides SEASON and TEAMID.
    stat_columns = [c for c in union_season_results.columns if c not in key_columns]
    avg_union_season_results = union_season_results.group_by(*key_columns).agg(
        *[F.avg(F.col(c)).alias(c) for c in stat_columns]
    )

    avg_union_season_results.write.save_as_table(target_table, mode="overwrite")

    return f"Successfully created {target_table}."

In [None]:
# Run the function directly from the notebook session (no stored procedure involved).
prepare_season_averages(
    session,
    source_table="RAW.MREGULARSEASONDETAILEDRESULTS",
    target_table="FEATURES.MAVGSEASONDETAILEDRESULTS",
)

In [None]:
# Publish prepare_season_averages as the permanent stored procedure
# PREPARE_SEASON_AVERAGES, replacing any existing definition.
session.sproc.register(
    # What is being registered and under which name.
    func=prepare_season_averages,
    name="PREPARE_SEASON_AVERAGES",
    # Signature: two string arguments in, one string out.
    input_types=[T.StringType(), T.StringType()],
    return_type=T.StringType(),
    # Packages the procedure needs and the stage for its code.
    packages=["snowflake-snowpark-python"],
    stage_location="@COMMON.PYTHON_CODE",
    # Registration options.
    is_permanent=True,
    replace=True,
    source_code_display=True,
)

Average the Men's regular season detailed stats and the tournament detailed stats. 

In [None]:
# Build the regular-season averages through the registered stored procedure.
session.call(
    "PREPARE_SEASON_AVERAGES",
    "RAW.MREGULARSEASONDETAILEDRESULTS",
    "FEATURES.MAVGSEASONDETAILEDRESULTS",
)

In [None]:
# Build the tournament averages through the same stored procedure.
session.call(
    "PREPARE_SEASON_AVERAGES",
    "RAW.MNCAATOURNEYDETAILEDRESULTS",
    "FEATURES.MAVGTOURNEYDETAILEDRESULTS",
)

One-hot encode the conference data. The minimum season in the primary data is 2003, so we'll keep only seasons from 2003 onward.

In [None]:
def prepare_ohe_conferences(session: Session, source_table: str, target_table: str) -> str:
    """One-hot encode the conference column and save the result as a table.

    Reads ``source_table``, keeps rows with SEASON >= 2003, renames
    CONFABBREV to CONFERENCE, pulls the rows into pandas, expands CONFERENCE
    into indicator columns with ``pd.get_dummies``, and writes the result to
    ``target_table``.

    Args:
        session: Active Snowpark session used to read and write tables.
        source_table: Name of the table to read.
        target_table: Name of the table to write; overwritten if it exists.

    Returns:
        A confirmation message naming ``target_table``.
    """
    # Filter and rename in Snowflake, before any rows are pulled to the client.
    recent_conferences = (
        session.table(source_table)
        .filter(F.col("SEASON") >= F.lit(2003))
        .with_column_renamed("CONFABBREV", "CONFERENCE")
    )

    # The one-hot encoding itself happens in pandas.
    conferences_pdf = recent_conferences.to_pandas()
    encoded_pdf = pd.get_dummies(conferences_pdf, columns=["CONFERENCE"])

    # Convert back to a Snowpark DataFrame and persist it.
    session.create_dataframe(encoded_pdf).write.save_as_table(target_table, mode="overwrite")

    return f"Successfully created {target_table}."

In [None]:
# Publish prepare_ohe_conferences as the permanent stored procedure
# PREPARE_OHE_CONFERENCES, replacing any existing definition.
session.sproc.register(
    # What is being registered and under which name.
    func=prepare_ohe_conferences,
    name="PREPARE_OHE_CONFERENCES",
    # Signature: two string arguments in, one string out.
    input_types=[T.StringType(), T.StringType()],
    return_type=T.StringType(),
    # pandas is listed because the procedure calls pd.get_dummies.
    packages=["pandas", "snowflake-snowpark-python"],
    stage_location="@COMMON.PYTHON_CODE",
    # Registration options.
    is_permanent=True,
    replace=True,
    source_code_display=True,
)

In [None]:
# Build the one-hot encoded conference table through the registered stored procedure.
session.call(
    "PREPARE_OHE_CONFERENCES",
    "RAW.MTEAMCONFERENCES",
    "FEATURES.MTEAMCONFERENCESOHE",
)

Model Preparation

In [None]:
# NOTE(review): the table name is unqualified, so it presumably resolves against
# the session's default schema ("COMMON" in connection_params). No MTOURNAMENT
# table is created in the visible part of this notebook — confirm it exists.
session.table("MTOURNAMENT")