In [None]:
import os
import zipfile

import pandas as pd
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import xgboost as xgb
from dotenv import load_dotenv
from snowflake.snowpark import Session

# Pull settings from a local .env file into the process environment.
# override=True makes values from .env replace variables that are already set.
load_dotenv(override=True)

# Fail fast with a readable message. Reading these with os.environ.get() would
# silently pass None on to the session builder, and the mistake would only show
# up later as a connection failure that does not name the missing variable.
required_env_vars = ("SNOWFLAKE_ACCOUNT", "SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD")
missing_env_vars = [name for name in required_env_vars if not os.environ.get(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in the environment or in a .env file."
    )

# Credentials come from the environment; the remaining values are fixed for
# this project.
connection_params = {
    "account": os.environ["SNOWFLAKE_ACCOUNT"],
    "user": os.environ["SNOWFLAKE_USER"],
    "password": os.environ["SNOWFLAKE_PASSWORD"],
    "role": "SYSADMIN",
    "database": "MARCH_MADNESS",
    "warehouse": "MARCH_MADNESS_WH",
    "schema": "COMMON",
}

# Snowpark session shared by every cell below.
session = Session.builder.configs(connection_params).create()

Data Ingestion.

In [None]:
# Load every CSV found in the competition archive into its own RAW.<NAME> table.
# The context manager closes the archive once the loop is done.
with zipfile.ZipFile("../data/march-machine-learning-mania-2023.zip") as archive:
    for member in archive.infolist():
        # Anything that is not a CSV is skipped.
        if not member.filename.endswith(".csv"):
            continue

        # Parse the archive member directly, decoding it as ISO-8859-1.
        with archive.open(member.filename) as csv_stream:
            df = pd.read_csv(csv_stream, encoding="iso-8859-1")

        # Table name = file name without its directory part and ".csv", upper-cased.
        base_name = member.filename.rsplit("/", 1)[-1]
        table_stem = base_name.replace(".csv", "").upper()
        table_name = f"RAW.{table_stem}"

        # Upper-case the column labels before the frame is uploaded.
        df = df.rename(columns=str.upper)

        # Write the frame to Snowflake, replacing the table if it already exists.
        snowpark_frame = session.create_dataframe(df)
        snowpark_frame.write.save_as_table(table_name=table_name, mode="overwrite")

Feature Engineering.

In [None]:
def prepare_season_averages(session: Session, source_table: str, target_table: str) -> str:
    """Average each team's per-game stats by season and save the result as a table.

    Every row of ``source_table`` describes one game, with the winner's values
    in ``W``-prefixed columns and the loser's in ``L``-prefixed columns. The
    row is split into one record per team (prefix stripped, so ``WTEAMID`` and
    ``LTEAMID`` both become ``TEAMID``), the two sets of records are stacked,
    and every remaining column is averaged per ``(SEASON, TEAMID)``.

    Args:
        session: Snowpark session used to read the source and write the target.
        source_table: Name of the game-level results table to read.
        target_table: Name of the table to create; overwritten if it exists.

    Returns:
        A confirmation message that names ``target_table``.
    """
    # DAYNUM and WLOC aren't needed. WLOC also starts with "W", so it has to be
    # removed before the prefix-based split below or it would be carried along
    # as if it were a winner stat.
    games = session.table(source_table).drop("DAYNUM", "WLOC")

    def _team_view(keep_prefix: str, drop_prefix: str):
        """Project one side of each game: drop the other side's columns and
        strip the one-letter prefix from this side's columns."""
        return games.select(
            [
                F.col(name).alias(name[1:]) if name.startswith(keep_prefix) else F.col(name)
                for name in games.columns
                if not name.startswith(drop_prefix)
            ]
        )

    winners = _team_view(keep_prefix="W", drop_prefix="L")
    losers = _team_view(keep_prefix="L", drop_prefix="W")

    # union_all, not union: union removes duplicate rows, so two games in which
    # a team posted identical numbers would be counted once and skew the mean.
    team_games = winners.union_all(losers)

    # Average all columns besides SEASON and TEAMID.
    key_columns = ("SEASON", "TEAMID")
    stat_columns = [name for name in team_games.columns if name not in key_columns]
    season_averages = team_games.group_by(*key_columns).agg(
        *[F.avg(F.col(name)).alias(name) for name in stat_columns]
    )

    season_averages.write.save_as_table(target_table, mode="overwrite")

    return f"Successfully created {target_table}."

In [None]:
# Call the Python function directly from the notebook for the regular-season table.
prepare_season_averages(
    session=session,
    source_table="RAW.MREGULARSEASONDETAILEDRESULTS",
    target_table="FEATURES.MAVGSEASONDETAILEDRESULTS",
)

In [None]:
# Register prepare_season_averages as the permanent stored procedure
# PREPARE_SEASON_AVERAGES, replacing any definition that already exists.
session.sproc.register(
    func=prepare_season_averages,
    name="PREPARE_SEASON_AVERAGES",
    # input_types covers only the two table-name arguments; the function's
    # leading session parameter is not listed here.
    input_types=[T.StringType(), T.StringType()],
    return_type=T.StringType(),
    packages=["snowflake-snowpark-python"],
    # A permanent procedure needs a stage to hold its uploaded code.
    is_permanent=True,
    stage_location="@COMMON.PYTHON_CODE",
    replace=True,
    source_code_display=True,
)

Average the men's regular-season detailed stats and the men's tournament detailed stats per team and season.

In [None]:
# Run the registered stored procedure on the regular-season results.
session.call(
    "PREPARE_SEASON_AVERAGES",
    "RAW.MREGULARSEASONDETAILEDRESULTS",
    "FEATURES.MAVGSEASONDETAILEDRESULTS",
)

In [None]:
# Run the registered stored procedure on the tournament results.
session.call(
    "PREPARE_SEASON_AVERAGES",
    "RAW.MNCAATOURNEYDETAILEDRESULTS",
    "FEATURES.MAVGTOURNEYDETAILEDRESULTS",
)