In [16]:
# Setup environment
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 18, Finished, Available, Finished)

In [17]:
# Helper for counting the null values in each column of a DataFrame
def count_nulls(df: DataFrame):
    """
    Build a Spark DataFrame holding the null count of every column.

    Works for any input DataFrame. Each output column carries the same
    name as the input column it summarises, and its value is the number
    of rows in which that column is null.
    """
    per_column_totals = []
    for column_name in df.columns:
        # isNull() yields a boolean; as an int it is 1 for null, 0 otherwise,
        # so summing it counts the nulls.
        null_flag = F.col(column_name).isNull().cast("int")
        per_column_totals.append(F.sum(null_flag).alias(column_name))
    return df.select(per_column_totals)
   

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 19, Finished, Available, Finished)

In [18]:
# Load the source table into a DataFrame
table_of_interest = "NHL_Lakehouse_Bronze.nhl_game_goalie_stats"  # Define table of interest
df = spark.table(table_of_interest)


StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 20, Finished, Available, Finished)

In [19]:
# Data exploration: sample rows, schema, then null counts per column
df.show()
df.printSchema()
null_counts_per_column = count_nulls(df)
null_counts_per_column.show(truncate=False)


StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 21, Finished, Available, Finished)

+----------+---------+-------+---------+-------+-----+---+-----+-----+--------------+----------------+---------+-----------------------+----------------+---------------------+--------+----------------+-----------------------+--------------------------+
|   game_id|player_id|team_id|timeonice|assists|goals|pim|shots|saves|powerplaysaves|shorthandedsaves|evensaves|shorthandedshotsagainst|evenshotsagainst|powerplayshotsagainst|decision|  savepercentage|powerplaysavepercentage|evenstrengthsavepercentage|
+----------+---------+-------+---------+-------+-----+---+-----+-----+--------------+----------------+---------+-----------------------+----------------+---------------------+--------+----------------+-----------------------+--------------------------+
|2016020045|  8473607|      4|     1504|      0|    0|  0|   16|   12|             1|               0|       11|                      0|              13|                    3|    NULL|              75|       33.3333333333333|          84.615

# <u>Duplicate verification</u>
#### Can run this to find the duplicated rows

duplicate_rows = df.groupBy(df.columns)\
                   .agg(F.count("*").alias("count"))\
                   .filter(F.col("count") > 1)

duplicate_rows.show(truncate=False)

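#### After showing the duplicated rows, you can verify a duplication with the code below (this table has no `play_id` column, so filter on the `game_id` and `player_id` of a duplicated row)
df[(df['game_id']=="<game_id>") & (df['player_id']=="<player_id>")].show()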

In [20]:
# Check for duplicate rows
# Every count() is a separate Spark action, so each count is computed once
# and reused instead of being re-run inside the subtraction.
total_rows = df.count()
duplicate_rows_count = total_rows - df.distinct().count()
print(f"Number of rows: {total_rows}")
print(f"Number of duplicate rows: {duplicate_rows_count}")

# dropDuplicates() with no subset removes rows that match on every column
df_clean = df.dropDuplicates()

# Re-run the same check on the deduplicated frame; it should report 0 duplicates
total_rows = df_clean.count()
duplicate_rows_count = total_rows - df_clean.distinct().count()
print(f"\nNumber of rows (deduplicated): {total_rows}")
print(f"Number of duplicate rows (deduplicated): {duplicate_rows_count}")

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 22, Finished, Available, Finished)

Number of rows: 56656
Number of duplicate rows: 5493

Number of rows (deduplicated): 51163
Number of duplicate rows (deduplicated): 0


In [21]:
# List all columns
remaining_columns = df_clean.columns
print(remaining_columns)

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 23, Finished, Available, Finished)

['game_id', 'player_id', 'team_id', 'timeonice', 'assists', 'goals', 'pim', 'shots', 'saves', 'powerplaysaves', 'shorthandedsaves', 'evensaves', 'shorthandedshotsagainst', 'evenshotsagainst', 'powerplayshotsagainst', 'decision', 'savepercentage', 'powerplaysavepercentage', 'evenstrengthsavepercentage']


In [22]:
# Preview the deduplicated DataFrame (show() prints the first 20 rows by default)
df_clean.show()

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 24, Finished, Available, Finished)

+----------+---------+-------+---------+-------+-----+---+-----+-----+--------------+----------------+---------+-----------------------+----------------+---------------------+--------+----------------+-----------------------+--------------------------+
|   game_id|player_id|team_id|timeonice|assists|goals|pim|shots|saves|powerplaysaves|shorthandedsaves|evensaves|shorthandedshotsagainst|evenshotsagainst|powerplayshotsagainst|decision|  savepercentage|powerplaysavepercentage|evenstrengthsavepercentage|
+----------+---------+-------+---------+-------+-----+---+-----+-----+--------------+----------------+---------+-----------------------+----------------+---------------------+--------+----------------+-----------------------+--------------------------+
|2016020119|  8470140|     25|     3600|      0|    0|  0|   15|   11|             1|               0|       10|                      1|              12|                    2|       L|73.3333333333333|                     50|          83.333

# NHL Goalie Stats Columns
### Column Meaning / Explanation / Highlighted columns hold aggregated numbers and can be dropped if you would rather work them out at the BI stage
**timeonice** : Total time the goalie was on ice. Some datasets record this in minutes; here the values are in seconds (e.g. 3600 = a full 60-minute game).  
**assists** : Number of assists the goalie recorded (rare, but sometimes they pass the puck leading to a goal).  
**goals** : Number of goals the goalie scored (extremely rare). Usually 0.  
**pim** : Penalty minutes incurred by the goalie. Usually low, but can happen.  
**<mark>shots</mark>** :	Number of shots the goalie faced during the game. <u>_**Total from all types of shots**_</u>  
**<mark>saves</mark>** :	Number of shots the goalie successfully stopped. <u>_**Total from all types of saves**_</u>  
**powerplaysaves** : Saves the goalie made while their team was shorthanded (opponent on a power play).  
**shorthandedsaves** : Saves the goalie made while their team had a man advantage (power play) — rare. Sometimes same as powerplaysaves.  
**evensaves** : Saves made during even-strength play (5-on-5).  
**shorthandedshotsagainst** : Shots against the goalie taken by a shorthanded opponent (goalie's team on the power play).  
**evenshotsagainst** :	Shots against during even-strength play.  
**powerplayshotsagainst** :	Shots against while the goalie’s team was shorthanded (opponent had a power play).  
**<mark>decision</mark>** : Outcome of the game for the goalie: "W" = win, "L" = loss, "OT" = overtime loss, "SO" = shootout loss, etc.  
**<mark>savepercentage</mark>** : Overall save percentage = saves / shots against × 100 (stored on a 0–100 scale, e.g. 12 saves on 16 shots = 75). Measures effectiveness. <u>_**Total Saves / Total Shots**_</u>  
**<mark>powerplaysavepercentage</mark>** : Save percentage while shorthanded (facing opponent power plays). <u>_**Total Powerplay Saves / Total Powerplay Shots**_</u>  
**<mark>evenstrengthsavepercentage</mark>** : Save percentage during even-strength play (5-on-5). <u>_**Total Evenstrength Saves / Total Evenstrength Shots**_</u> 

### Missing Stats
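**<mark>shorthandedsavepercentage</mark>** : Save percentage on shots taken by a shorthanded opponent (goalie's team on the power play). Not provided in the source table, so it is computed below as `sh_save_pct`. <u>_**Total Shorthanded Saves / Total Shorthanded Shots**_</u> 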


In [23]:
# Keep only the columns of interest ('decision' is not selected)
col_of_interest = [
    # identifiers
    'game_id', 'player_id', 'team_id',
    # ice time and skater-style stats
    'timeonice', 'assists', 'goals', 'pim',
    # saves: total, then by game situation
    'saves', 'powerplaysaves', 'shorthandedsaves', 'evensaves',
    # shots against: total, then by game situation
    'shots', 'powerplayshotsagainst', 'shorthandedshotsagainst', 'evenshotsagainst',
    # save percentages supplied by the source table
    'savepercentage', 'powerplaysavepercentage', 'evenstrengthsavepercentage',
]  # Define columns of interest
df_clean = df_clean.select(*col_of_interest)
df_clean.show()

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 25, Finished, Available, Finished)

+----------+---------+-------+---------+-------+-----+---+-----+--------------+----------------+---------+-----+---------------------+-----------------------+----------------+----------------+-----------------------+--------------------------+
|   game_id|player_id|team_id|timeonice|assists|goals|pim|saves|powerplaysaves|shorthandedsaves|evensaves|shots|powerplayshotsagainst|shorthandedshotsagainst|evenshotsagainst|  savepercentage|powerplaysavepercentage|evenstrengthsavepercentage|
+----------+---------+-------+---------+-------+-----+---+-----+--------------+----------------+---------+-----+---------------------+-----------------------+----------------+----------------+-----------------------+--------------------------+
|2016020119|  8470140|     25|     3600|      0|    0|  0|   11|             1|               0|       10|   15|                    2|                      1|              12|73.3333333333333|                     50|          83.3333333333333|
|2017021170|  8475831|  

In [24]:
# Missing pct column
# Shorthanded save percentage = shorthanded saves / shorthanded shots against * 100.
# Rows with 0 shorthanded shots against occur in this table. The explicit guard
# returns NULL for them instead of relying on Spark's non-ANSI divide-by-zero
# behaviour (with ANSI mode enabled an unguarded division raises an error).
sh_saves_col = F.col("shorthandedsaves").cast("double")
sh_shots_col = F.col("shorthandedshotsagainst").cast("double")

df_clean = df_clean.withColumn(
    "sh_save_pct",
    # when() without otherwise() yields NULL when the condition is false or NULL
    F.when(sh_shots_col != 0, sh_saves_col / sh_shots_col * 100),
)

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 26, Finished, Available, Finished)

In [30]:
# Rename columns
# Maps each current column name to its new name
rename_dict = {
    "timeonice": "time_on_ice",
    "saves": "total_saves",
    "powerplaysaves": "pp_saves",
    "shorthandedsaves": "sh_saves",
    "evensaves": "es_saves",
    "shots": "total_shots_against",
    "powerplayshotsagainst": "pp_shots_against",
    "shorthandedshotsagainst": "sh_shots_against",
    "evenshotsagainst": "es_shots_against",
    "savepercentage": "total_save_pct",
    "powerplaysavepercentage": "pp_save_pct",
    "evenstrengthsavepercentage": "es_save_pct",
}

# Apply every rename in one projection; columns without an entry in the
# dictionary keep their current name and all columns keep their position
df_clean = df_clean.select(
    [F.col(name).alias(rename_dict.get(name, name)) for name in df_clean.columns]
)

# Show the updated DataFrame
df_clean.show(5)

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 32, Finished, Available, Finished)

+----------+---------+-------+-----------+-------+-----+---+-----------+--------+--------+--------+-------------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------+
|   game_id|player_id|team_id|time_on_ice|assists|goals|pim|total_saves|pp_saves|sh_saves|es_saves|total_shots_against|pp_shots_against|sh_shots_against|es_shots_against|  total_save_pct|     pp_save_pct|     es_save_pct|sh_save_pct|
+----------+---------+-------+-----------+-------+-----+---+-----------+--------+--------+--------+-------------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------+
|2016020119|  8470140|     25|       3600|      0|    0|  0|         11|       1|       0|      10|                 15|               2|               1|              12|73.3333333333333|              50|83.3333333333333|        0.0|
|2017021170|  8475831|     15|       3574|      0|    0|  0|    

In [32]:
# Target Spark type for each column; casts are applied in this order.
# NOTE(review): "byte" holds -128..127 and "short" holds -32768..32767 --
# confirm no per-game count or ice time can exceed these ranges.
silver_column_types = {
    "game_id": "string",
    "player_id": "string",
    "team_id": "string",
    "time_on_ice": "short",
    "assists": "byte",
    "goals": "byte",
    "pim": "byte",
    "total_saves": "byte",
    "pp_saves": "byte",
    "sh_saves": "byte",
    "es_saves": "byte",
    "total_shots_against": "byte",
    "pp_shots_against": "byte",
    "sh_shots_against": "byte",
    "es_shots_against": "byte",
    "total_save_pct": "double",
    "pp_save_pct": "double",
    "es_save_pct": "double",
    "sh_save_pct": "double",
}

# Replace each column with a cast copy of itself, one column at a time
silver_goalie_stats = df_clean
for column_name, spark_type in silver_column_types.items():
    silver_goalie_stats = silver_goalie_stats.withColumn(
        column_name, F.col(column_name).cast(spark_type)
    )

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 34, Finished, Available, Finished)

In [33]:
# Inspect the cast DataFrame: sample rows first, then the resulting schema
silver_goalie_stats.show()
silver_goalie_stats.printSchema() 

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 35, Finished, Available, Finished)

+----------+---------+-------+-----------+-------+-----+---+-----------+--------+--------+--------+-------------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------+
|   game_id|player_id|team_id|time_on_ice|assists|goals|pim|total_saves|pp_saves|sh_saves|es_saves|total_shots_against|pp_shots_against|sh_shots_against|es_shots_against|  total_save_pct|     pp_save_pct|     es_save_pct|sh_save_pct|
+----------+---------+-------+-----------+-------+-----+---+-----------+--------+--------+--------+-------------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------+
|2016020119|  8470140|     25|       3600|      0|    0|  0|         11|       1|       0|      10|                 15|               2|               1|              12|73.3333333333333|            50.0|83.3333333333333|        0.0|
|2017021170|  8475831|     15|       3574|      0|    0|  0|    

In [34]:
# Write cleaned data into lakehouse
# Overwrite the Delta table, letting the new schema replace the stored one
silver_writer = silver_goalie_stats.write.format("delta").mode("overwrite")
silver_writer = silver_writer.option("overwriteSchema", "true")
silver_writer.saveAsTable('NHL_Lakehouse_Silver.silver_goalie_stats')

StatementMeta(, 2810a481-7d33-49e2-a091-385ef188440a, 36, Finished, Available, Finished)