In [11]:
# Setup environment
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 13, Finished, Available, Finished)

In [12]:
# Helper for counting null values per column (a plain Python function, not a Spark UDF)
def count_nulls(df: DataFrame):
    """
    Return a one-row Spark DataFrame with the number of nulls per column.

    Args:
        df: Any Spark DataFrame.

    Returns:
        A DataFrame with the same column names as ``df``; its single row
        holds the null count of each column. Counts are 0 (not null) when
        ``df`` has no rows.
    """
    def _quoted(name):
        # Backtick-quote the name so dots/spaces in it are treated as part of
        # the column name instead of nested-field access.
        return "`" + name.replace("`", "``") + "`"

    null_counts = df.select([
        # when() without otherwise() yields null for non-matching rows and
        # count() skips nulls, so only null cells are tallied. Unlike sum(),
        # count() returns 0 rather than null for an empty input.
        F.count(F.when(F.col(_quoted(c)).isNull(), F.lit(1))).alias(c)
        for c in df.columns
    ])
    return null_counts
   

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 14, Finished, Available, Finished)

In [13]:
# Load the source Delta table into a Spark DataFrame
table_of_interest = "NHL_Lakehouse_Bronze.nhl_game_skater_stats"  # table to read
table_reader = spark.read
df = table_reader.table(table_of_interest)


StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 15, Finished, Available, Finished)

In [14]:
# Explore the raw data: sample rows, schema, then null counts per column
df.show()
df.printSchema()

null_count_summary = count_nulls(df)
null_count_summary.show(truncate=False)


StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 16, Finished, Available, Finished)

+----------+---------+-------+---------+-------+-----+-----+----+--------------+----------------+--------------+-----------+------------+---------+---------+----------------+------------------+-------+---------+-------------+--------------------+------------------+
|   game_id|player_id|team_id|timeonice|assists|goals|shots|hits|powerplaygoals|powerplayassists|penaltyminutes|faceoffwins|faceofftaken|takeaways|giveaways|shorthandedgoals|shorthandedassists|blocked|plusminus|eventimeonice|shorthandedtimeonice|powerplaytimeonice|
+----------+---------+-------+---------+-------+-----+-----+----+--------------+----------------+--------------+-----------+------------+---------+---------+----------------+------------------+-------+---------+-------------+--------------------+------------------+
|2016020045|  8468513|      4|      955|      1|    0|    0|   2|             0|               0|             0|          0|           0|        1|        1|               0|                 0|      1| 

In [15]:
from pyspark.sql.functions import col

# Show rows where a player recorded more than `min_exclusive` assists (then
# goals) in a game, next to the matching power-play / short-handed columns.
# F.col is used (instead of a bare `col`) so the cell does not depend on the
# name `col` staying bound to the Spark function in the notebook namespace.
min_exclusive = 3

for stat, pp_stat, sh_stat in (
    ("assists", "powerplayassists", "shorthandedassists"),
    ("goals", "powerplaygoals", "shorthandedgoals"),
):
    (
        df.filter(F.col(stat) > min_exclusive)
        .select("game_id", "team_id", stat, pp_stat, sh_stat)
        .show()
    )

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 17, Finished, Available, Finished)

+----------+-------+-------+----------------+------------------+
|   game_id|team_id|assists|powerplayassists|shorthandedassists|
+----------+-------+-------+----------------+------------------+
|2016020861|      9|      4|               1|                 0|
|2014020242|     29|      4|               3|                 0|
|2014020070|     25|      4|               2|                 0|
|2014020481|      4|      4|               0|                 0|
|2014020980|     12|      4|               1|                 0|
|2014020560|      5|      4|               1|                 0|
|2014020472|     26|      4|               3|                 0|
|2014020699|      5|      5|               2|                 0|
|2014020100|     29|      4|               1|                 0|
|2014020726|     16|      4|               0|                 0|
|2014020059|     17|      4|               0|                 0|
|2014020130|     14|      4|               1|                 0|
|2014020203|      5|     

In [16]:
# Check for duplicate rows
# Every count() is a separate Spark job, so each total is computed once and reused.
total_rows = df.count()
duplicate_rows_count = total_rows - df.distinct().count()
print(f"Number of rows: {total_rows}")
print(f"Number of duplicate rows: {duplicate_rows_count}")

# Drop fully identical rows (all columns equal)
df_clean = df.dropDuplicates()

# Re-run the same check on the deduplicated frame to confirm nothing is left
total_rows = df_clean.count()
duplicate_rows_count = total_rows - df_clean.distinct().count()
print(f"\nNumber of rows (deduplicated): {total_rows}")
print(f"Number of duplicate rows (deduplicated): {duplicate_rows_count}")

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 18, Finished, Available, Finished)

Number of rows: 945830
Number of duplicate rows: 92426

Number of rows (deduplicated): 853404
Number of duplicate rows (deduplicated): 0


In [17]:
# Print the column names of the deduplicated frame
clean_column_names = df_clean.columns
print(clean_column_names)

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 19, Finished, Available, Finished)

['game_id', 'player_id', 'team_id', 'timeonice', 'assists', 'goals', 'shots', 'hits', 'powerplaygoals', 'powerplayassists', 'penaltyminutes', 'faceoffwins', 'faceofftaken', 'takeaways', 'giveaways', 'shorthandedgoals', 'shorthandedassists', 'blocked', 'plusminus', 'eventimeonice', 'shorthandedtimeonice', 'powerplaytimeonice']


# NHL Skater Stats Columns
### Column meanings. Highlighted columns hold aggregated numbers and can be dropped if you prefer to compute them at the BI stage
**<mark>timeonice</mark>** : Total time the player was on the ice (all situations)  **Can be totaled from breakdown times**  
**eventimeonice** : Time on ice at even strength  
**powerplaytimeonice** : Time on ice during power plays  
**shorthandedtimeonice** : Time on ice while short-handed  
**shots** : Total shots on goal taken  
**goals** : Number of goals scored by the player  (Refers to Even Strength Goals, verified from total goals on game table)  
**powerplaygoals** : Goals scored on the power play  
**shorthandedgoals** : Goals scored while short-handed  
**assists** : Number of assists recorded by the player  (Refers to Even Strength Assists, based on Goal column data)  
**powerplayassists** : Assists recorded on power-play goals  
**shorthandedassists** : Assists recorded while short-handed  
**hits** : Number of body checks delivered  
**blocked** : Number of opponent shots blocked  
**takeaways** : Times the player took possession from an opponent  
**giveaways** : Times the player lost possession to the opponent  
**penaltyminutes** : Total minutes spent serving penalties  
**faceofftaken** : Total faceoffs the player participated in  
**faceoffwins** : Faceoffs won by the player  
**plusminus** : Goal differential while the player was on the ice at even strength

## Analysis notes
How Analysts Use Them Together  
**Hits + Takeaways** : forechecking effectiveness  
**Blocked + TOI** : defensive reliability  
**Takeaways : Giveaways** → puck management quality  


In [18]:
# Keep only the columns needed downstream, in the order they should appear
col_of_interest = [
    # identifiers
    "game_id", "player_id", "team_id",
    # faceoffs
    "faceoffwins", "faceofftaken",
    # physical play / discipline
    "hits", "penaltyminutes",
    # time on ice
    "timeonice", "powerplaytimeonice", "shorthandedtimeonice", "eventimeonice",
    # assists
    "powerplayassists", "shorthandedassists", "assists",
    # shooting / goals
    "shots", "powerplaygoals", "shorthandedgoals", "goals",
    # puck possession / defence
    "takeaways", "giveaways", "blocked",
    # on-ice goal differential
    "plusminus",
]
df_clean = df_clean.select(*col_of_interest)
df_clean.show()

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 20, Finished, Available, Finished)

+----------+---------+-------+-----------+------------+----+--------------+---------+------------------+--------------------+-------------+----------------+------------------+-------+-----+--------------+----------------+-----+---------+---------+-------+---------+
|   game_id|player_id|team_id|faceoffwins|faceofftaken|hits|penaltyminutes|timeonice|powerplaytimeonice|shorthandedtimeonice|eventimeonice|powerplayassists|shorthandedassists|assists|shots|powerplaygoals|shorthandedgoals|goals|takeaways|giveaways|blocked|plusminus|
+----------+---------+-------+-----------+------------+----+--------------+---------+------------------+--------------------+-------------+----------------+------------------+-------+-----+--------------+----------------+-----+---------+---------+-------+---------+
|2017020872|  8470595|     30|          8|          13|   1|             0|     1011|                87|                   8|          916|               0|                 0|      0|    2|             

In [21]:
# Per-row totals: add the power-play, short-handed and base columns together
total_assists_expr = (
    F.col("powerplayassists") + F.col("shorthandedassists") + F.col("assists")
)
total_goals_expr = (
    F.col("powerplaygoals") + F.col("shorthandedgoals") + F.col("goals")
)

df_with_totals = df_clean.withColumn("total_assists", total_assists_expr)
df_with_totals = df_with_totals.withColumn("total_goals", total_goals_expr)

# Column order for the output: each total sits directly before its components
ordered_columns = [
    "game_id", "player_id", "team_id",
    "faceoffwins", "faceofftaken",
    "hits", "penaltyminutes",
    "timeonice", "powerplaytimeonice", "shorthandedtimeonice", "eventimeonice",
    "total_assists", "powerplayassists", "shorthandedassists", "assists",
    "shots", "total_goals", "powerplaygoals", "shorthandedgoals", "goals",
    "takeaways", "giveaways", "blocked",
    "plusminus",
]
df_clean = df_with_totals.select(*ordered_columns)

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 23, Finished, Available, Finished)

In [22]:
# Preview the current state of df_clean (show() prints the first 20 rows by default)
df_clean.show()


StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 24, Finished, Available, Finished)

+----------+---------+-------+-----------+------------+----+--------------+---------+------------------+--------------------+-------------+-------------+----------------+------------------+-------+-----+-----------+--------------+----------------+-----+---------+---------+-------+---------+
|   game_id|player_id|team_id|faceoffwins|faceofftaken|hits|penaltyminutes|timeonice|powerplaytimeonice|shorthandedtimeonice|eventimeonice|total_assists|powerplayassists|shorthandedassists|assists|shots|total_goals|powerplaygoals|shorthandedgoals|goals|takeaways|giveaways|blocked|plusminus|
+----------+---------+-------+-----------+------------+----+--------------+---------+------------------+--------------------+-------------+-------------+----------------+------------------+-------+-----+-----------+--------------+----------------+-----+---------+---------+-------+---------+
|2017020872|  8470595|     30|          8|          13|   1|             0|     1011|                87|                   8

In [23]:
# Mapping from source column names to snake_case names
# (pp = power play, sh = short-handed, es = even strength)
rename_dict = {
    # faceoffs / penalties
    "faceoffwins": "faceoff_wins",
    "faceofftaken": "faceoff_taken",
    "penaltyminutes": "penalty_mins",
    # time on ice
    "timeonice": "total_time_on_ice",
    "powerplaytimeonice": "pp_time_on_ice",
    "shorthandedtimeonice": "sh_time_on_ice",
    "eventimeonice": "es_time_on_ice",
    # assists
    "powerplayassists": "pp_assists",
    "shorthandedassists": "sh_assists",
    "assists": "es_assists",  # Assumed to be even strength data due to goals column
    # goals
    "powerplaygoals": "pp_goals",
    "shorthandedgoals": "sh_goals",
    "goals": "es_goals",  # Verified to be even strength data
    # possession
    "takeaways": "take_aways",
    "giveaways": "give_aways",
}

# Rename every column in one pass; names without an entry in the mapping are kept as-is
renamed_columns = [rename_dict.get(name, name) for name in df_clean.columns]
df_clean = df_clean.toDF(*renamed_columns)

# Show the first rows of the renamed DataFrame
df_clean.show(5)

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 25, Finished, Available, Finished)

+----------+---------+-------+------------+-------------+----+------------+-----------------+--------------+--------------+--------------+-------------+----------+----------+----------+-----+-----------+--------+--------+--------+----------+----------+-------+---------+
|   game_id|player_id|team_id|faceoff_wins|faceoff_taken|hits|penalty_mins|total_time_on_ice|pp_time_on_ice|sh_time_on_ice|es_time_on_ice|total_assists|pp_assists|sh_assists|es_assists|shots|total_goals|pp_goals|sh_goals|es_goals|take_aways|give_aways|blocked|plusminus|
+----------+---------+-------+------------+-------------+----+------------+-----------------+--------------+--------------+--------------+-------------+----------+----------+----------+-----+-----------+--------+--------+--------+----------+----------+-------+---------+
|2017020872|  8470595|     30|           8|           13|   1|           0|             1011|            87|             8|           916|            0|         0|         0|         0|  

In [24]:
# Inspect the minimum and maximum value of a single column.
# The variable is deliberately not called `col`, so it cannot shadow
# pyspark.sql.functions.col in the notebook namespace.
column_to_check = "sh_assists"
df_clean.agg(
    F.min(column_to_check).alias("min"),
    F.max(column_to_check).alias("max")
).show()


StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 26, Finished, Available, Finished)

+---+---+
|min|max|
+---+---+
|  0|  2|
+---+---+



In [25]:
# Target Spark type for every column, listed in the order the casts are applied
column_types = {
    # identifiers are stored as strings
    "game_id": "string",
    "player_id": "string",
    "team_id": "string",
    # small per-game counters
    "faceoff_wins": "byte",
    "faceoff_taken": "byte",
    "hits": "byte",
    "penalty_mins": "byte",
    # time-on-ice columns
    "total_time_on_ice": "short",
    "pp_time_on_ice": "short",
    "sh_time_on_ice": "short",
    "es_time_on_ice": "short",
    # assists
    "total_assists": "byte",
    "pp_assists": "byte",
    "sh_assists": "byte",
    "es_assists": "byte",
    # shots and goals
    "shots": "byte",
    "total_goals": "byte",
    "pp_goals": "byte",
    "sh_goals": "byte",
    "es_goals": "byte",
    # possession / defence
    "take_aways": "byte",
    "give_aways": "byte",
    "blocked": "byte",
    # on-ice goal differential
    "plusminus": "byte",
}

# Cast data to expected data types, one column at a time
silver_player_stats = df_clean
for column_name, spark_type in column_types.items():
    silver_player_stats = silver_player_stats.withColumn(
        column_name, F.col(column_name).cast(spark_type)
    )


StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 27, Finished, Available, Finished)

In [26]:
# Preview rows of the silver frame (show() prints the first 20 rows by default)
silver_player_stats.show()
# Print the schema of the silver frame
silver_player_stats.printSchema() 

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 28, Finished, Available, Finished)

+----------+---------+-------+------------+-------------+----+------------+-----------------+--------------+--------------+--------------+-------------+----------+----------+----------+-----+-----------+--------+--------+--------+----------+----------+-------+---------+
|   game_id|player_id|team_id|faceoff_wins|faceoff_taken|hits|penalty_mins|total_time_on_ice|pp_time_on_ice|sh_time_on_ice|es_time_on_ice|total_assists|pp_assists|sh_assists|es_assists|shots|total_goals|pp_goals|sh_goals|es_goals|take_aways|give_aways|blocked|plusminus|
+----------+---------+-------+------------+-------------+----+------------+-----------------+--------------+--------------+--------------+-------------+----------+----------+----------+-----+-----------+--------+--------+--------+----------+----------+-------+---------+
|2017020872|  8470595|     30|           8|           13|   1|           0|             1011|            87|             8|           916|            0|         0|         0|         0|  

In [27]:
# Persist the cleaned frame as a Delta table, replacing existing data and schema
silver_table_name = "NHL_Lakehouse_Silver.silver_player_stats"

delta_writer = silver_player_stats.write.format("delta")
delta_writer = delta_writer.mode("overwrite")  # replace any existing rows
delta_writer = delta_writer.option("overwriteSchema", "true")  # allow the table schema to be replaced
delta_writer.saveAsTable(silver_table_name)

StatementMeta(, d42d45ea-48e9-4b96-a210-9d1f91540c7b, 29, Finished, Available, Finished)