In [1]:
# Setup environment
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 3, Finished, Available, Finished)

In [2]:
# Helper function for counting null values in each column
def count_nulls(df: DataFrame):
    """
    Count the null entries in every column of ``df``.

    One aggregate expression is built per column: the column's null test is
    cast to an integer and summed, and the result is aliased back to the
    original column name.

    Args:
        df: Any Spark DataFrame.

    Returns:
        A Spark DataFrame with the same column names as ``df``, where each
        column holds the summed null flags of the corresponding input column.
    """
    per_column_totals = []
    for name in df.columns:
        # isNull() is a boolean; cast to int so the nulls can be summed.
        null_flag = F.col(name).isNull().cast("int")
        per_column_totals.append(F.sum(null_flag).alias(name))
    return df.select(per_column_totals)
   

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 4, Finished, Available, Finished)

In [3]:
# Load the source Delta table into a Spark DataFrame.
# Change table_of_interest to point this notebook at a different table.
table_of_interest = "NHL_Lakehouse_Bronze.nhl_game_plays_players"
df = spark.table(table_of_interest)


StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 5, Finished, Available, Finished)

In [4]:
# Data exploration: sample rows, schema, then null counts per column
df.show(n=20, truncate=True)  # same as the no-argument default
df.printSchema()
count_nulls(df).show(truncate=False)


StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 6, Finished, Available, Finished)

+-------------+----------+---------+----------+
|      play_id|   game_id|player_id|playertype|
+-------------+----------+---------+----------+
| 2016020045_4|2016020045|  8473604|    Winner|
| 2016020045_4|2016020045|  8473512|     Loser|
| 2016020045_5|2016020045|  8473573|   Shooter|
| 2016020045_5|2016020045|  8473607|    Goalie|
| 2016020045_6|2016020045|  8474141|    Scorer|
| 2016020045_6|2016020045|  8473573|    Assist|
| 2016020045_6|2016020045|  8470607|    Assist|
| 2016020045_6|2016020045|  8473607|    Goalie|
| 2016020045_7|2016020045|  8477930|    Winner|
| 2016020045_7|2016020045|  8477951|     Loser|
| 2016020045_8|2016020045|  8474668|   Shooter|
| 2016020045_8|2016020045|  8470645|    Goalie|
| 2016020045_9|2016020045|  8477951|  PlayerID|
|2016020045_10|2016020045|  8476994|    Hitter|
|2016020045_10|2016020045|  8475430|    Hittee|
|2016020045_11|2016020045|  8479482|    Hitter|
|2016020045_11|2016020045|  8479648|    Hittee|
|2016020045_12|2016020045|  8470281|   S

# <u>Duplicate verification</u>
#### Run the following to find the duplicated rows

```python
duplicate_rows = df.groupBy(df.columns)\
                   .agg(F.count("*").alias("count"))\
                   .filter(F.col("count") > 1)

duplicate_rows.show(truncate=False)
```

#### After showing the duplicated rows, verify the duplication with the code below
```python
df[df['play_id']=="2019020002_248"].show()
```

In [5]:
# Check for duplicate rows.
# Every count() is a Spark action that scans the data, so each count is
# computed once and reused instead of being re-evaluated in the subtraction.
total_rows = df.count()
duplicate_rows_count = total_rows - df.distinct().count()
print(f"Number of rows: {total_rows}")
print(f"Number of duplicate rows: {duplicate_rows_count}")

# Drop rows that are identical across all columns.
df_clean = df.dropDuplicates()

# Repeat the same check on the deduplicated DataFrame to confirm that no
# duplicate rows remain.
total_rows = df_clean.count()
duplicate_rows_count = total_rows - df_clean.distinct().count()
print(f"\nNumber of rows (deduplicated): {total_rows}")
print(f"Number of duplicate rows (deduplicated): {duplicate_rows_count}")

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 7, Finished, Available, Finished)

Number of rows: 7586604
Number of duplicate rows: 1223800

Number of rows (deduplicated): 6362804
Number of duplicate rows (deduplicated): 0


In [6]:
# List the column names of the deduplicated DataFrame
# (shown as the cell output because it is the last expression)
df_clean.columns

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 8, Finished, Available, Finished)

['play_id', 'game_id', 'player_id', 'playertype']

In [7]:
# Keep only the columns of interest; edit the list to change the projection.
col_of_interest = ['play_id', 'player_id', 'playertype']
df_clean = df_clean.select(*col_of_interest)
df_clean.show()

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 9, Finished, Available, Finished)

+--------------+---------+----------+
|       play_id|player_id|playertype|
+--------------+---------+----------+
|  2016020045_9|  8477951|  PlayerID|
| 2016020045_49|  8477290|    Hitter|
|2016020045_155|  8473512|     Loser|
|2017020812_222|  8475164|    Hitter|
|2017020812_231|  8475185|   Shooter|
|2017020812_307|  8475158|    Winner|
| 2015020314_76|  8473575|    Goalie|
|2015020314_147|  8471262|   Blocker|
|2015020314_164|  8474574|    Hittee|
|2015020849_255|  8476460|     Loser|
| 2017020586_39|  8469608|    Goalie|
| 2016020610_48|  8476880|    Hittee|
| 2016020610_52|  8471338|    Winner|
| 2016020610_58|  8474651|    Goalie|
|2015020606_184|  8471734|    Goalie|
|2017020240_194|  8475780|     Loser|
| 2017020624_29|  8473541|    Goalie|
| 2017020624_69|  8477492|    Winner|
| 2015020346_36|  8477934|    Winner|
|2015020346_123|  8464989|    DrewBy|
+--------------+---------+----------+
only showing top 20 rows



In [8]:
# Column renames, expressed as {current name: new name}
rename_dict = {"play_id": "event_id"}

# Apply each rename in turn
for old_name in rename_dict:
    new_name = rename_dict[old_name]
    df_clean = df_clean.withColumnRenamed(old_name, new_name)

# Preview the first rows with the new column names
df_clean.show(5)

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 10, Finished, Available, Finished)

+--------------+---------+----------+
|      event_id|player_id|playertype|
+--------------+---------+----------+
|  2016020045_9|  8477951|  PlayerID|
| 2016020045_49|  8477290|    Hitter|
|2016020045_155|  8473512|     Loser|
|2017020812_222|  8475164|    Hitter|
|2017020812_231|  8475185|   Shooter|
+--------------+---------+----------+
only showing top 5 rows



In [9]:
# View the distinct values of one column (change column_name to inspect another)
column_name = 'playertype'
distinct_values = df_clean.select(column_name).distinct()

# Print the count explicitly: a bare expression that is not the last line of
# a cell is evaluated but never displayed.
print(f"Distinct values in '{column_name}': {distinct_values.count()}")
distinct_values.orderBy(column_name).show()

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 11, Finished, Available, Finished)

+----------+
|playertype|
+----------+
|    Assist|
|   Blocker|
|    DrewBy|
|    Goalie|
|    Hittee|
|    Hitter|
|     Loser|
| PenaltyOn|
|  PlayerID|
|    Scorer|
|  ServedBy|
|   Shooter|
|   Unknown|
|    Winner|
+----------+



In [10]:
# Count the rows whose playertype is the literal value 'PlayerID'
# NOTE(review): 'PlayerID' looks like a placeholder rather than a real role — confirm
df_clean.filter(df_clean['playertype'] == 'PlayerID').count()

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 12, Finished, Available, Finished)

393113

In [11]:
# Define the schema and data types: every listed column is cast to string
string_columns = ("event_id", "player_id", "playertype")

silver_event_players = df_clean
for _name in string_columns:
    # withColumn with an existing name replaces that column in place
    silver_event_players = silver_event_players.withColumn(
        _name, F.col(_name).cast("string")
    )


StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 13, Finished, Available, Finished)

In [13]:
# Write cleaned data into the silver lakehouse as a Delta table.
# mode="overwrite" replaces the existing table contents, and
# overwriteSchema="true" allows the stored schema to be replaced as well.
silver_event_players.write.saveAsTable(
    'NHL_Lakehouse_Silver.silver_event_players',
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

StatementMeta(, 0a271b91-ef5d-437d-a917-52277deb6ac0, 15, Finished, Available, Finished)