In [1]:
# =========================================================
# Silver Layer – Dimension Build (Players)
# Purpose:
# - Clean and standardize Bronze NHL data
# - Produce Silver DataFrames (no persistence here)
# =========================================================

from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap
import re


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 3, Finished, Available, Finished)

In [2]:
# ------------------------------------------------------------
# Bronze source: player master data
# Read the player table from the Bronze lakehouse as-is; no
# filtering or reshaping happens in this cell.
# ------------------------------------------------------------

bronze_player_df = spark.table("nhl_lakehouse_bronze.nhl_player_info")




StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 4, Finished, Available, Finished)

In [3]:
# Data exploration
# ---------------------------------------------
# Inspect the Bronze player schema (column names, types,
# nullability) before any transformation is applied.
# ---------------------------------------------

schema_banner = "=== bronze_player_df schema ==="
print(schema_banner)
bronze_player_df.printSchema()


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 5, Finished, Available, Finished)

=== bronze_player_df schema ===
root
 |-- player_id: integer (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- birthcity: string (nullable = true)
 |-- primaryposition: string (nullable = true)
 |-- birthdate: timestamp (nullable = true)
 |-- birthstateprovince: string (nullable = true)
 |-- height: string (nullable = true)
 |-- height_cm: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- shootscatches: string (nullable = true)



In [4]:
# ============================================================
# Cell 3: Column Name Standardisation (Bronze → Analytics-safe)
#
# Purpose:
#   - Enforce a consistent column naming standard across ALL
#     bronze dataframes before any business logic is applied.
#
# Why this is done here (and only once):
#   1. Column name cleaning is a STRUCTURAL concern, not
#      a business rule.
#   2. Doing this early prevents:
#        - case-sensitivity bugs
#        - join errors due to special characters
#        - inconsistent naming in Silver / Gold layers
#   3. Silver transformations should focus on DATA VALUES
#      and BUSINESS LOGIC, not schema hygiene.
#
# What this does:
#   - lowercases all column names
#   - replaces special characters with underscores
#   - removes duplicate underscores
#   - ensures snake_case consistency
#
# Example:
#   "Team Name"   → "team_name"
#   "Birth-Date" → "birth_date"
#
# This is applied ONCE and reused downstream.
# ============================================================

def clean_column(col_name: str) -> str:
    """Return ``col_name`` normalised to a lowercase, underscore-separated form.

    The name is lowercased, every maximal run of characters other than
    ``a-z`` / ``0-9`` (existing underscores included) is replaced by a single
    underscore, and leading/trailing underscores are removed.

    Examples:
        "Team Name"  -> "team_name"
        "Birth-Date" -> "birth_date"
    """
    lowered = col_name.lower()
    # One pass: a run of separators of any kind becomes exactly one "_".
    underscored = re.sub(r"[^a-z0-9]+", "_", lowered)
    return underscored.strip("_")

# Rename every Bronze column through clean_column(). toDF() assigns the
# new names positionally, so column order is unchanged.
standardised_names = list(map(clean_column, bronze_player_df.columns))
bronze_player_df = bronze_player_df.toDF(*standardised_names)

StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 6, Finished, Available, Finished)

This step builds the Silver Player dimension from bronze player data.

Goals:
- Uses `firstname` / `lastname` from bronze (already column-cleaned)
- Standardises name casing and trimming
- Derives `is_goalie` from `primaryposition`
- Converts weight to kilograms
- Preserves existing height_cm from bronze
- Expects one row per player (player grain); uniqueness is checked in Validation B


No joins are performed at this stage.
Team relationships and business questions are handled later in Gold.


In [5]:
# ============================================================
# Silver Layer – Player Dimension (Canonical Build)
# Purpose:
# - Standardise player attributes
# - Convert physical attributes to analytics-ready types
#
# Input:
# - bronze_player_df: the Bronze player frame loaded earlier in this
#   notebook, AFTER clean_column() has standardised its column names.
#   Building from it (rather than re-reading the raw table) is what
#   makes the column-standardisation step actually take effect.
#
# Output:
# - silver_player_df: one projected row per Bronze row (no joins,
#   no filtering, no de-duplication is performed here).
# ============================================================

from pyspark.sql import functions as F

# Pounds → kilograms conversion factor used for weight_kg.
LBS_TO_KG = 0.453592

# ------------------------------------------------------------
# Start from the column-standardised Bronze frame
# ------------------------------------------------------------
player_df = bronze_player_df

# ------------------------------------------------------------
# Build Silver player DataFrame
# ------------------------------------------------------------
silver_player_df = (
    player_df

    # -------------------------
    # Standardise names: trim surrounding whitespace, then
    # capitalise the first letter of each word
    # -------------------------
    .withColumn("first_name", F.initcap(F.trim(F.col("firstname"))))
    .withColumn("last_name",  F.initcap(F.trim(F.col("lastname"))))

    # -------------------------
    # Derive goalie flag
    # NOTE: the comparison yields NULL (not False) when
    # primaryposition is NULL; Validation C checks for that.
    # -------------------------
    .withColumn(
        "is_goalie",
        F.col("primaryposition") == F.lit("G")
    )

    # -------------------------
    # HEIGHT: string → int (cm)
    # Bronze already has height_cm as string/decimal
    # Example: "185.42" → 185
    # A value that cannot be cast to double becomes NULL.
    # -------------------------
    .withColumn(
        "height_cm",
        F.round(F.col("height_cm").cast("double")).cast("int")
    )

    # -------------------------
    # WEIGHT: lbs → kg → int
    # NHL source weight is in pounds
    # A value that cannot be cast to double becomes NULL.
    # -------------------------
    .withColumn(
        "weight_kg",
        F.round(F.col("weight").cast("double") * LBS_TO_KG).cast("int")
    )

    # -------------------------
    # Select Silver schema (renames Bronze columns to snake_case
    # Silver names and drops everything not listed)
    # -------------------------
    .select(
        F.col("player_id").cast("string"),
        "first_name",
        "last_name",
        F.col("primaryposition").alias("position"),
        F.col("shootscatches").alias("shoots_catches"),
        "is_goalie",
        "height_cm",
        "weight_kg",
        F.col("birthdate").cast("date").alias("birth_date"),
        "nationality"
    )
)


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 7, Finished, Available, Finished)

In [6]:
# ============================================================
# Validation A: Schema and row count sanity
# What to look for:
#   - the Silver columns and their types match expectations
#   - the number of players is plausible and stable between runs
# ============================================================

silver_player_df.printSchema()

# Bare last expression so the notebook renders the row count.
silver_row_count = silver_player_df.count()
silver_row_count


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 8, Finished, Available, Finished)

root
 |-- player_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- position: string (nullable = true)
 |-- shoots_catches: string (nullable = true)
 |-- is_goalie: boolean (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- nationality: string (nullable = true)



3925

In [7]:
# ============================================================
# Validation B: Primary key uniqueness
# Rule:
#   - Exactly one row per player_id
#
# The offending keys are displayed first (for diagnosis) and the
# rule is then asserted, so a "Run All" stops here instead of
# overwriting the Silver table with duplicate player rows.
# ============================================================

duplicate_player_ids_df = (
    silver_player_df
    .groupBy("player_id")
    .count()
    .filter(F.col("count") > 1)
)

# Same display as before: an empty table means the rule holds.
duplicate_player_ids_df.show()

duplicate_key_count = duplicate_player_ids_df.count()
assert duplicate_key_count == 0, (
    f"{duplicate_key_count} player_id value(s) appear more than once in "
    "silver_player_df; the player dimension must have one row per player_id"
)


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 9, Finished, Available, Finished)

+---------+-----+
|player_id|count|
+---------+-----+
+---------+-----+



In [8]:
# ============================================================
# Validation C: Mandatory field null checks
#
# Prints "<field>: <n> nulls" for every mandatory column.
# All counts come from ONE aggregation (a single Spark job)
# rather than one filter+count job per column.
# ============================================================

mandatory_fields = [
    "player_id",
    "first_name",
    "last_name",
    "position",
    "is_goalie"
]

# when(isNull, 1) is NULL for non-null rows, and count() skips NULLs,
# so each aggregate is the number of NULLs in that column. count()
# returns 0 (never NULL), including when the frame is empty.
null_counts = silver_player_df.agg(
    *[
        F.count(F.when(F.col(field).isNull(), F.lit(1))).alias(field)
        for field in mandatory_fields
    ]
).first()

for field in mandatory_fields:
    print(f"{field}: {null_counts[field]} nulls")


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 10, Finished, Available, Finished)

player_id: 0 nulls


first_name: 0 nulls


last_name: 0 nulls
position: 0 nulls


is_goalie: 0 nulls


In [9]:
# ============================================================
# Validation D: Position domain enforcement
# Lists every distinct position code with its row count so
# unexpected codes are visible at a glance.
# ============================================================

position_counts = silver_player_df.groupBy("position").count()
position_counts.orderBy("position").show()


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 11, Finished, Available, Finished)

+--------+-----+
|position|count|
+--------+-----+
|       C|  932|
|       D| 1208|
|       G|  434|
|      LW|  709|
|      RW|  642|
+--------+-----+



#### What each value means (domain knowledge)

| Code   | Meaning    | Role                          |
| ------ | ---------- | ----------------------------- |
| **C**  | Center     | Offensive playmaker, faceoffs |
| **LW** | Left Wing  | Attacker (left side)          |
| **RW** | Right Wing | Attacker (right side)         |
| **D**  | Defenseman | Defensive role                |
| **G**  | Goalie     | Goal prevention               |

**Attributes:** Position does not change per game, so it should live in the player dimension and should NOT be re-derived from game stats repeatedly.

In [10]:
# ============================================================
# Validation E: shoots_catches domain check
# Lists every distinct shoots_catches value with its row count.
# ============================================================

handedness_counts = silver_player_df.groupBy("shoots_catches").count()
handedness_counts.orderBy("shoots_catches").show()


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 12, Finished, Available, Finished)

+--------------+-----+
|shoots_catches|count|
+--------------+-----+
|             L| 2594|
|            NA|   17|
|             R| 1314|
+--------------+-----+



| Value | Meaning (domain knowledge)                     |
| ----- | ---------------------------------------------- |
| L     | Left-handed shooter (or goalie catches left)   |
| R     | Right-handed shooter (or goalie catches right) |
| NA    | Missing / unknown                              |

**How to interpret this**

- Left-handed players are more common → normal in hockey
- Only 17 `NA` values → a very minor data-quality issue
- This is a biological / physical attribute
- It does not vary by game

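The next check looks for two kinds of logical contradiction between `position` and `is_goalie`.

**Case 1 — Logical contradiction**

“This player plays Goalie, but we say they are NOT a goalie”

**Case 2 — Logical contradiction**

“This player does NOT play Goalie, but we say they ARE a goalie”

Expect 0 rows returned by both queries if the data is consistent.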

In [11]:
# ============================================================
# Validation F: Position ↔ Goalie consistency
# Both displays should be empty.
# ============================================================

plays_goalie = F.col("position") == "G"
flagged_goalie = F.col("is_goalie")

# Case 1: position says goalie, flag says not a goalie.
silver_player_df.filter(plays_goalie & ~flagged_goalie).show()

# Case 2: position says skater, flag says goalie.
silver_player_df.filter((F.col("position") != "G") & flagged_goalie).show()


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 13, Finished, Available, Finished)

+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+
|player_id|first_name|last_name|position|shoots_catches|is_goalie|height_cm|weight_kg|birth_date|nationality|
+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+
+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+

+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+
|player_id|first_name|last_name|position|shoots_catches|is_goalie|height_cm|weight_kg|birth_date|nationality|
+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+
+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+



Show any players whose height or weight is zero or negative. Expect 0 rows returned.

In [12]:
# ============================================================
# Validation G: Physical attribute sanity checks
#
# 1. Rows whose height or weight is zero or negative
#    (expected: empty table).
# 2. How many rows have NULL height or weight. A `<= 0` comparison
#    evaluates to NULL for NULL inputs and filter() drops those rows,
#    so values that failed the string → number cast upstream would
#    otherwise pass check 1 unnoticed.
# ============================================================

physical_columns = ["height_cm", "weight_kg"]

# Check 1: non-positive measurements.
silver_player_df.filter(
    (F.col("height_cm") <= 0) | (F.col("weight_kg") <= 0)
).show()

# Check 2: missing measurements, counted in a single pass.
silver_player_df.select(
    *[
        F.count(F.when(F.col(column).isNull(), F.lit(1))).alias(f"{column}_nulls")
        for column in physical_columns
    ]
).show()


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 14, Finished, Available, Finished)

+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+
|player_id|first_name|last_name|position|shoots_catches|is_goalie|height_cm|weight_kg|birth_date|nationality|
+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+
+---------+----------+---------+--------+--------------+---------+---------+---------+----------+-----------+



In [16]:
# ============================================================
# Validation H: Visual check, plus rows vs. distinct players
# per position (the two counts should match for every position).
# ============================================================

# First 10 rows by player_id, with full (untruncated) cell values.
silver_player_df.orderBy("player_id").show(10, truncate=False)

per_position_summary = (
    silver_player_df
    .groupBy("position")
    .agg(
        F.count("*").alias("rows"),
        F.countDistinct("player_id").alias("unique_players"),
    )
)
per_position_summary.show()




StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 15, Finished, Available, Finished)

+---------+----------+----------+--------+--------------+---------+---------+---------+----------+-----------+
|player_id|first_name|last_name |position|shoots_catches|is_goalie|height_cm|weight_kg|birth_date|nationality|
+---------+----------+----------+--------+--------------+---------+---------+---------+----------+-----------+
|8444894  |Greg      |Adams     |LW      |L             |false    |193      |89       |1963-08-15|CAN        |
|8444919  |Tommy     |Albelin   |D       |L             |false    |188      |88       |1964-05-21|SWE        |
|8445000  |Dave      |Andreychuk|LW      |R             |false    |193      |102      |1963-09-29|CAN        |
|8445176  |Donald    |Audette   |RW      |R             |false    |173      |87       |1969-09-23|CAN        |
|8445266  |Murray    |Baron     |D       |L             |false    |191      |107      |1967-06-01|CAN        |
|8445275  |Tom       |Barrasso  |G       |R             |true     |191      |95       |1965-03-31|USA        |
|

+--------+----+--------------+
|position|rows|unique_players|
+--------+----+--------------+
|      LW| 709|           709|
|       D|1208|          1208|
|       C| 932|           932|
|      RW| 642|           642|
|       G| 434|           434|
+--------+----+--------------+



In [13]:
## Canonical Silver write (player)
# Replace the Delta table's contents on every run; overwriteSchema
# lets the table schema be replaced together with the data.
silver_writer = (
    silver_player_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("NHL_Lakehouse_Silver.silver_players")


StatementMeta(, 47def893-c044-4171-8f30-020968999e51, 16, Finished, Available, Finished)


 EXPLAINER: What can be analysed with silver_team_df and
            silver_player_df

 Scope clarification:
   These datasets represent the SILVER layer.
   They provide clean, validated, descriptive entities.
   They do NOT answer business performance questions.

 
 Analysis possible with silver_player_df ONLY

   - Player demographics (nationality, age, physical attributes)
   - Position distribution (C, LW, RW, D, G)
   - Goalie vs skater population
   - Left vs right handedness
   - Data quality checks (nulls, outliers, inconsistencies)


 



