In [None]:

# =========================================================
# Silver Layer – Dimension Build (Teams)
# Purpose:
# - Clean and standardize Bronze NHL data
# - Produce Silver DataFrames (no persistence here)
# =========================================================

from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap
import re


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 19, Finished, Available, Finished)

In [None]:
# ============================================
# Load Bronze TEAM table
# ============================================

# Read the raw team reference data from the Bronze lakehouse.
# This must NOT point at the Silver table: this notebook overwrites
# NHL_Lakehouse_Silver.silver_teams at the end, and the Silver schema no
# longer carries the Bronze column names (teamname, abbreviation,
# shortname) that the dimension build below selects.
bronze_team_df = spark.read.table("nhl_lakehouse_bronze.nhl_team_info")


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 20, Finished, Available, Finished)

In [None]:
# Data exploration
# ---------------------------------------------
# Schema inspection:  Team
# ---------------------------------------------

# Print a labelled header, then the column names, types and nullability
# of the team dataframe loaded above.
print("\n=== bronze_team_df schema ===")
bronze_team_df.printSchema()


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 21, Finished, Available, Finished)


=== bronze_team_df schema ===
root
 |-- team_id: string (nullable = true)
 |-- franchiseid: integer (nullable = true)
 |-- shortname: string (nullable = true)
 |-- teamname: string (nullable = true)
 |-- abbreviation: string (nullable = true)
 |-- link: string (nullable = true)



In [None]:
# ============================================================
# Cell 3: Column Name Standardisation (Bronze → Analytics-safe)
#
# Purpose:
#   - Enforce a consistent column naming standard across ALL
#     bronze dataframes before any business logic is applied.
#
# Why this is done here (and only once):
#   1. Column name cleaning is a STRUCTURAL concern, not
#      a business rule.
#   2. Doing this early prevents:
#        - case-sensitivity bugs
#        - join errors due to special characters
#        - inconsistent naming in Silver / Gold layers
#   3. Silver transformations should focus on DATA VALUES
#      and BUSINESS LOGIC, not schema hygiene.
#
# What this does:
#   - lowercases all column names
#   - replaces special characters with underscores
#   - removes duplicate underscores
#   - ensures snake_case consistency
#
# Example:
#   "Team Name"   → "team_name"
#   "Birth-Date" → "birth_date"
#
# This is applied ONCE and reused downstream.
# ============================================================

def clean_column(col_name: str) -> str:
    """Return *col_name* normalised to a lowercase, underscore-separated form.

    The name is lowercased, every character other than ``a-z``, ``0-9`` or
    ``_`` becomes an underscore, runs of underscores collapse to one, and
    leading/trailing underscores are removed.
    """
    lowered = col_name.lower()
    underscored = re.sub(r"[^a-z0-9_]", "_", lowered)
    collapsed = re.sub(r"_+", "_", underscored)
    return collapsed.strip("_")

# Rename every column of the team dataframe by passing its current name
# through clean_column (column order is unchanged)
bronze_team_df = bronze_team_df.toDF(*map(clean_column, bronze_team_df.columns))



StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 22, Finished, Available, Finished)

This step derives the `is_active` flag for teams based on factual evidence
rather than assumptions.

A team is considered ACTIVE if it appears in at least one recorded game
(either as a home team or an away team). Teams with no associated games
are marked as inactive.

Why this is done here (Silver layer):
- Prevents assuming all teams are active
- Identifies orphan / inactive teams safely
- Makes `is_active` a derived attribute, not a guessed one
- Ensures downstream joins (facts, Gold layer) are reliable and auditable

This logic intentionally does NOT rely on conference or division columns,
as no validated relationship to the source CSVs was found.


In [None]:
from pyspark.sql import functions as F

# Source tables: team reference data and the game log
team_df = spark.table("nhl_lakehouse_bronze.nhl_team_info")
game_df = spark.table("nhl_lakehouse_bronze.nhl_game")

# A game row belongs to a team when the team played it at home or away
team_plays_in_game = (
    (team_df["team_id"] == game_df["home_team_id"])
    | (team_df["team_id"] == game_df["away_team_id"])
)

# Left join so that teams without any game are kept (game columns are null)
team_game_df = team_df.join(game_df, on=team_plays_in_game, how="left")

# One row per team. count() ignores the null game_id of unmatched teams,
# so game_count is 0 exactly for teams that never appear in a game.
silver_team_df = (
    team_game_df
    .groupBy(
        team_df["team_id"],
        team_df["teamname"],
        team_df["abbreviation"],
        team_df["shortname"],
    )
    .agg(F.count(game_df["game_id"]).alias("game_count"))
    .select(
        F.col("team_id"),
        F.col("teamname").alias("team_name"),
        F.col("abbreviation").alias("team_abbreviation"),
        F.col("shortname").alias("city"),
        (F.col("game_count") > 0).alias("is_active"),
    )
)


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 23, Finished, Available, Finished)

### The below validation checks three things:

1. Total number of distinct teams in the derived Silver dataframe
   - Confirms expected NHL team count (33)

2. Presence of inactive teams
   - Ensures `is_active` is derived correctly
   - Prevents assuming all teams are active

3. Team ID integrity
   - Explains why team_id values are non-sequential
   - Confirms IDs are source-system identifiers, not row indexes


In [None]:
from pyspark.sql import functions as F

# 1. Total number of teams: keep one row per team_id, then count the rows
silver_team_df.dropDuplicates(["team_id"]).count()


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 24, Finished, Available, Finished)

33

In [None]:
# 2. Number of teams per is_active value, shown in is_active order
(
    silver_team_df
    .groupBy("is_active")
    .count()
    .sort("is_active")
    .show()
)


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 25, Finished, Available, Finished)

+---------+-----+
|is_active|count|
+---------+-----+
|     true|   33|
+---------+-----+



In [None]:
# 3. Show every team whose is_active flag is false, without truncating values
silver_team_df.where(~F.col("is_active")).show(truncate=False)


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 26, Finished, Available, Finished)

+-------+---------+-----------------+----+---------+
|team_id|team_name|team_abbreviation|city|is_active|
+-------+---------+-----------------+----+---------+
+-------+---------+-----------------+----+---------+



In [None]:
# 4. Inspect the five lowest team_id values (source identifiers, not row indexes)
(
    silver_team_df
    .select("team_id")
    .sort("team_id")
    .show(n=5)
)


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 27, Finished, Available, Finished)

+-------+
|team_id|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
+-------+
only showing top 5 rows



The team_id column is NOT an index.

It is a source-system identifier coming from the NHL data,
which explains why IDs:
- Are not sequential
- May skip numbers (e.g. expansion, relocation, retired teams)

Row position ≠ team_id.
The correct team count is validated via DISTINCT team_id = 33.


In [None]:
# Preview the first five rows of the derived team dataframe
silver_team_df.show(n=5)

StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 28, Finished, Available, Finished)

+-------+---------+-----------------+----------+---------+
|team_id|team_name|team_abbreviation|      city|is_active|
+-------+---------+-----------------+----------+---------+
|     19|    Blues|              STL|  St Louis|     true|
|      1|   Devils|              NJD|New Jersey|     true|
|      9| Senators|              OTT|    Ottawa|     true|
|     11|Thrashers|              ATL|   Atlanta|     true|
|     17|Red Wings|              DET|   Detroit|     true|
+-------+---------+-----------------+----------+---------+
only showing top 5 rows



In [None]:
# Show column names, types and nullability of the derived team dataframe
silver_team_df.printSchema()


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 29, Finished, Available, Finished)

root
 |-- team_id: integer (nullable = true)
 |-- team_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_active: boolean (nullable = false)



In [None]:
# ============================================================
# Cell 4: Build SILVER TEAM dimension
#
# Grain:
#   - One row per team (current-state snapshot)
#
# Inclusion criteria:
#   - Only columns that are validated across the 13 CSV files
#   - No conference / division (not joinable or derivable)
#
# Rules applied here:
#   ✓ Select required attributes only
#   ✓ Standardise TEXT VALUES (not column names)
#   ✓ Deduplicate on team_id
#   ✓ Derive is_active from game appearances (home or away);
#     it is never hard-coded
#
# NOTE:
#   - The only join is the lookup against the Bronze game table
#     that is required to derive is_active
#   - No business filters
#   - No assumptions beyond the dataset
# ============================================================

from pyspark.sql import functions as F

# Distinct ids of teams that appear in at least one game, as home or away.
# Ids are cast to string to match the Silver team_id type.
# NOTE(review): assumes team ids have the same textual form in the team
# and game tables once cast to string - confirm against Bronze.
bronze_game_df = spark.table("nhl_lakehouse_bronze.nhl_game")

teams_with_games_df = (
    bronze_game_df.select(F.col("home_team_id").alias("team_id"))
        .union(bronze_game_df.select(F.col("away_team_id").alias("team_id")))
        .where(F.col("team_id").isNotNull())
        .select(F.col("team_id").cast("string").alias("team_id"))
        .distinct()
        .withColumn("has_games", F.lit(True))
)

silver_team_df = (
    bronze_team_df
        .select(
            F.col("team_id").cast("string").alias("team_id"),
            F.initcap(F.trim("teamname")).alias("team_name"),
            F.upper(F.trim("abbreviation")).alias("team_abbreviation"),
            F.initcap(F.trim("shortname")).alias("city")
        )
        .dropDuplicates(["team_id"])
        # Left join keeps every team; teams without games get a null marker
        .join(teams_with_games_df, on="team_id", how="left")
        # A missing marker means the team has no games, so it is inactive
        .withColumn("is_active", F.coalesce(F.col("has_games"), F.lit(False)))
        .drop("has_games")
)

silver_team_df.printSchema()




StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 30, Finished, Available, Finished)

root
 |-- team_id: string (nullable = true)
 |-- team_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_active: boolean (nullable = false)



In [None]:
# Canonical Silver write (team): replace both the data and the schema of
# the managed Delta table with the current silver_team_df
(
    silver_team_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("NHL_Lakehouse_Silver.silver_teams")
)


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 31, Finished, Available, Finished)

EXPLAINER: What can be analysed with silver_team_df 

Scope clarification: These datasets represent the SILVER layer. They provide clean, validated, descriptive entities. They do NOT answer business performance questions.

Analysis possible with silver_team_df ONLY

Count of total teams
Identification of inactive / orphan teams
Team naming and abbreviation consistency checks
Safe referential base for future fact joins

In [None]:
# Read the persisted Silver table back and print its schema as a write check
spark.table("NHL_Lakehouse_Silver.silver_teams").printSchema()


StatementMeta(, 22608885-3706-4c5b-ab68-57e8d17f21ce, 32, Finished, Available, Finished)

root
 |-- team_id: string (nullable = true)
 |-- team_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_active: boolean (nullable = true)

