In [10]:
# Setup environment
from pyspark.sql import Row
from pyspark.sql.functions import col

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 12, Finished, Available, Finished)

In [11]:
# Load the team table from the bronze lakehouse into a Spark DataFrame.
# The fully qualified name is kept in a variable so the source is easy to swap.
table_of_interest = "NHL_Lakehouse_Bronze.nhl_team_info"
df = spark.table(table_of_interest)

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 13, Finished, Available, Finished)

In [12]:
# Show how many rows were loaded from the source table.
print(df.count())

# Store team_id as a string; the ids in missing_team_ids are strings
# (including the non-numeric "NA") and are later matched on this column.
df = df.withColumn("team_id", col("team_id").astype("string"))

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 14, Finished, Available, Finished)

33


In [13]:
# Team ids that should receive a placeholder row.
# Every entry is a string; "NA" is a literal id value, not a null marker.
missing_team_ids = [
    "87",
    "88",
    "89",
    "90",
    "NA",
]

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 15, Finished, Available, Finished)

In [14]:
# Inspect the DataFrame structure: column names, data types and nullability.
# printSchema() already lists every column name, so no separate df.columns
# expression is needed (a non-final bare expression in a cell is not displayed).
df.printSchema()

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 16, Finished, Available, Finished)

root
 |-- team_id: string (nullable = true)
 |-- franchiseid: integer (nullable = true)
 |-- shortname: string (nullable = true)
 |-- teamname: string (nullable = true)
 |-- abbreviation: string (nullable = true)
 |-- link: string (nullable = true)



In [15]:
# Append one placeholder row to the team table for every id in
# missing_team_ids that the table does not already contain.
# Inputs from earlier cells: `df` (team table) and `missing_team_ids`.

# 1) Build a DataFrame with one row per missing team_id:
#    - team_id  = the missing id
#    - teamname = "Missing Details"
#    - every other column = NULL
cols = df.columns

# Collapse repeated ids (first occurrence wins, order preserved) so that a
# duplicated entry in the list cannot produce duplicate placeholder rows.
unique_missing_ids = list(dict.fromkeys(missing_team_ids))

rows = [
    Row(
        **{
            c: (
                team_id if c == "team_id"
                else "Missing Details" if c == "teamname"
                else None
            )
            for c in cols
        }
    )
    for team_id in unique_missing_ids
]

# Reuse df's schema so the placeholder frame has exactly the same columns and
# types as the real data (the all-NULL columns could not be inferred anyway).
missing_df = spark.createDataFrame(rows, schema=df.schema)

# 2) Keep only ids that are not already present in df; the left anti join
#    drops every placeholder whose team_id matches an existing row.
existing_ids = df.select("team_id").distinct()
missing_df = missing_df.join(existing_ids, on="team_id", how="left_anti")

# 3) Append the placeholder rows, matching columns by name.
df_final = df.unionByName(missing_df)

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 17, Finished, Available, Finished)

In [16]:
# Display the rows of df_final whose team_id is one of the requested missing
# ids. The filter reuses missing_team_ids so this check cannot drift from the
# list the placeholders were built from; `col` comes from the setup cell.
df_final.filter(
    col("team_id").isin(missing_team_ids)
).show(truncate=False)



StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 18, Finished, Available, Finished)

+-------+-----------+---------+---------------+------------+----+
|team_id|franchiseid|shortname|teamname       |abbreviation|link|
+-------+-----------+---------+---------------+------------+----+
|87     |NULL       |NULL     |Missing Details|NULL        |NULL|
|88     |NULL       |NULL     |Missing Details|NULL        |NULL|
|89     |NULL       |NULL     |Missing Details|NULL        |NULL|
|90     |NULL       |NULL     |Missing Details|NULL        |NULL|
|NA     |NULL       |NULL     |Missing Details|NULL        |NULL|
+-------+-----------+---------+---------------+------------+----+



In [17]:
# Sanity check before writing. Printed on separate lines, in this order:
#   1. the last 12 rows of df_final (as a list of Row objects)
#   2. the row count of the original df
#   3. the row count of df_final
print(df_final.tail(12), df.count(), df_final.count(), sep="\n")

# Schema of the original df.
df.printSchema()

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 19, Finished, Available, Finished)

[Row(team_id='29', franchiseid=36, shortname='Columbus', teamname='Blue Jackets', abbreviation='CBJ', link='/api/v1/teams/29'), Row(team_id='52', franchiseid=35, shortname='Winnipeg', teamname='Jets', abbreviation='WPG', link='/api/v1/teams/52'), Row(team_id='22', franchiseid=25, shortname='Edmonton', teamname='Oilers', abbreviation='EDM', link='/api/v1/teams/22'), Row(team_id='54', franchiseid=38, shortname='Vegas', teamname='Golden Knights', abbreviation='VGK', link='/api/v1/teams/54'), Row(team_id='12', franchiseid=26, shortname='Carolina', teamname='Hurricanes', abbreviation='CAR', link='/api/v1/teams/12'), Row(team_id='53', franchiseid=28, shortname='Arizona', teamname='Coyotes', abbreviation='ARI', link='/api/v1/teams/53'), Row(team_id='11', franchiseid=35, shortname='Atlanta', teamname='Thrashers', abbreviation='ATL', link='/api/v1/teams/11'), Row(team_id='87', franchiseid=None, shortname=None, teamname='Missing Details', abbreviation=None, link=None), Row(team_id='88', franchis

In [18]:
# Persist df_final as the Delta table NHL_Lakehouse_Silver.silver_teams.
# mode="overwrite" replaces the existing table contents; the overwriteSchema
# option is passed through to the writer as the string "true".
df_final.write.saveAsTable(
    "NHL_Lakehouse_Silver.silver_teams",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

StatementMeta(, d46cd86a-7df4-4f6a-b482-3832441b40c9, 20, Finished, Available, Finished)