Create 'silver_date_dim' table:

In [1]:
from datetime import date
import pandas as pd
from pyspark.sql import functions as F

# Silver tables whose 'Period' column determines how far the date dimension extends
silver_tables = [
    "silver_hs2_ontario_us_trade_imports_data",
    "silver_hs2_ontario_us_trade_exports_data",
    "silver_napcs_ontario_us_trade_by_province_data",
    "silver_napcs_ontario_us_trade_by_country_data",
    "silver_napcs_ontario_us_trade_by_commodity_data",
    "silver_napcs_ontario_us_trade_price_volume_indices_data"
]

# Get the latest period date from each table.
# F.max() returns NULL (None) for an empty table or an all-NULL column, so a
# separate df.count() pass over every table is not needed.
latest_period_dates = []
for table in silver_tables:
    df = spark.read.table(table)
    if "Period" not in df.columns:
        continue
    latest_period_date = df.agg(F.max("Period")).collect()[0][0]
    if latest_period_date is not None:
        # Normalise to a pandas Timestamp so that values coming from tables
        # that store 'Period' with different types still compare safely in max().
        latest_period_dates.append(pd.to_datetime(latest_period_date))

if not latest_period_dates:
    raise ValueError("No valid 'Period' dates found in any of the silver tables.")

latest_period_date = pd.to_datetime(max(latest_period_dates))

# Create a daily date range from the fixed start date up to the latest period date
start_date = date(2020, 1, 1)
date_range = pd.date_range(start=start_date, end=latest_period_date, freq="D")

# Quarter label ("Q1".."Q4"), shared by the 'Quarter' and 'Quarter_Year' columns
quarter_label = "Q" + date_range.quarter.astype(str)

# Create date dimension DataFrame (one row per calendar day)
date_dim_df = pd.DataFrame({
    "Date": date_range.date,
    "Year": date_range.year,
    "Year_Month_Sort": date_range.strftime("%Y%m"),
    "Quarter": quarter_label,
    "Quarter_Year": quarter_label + " " + date_range.strftime("%Y"),
    "Month_Number": date_range.month,
    "Month_Name": date_range.strftime("%B"),
    "Month_Name_Short": date_range.strftime("%b"),
    "Month_Name_Year": date_range.strftime("%b %Y"),
})

# Save DataFrame as a managed delta table in the Silver Lakehouse
spark_df = spark.createDataFrame(date_dim_df)
spark_df.write.mode("overwrite").format("delta").saveAsTable("silver_date_dim")

print(f"\'silver_date_dim\' table created successfully. Date range: {start_date} - {latest_period_date.date()}")


StatementMeta(, a4646c4e-0f0e-46ca-9e89-a82150c1ee56, 3, Finished, Available, Finished)

'silver_date_dim' table created successfully. Date range: 2020-01-01 - 2025-08-01


Create 'silver_hs2_ontario_us_trade_combined_data' table:

In [1]:
from pyspark.sql import functions as F

# Load each HS2 source table from the Silver Lakehouse, tag it with its trade
# direction and keep only the columns needed for the combined view
hs2_imports_df = (
    spark.read.table("silver_hs2_ontario_us_trade_imports_data")
    .withColumn("Trade_Type", F.lit("Imports"))
    .select("Period", "Value_Millions", "Trade_Type")
)
hs2_exports_df = (
    spark.read.table("silver_hs2_ontario_us_trade_exports_data")
    .withColumn("Trade_Type", F.lit("Exports"))
    .select("Period", "Value_Millions", "Trade_Type")
)

# Stack imports on exports and collapse to one row per period per trade type
hs2_totals_df = (
    hs2_imports_df.unionByName(hs2_exports_df)
    .groupBy("Period", "Trade_Type")
    .agg(F.sum("Value_Millions").alias("Value_Millions"))
)

# Date dimension supplies the calendar attributes for each period
date_dim_df = spark.read.table("silver_date_dim")

# Left join keeps every aggregated row even when no matching date exists;
# 'Date' is dropped because it duplicates 'Period'. 'Period_Sort' (yyyyMM as
# an integer) is added for line ordering.
hs2_combined_df = (
    hs2_totals_df
    .join(date_dim_df, hs2_totals_df.Period == date_dim_df.Date, "left")
    .drop("Date")
    .withColumn("Period_Sort", F.date_format("Period", "yyyyMM").cast("int"))
)

# Persist as a managed delta table in the Silver Lakehouse
(
    hs2_combined_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("silver_hs2_ontario_us_trade_combined_data")
)

print("\'silver_hs2_ontario_us_trade_combined_data\' table created successfully.")


StatementMeta(, f9ded377-0408-45e1-9044-3b6928e0f1b1, 3, Finished, Available, Finished)

'silver_hs2_ontario_us_trade_combined_data' table created successfully.


Create 'silver_hs2_ontario_us_trade_data_combined' table:

In [1]:
from pyspark.sql import functions as F

# Columns pulled from both HS2 source tables
selected_columns = [
    "Year",
    "Trade_Type",
    "Principal_Trading_Partner_State",
    "HS2_Commodity_Code",
    "HS2_Commodity_Description_Standardized",
    "Value_Millions",
]

# Grouping keys for the aggregation ('Trade_Type' under its renamed form)
group_columns = [
    "Year",
    "Trade_Type_Standardized",
    "Principal_Trading_Partner_State",
    "HS2_Commodity_Code",
    "HS2_Commodity_Description_Standardized",
]

# Read the source tables from the Silver Lakehouse, trimmed to the relevant columns
hs2_imports_df = spark.read.table("silver_hs2_ontario_us_trade_imports_data").select(*selected_columns)
hs2_exports_df = spark.read.table("silver_hs2_ontario_us_trade_exports_data").select(*selected_columns)

# Union imports with exports, rename 'Trade_Type' to its standardized name,
# then total the value for every grouping-key combination
hs2_combined_df = (
    hs2_imports_df.unionByName(hs2_exports_df)
    .withColumnRenamed("Trade_Type", "Trade_Type_Standardized")
    .groupBy(*group_columns)
    .agg(F.sum("Value_Millions").alias("Total_Value_Millions"))
)

# Persist as a managed delta table in the Silver Lakehouse
(
    hs2_combined_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("silver_hs2_ontario_us_trade_data_combined")
)

print("\'silver_hs2_ontario_us_trade_data_combined\' table created successfully.")


StatementMeta(, 8e4e94d4-3a4b-43ec-819c-c7a086ac0088, 3, Finished, Available, Finished)

'silver_hs2_ontario_us_trade_data_combined' table created successfully.


Create 'partner_state_dim' table:

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# The single column that makes up this dimension
state_column = "Principal_Trading_Partner_State"

# Source tables from the Silver Lakehouse
imports_df = spark.read.table("silver_hs2_ontario_us_trade_imports_data")
exports_df = spark.read.table("silver_hs2_ontario_us_trade_exports_data")

# Partner states seen on the import side and on the export side
imports_states = imports_df.select(col(state_column))
exports_states = exports_df.select(col(state_column))

# Stack both sides and keep each state once
union_states = imports_states.union(exports_states)
partner_state_dim = union_states.distinct()

# Persist as a managed delta table in the Silver Lakehouse
(
    partner_state_dim.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("partner_state_dim")
)

print("\'partner_state_dim\' table created successfully.")

StatementMeta(, 8840b052-c2d2-4358-8e89-8a2c862eb369, 3, Finished, Available, Finished)

'partner_state_dim' table created successfully.


Create 'commodity_dim' table:

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# Code/description pair that identifies a commodity
commodity_columns = [
    col("HS2_Commodity_Code"),
    col("HS2_Commodity_Description_Standardized"),
]

# Source tables from the Silver Lakehouse
imports_df = spark.read.table("silver_hs2_ontario_us_trade_imports_data")
exports_df = spark.read.table("silver_hs2_ontario_us_trade_exports_data")

# Commodity pairs present in each source table
imports_commodities = imports_df.select(*commodity_columns)
exports_commodities = exports_df.select(*commodity_columns)

# Stack both sides and keep each distinct code/description combination once
union_commodities = imports_commodities.union(exports_commodities)
commodity_dim = union_commodities.distinct()

# Persist as a managed delta table in the Silver Lakehouse
(
    commodity_dim.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("commodity_dim")
)

print("\'commodity_dim\' table created successfully.")

StatementMeta(, 0d1895b0-1b33-4bf8-8a4f-e5e6f9a5b7dd, 6, Finished, Available, Finished)

'commodity_dim' table created successfully.


Create 'silver_napcs_ontario_us_trade_by_commodity_data_filtered' table:

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# Read source table from Silver Lakehouse
df = spark.read.table("silver_napcs_ontario_us_trade_by_commodity_data")

# Filter out the 'Total of all merchandise' row.
# A plain `!=` comparison evaluates to NULL when the description is NULL, and
# filter() discards rows whose predicate is NULL, so such rows would be dropped
# along with the total row. Negated null-safe equality removes only the total row.
df_filtered = df.filter(
    ~col("NAPCS_Commodity_Description").eqNullSafe("Total of all merchandise")
)

# Save DataFrame as a managed delta table in the Silver Lakehouse
df_filtered.write.mode("overwrite").format("delta").saveAsTable("silver_napcs_ontario_us_trade_by_commodity_data_filtered")

print("\'silver_napcs_ontario_us_trade_by_commodity_data_filtered\' table created successfully.")


StatementMeta(, ce0d918a-fcb7-402a-bbf8-e4b3f9ed0ae7, 3, Finished, Available, Finished)

'silver_napcs_ontario_us_trade_by_commodity_data_filtered' table created successfully.
