In [0]:
from pyspark.sql.functions import col, split, explode, to_date, trim, regexp_replace, when

DataFrame[]

* Making sure everything is set up

In [0]:
print("dim_anime")

# Bronze source for the anime dimension.
raw_anime_df = spark.read.table("anime_warehouse.bronze.raw_anime")

# Projection for the Silver table, in output column order: numeric fields are
# cast explicitly, the title is trimmed, and both dates are parsed with the
# yyyy-MM-dd pattern. The remaining columns pass through unchanged.
anime_columns = [
    col("anime_id").cast("int"),
    trim(col("title")).alias("title"),
    col("score").cast("double"),
    col("rank").cast("int"),
    col("popularity").cast("int"),
    col("members").cast("long"),
    col("synopsis"),
    to_date(col("start_date"), "yyyy-MM-dd").alias("start_date"),
    to_date(col("end_date"), "yyyy-MM-dd").alias("end_date"),
    col("type"),
    col("episodes").cast("int"),
    col("image_url"),
]

# One row per anime_id. No ordering is applied before the dedupe, so this code
# does not control which of several rows sharing an anime_id is kept.
silver_anime_df = raw_anime_df.select(*anime_columns).dropDuplicates(["anime_id"])

# Replace the Silver table (data and schema) on every run.
(
    silver_anime_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("anime_warehouse.silver.dim_anime")
)

dim_anime


In [0]:
print("dim_genres")

# Bronze source: one row per (anime_id, genre) with raw genre labels.
raw_genres_df = spark.read.table("anime_warehouse.bronze.raw_anime_genres")

# Clean the genre label in three passes, then dedupe:
#   1. trim   - must run first: the de-stutter pattern below is anchored with
#               ^ and $, so leading/trailing spaces would stop it from matching
#               (e.g. "Action Action " would stay stuttered).
#   2. de-stutter - collapse a label that is the same text repeated twice,
#               separated by whitespace ("Action Action" -> "Action").
#   3. strip the "Theme::" / "Demographic::" markers wherever they occur.
silver_genres_df = (raw_genres_df
    .withColumn("genre_clean", trim(col("genre")))
    .withColumn("genre_clean", regexp_replace(col("genre_clean"), r"^(.+)\s+\1$", "$1"))
    .withColumn("genre_clean", regexp_replace(col("genre_clean"), "Theme::|Demographic::", ""))
    .select(
        col("anime_id").cast("int"),
        # Final trim removes any space left behind once a marker is stripped.
        trim(col("genre_clean")).alias("genre")
    )
    # Cleaning can make previously distinct rows identical, so dedupe afterwards.
    .dropDuplicates()
)

# Save to Silver, replacing both data and schema.
silver_genres_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("anime_warehouse.silver.dim_genres")

dim_genres


In [0]:
print("dim_staff")

# Bronze source: staff credits, with `role` holding a ", "-separated list.
raw_staff_df = spark.read.table("anime_warehouse.bronze.raw_anime_staff")

# One output row per individual role: split the list on ", " and explode it in
# a single step, then keep only the typed ids and the trimmed role text.
# NOTE(review): explode() emits no rows for a null array, so credits whose
# `role` is null would not reach Silver - confirm that is intended.
exploded_staff_df = raw_staff_df.withColumn(
    "role_exploded", explode(split(col("role"), ", "))
)

silver_staff_df = exploded_staff_df.select(
    col("anime_id").cast("int"),
    col("person_id").cast("int"),
    trim(col("role_exploded")).alias("role"),
).dropDuplicates()

# Save to Silver, replacing both data and schema.
(
    silver_staff_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("anime_warehouse.silver.dim_staff")
)

dim_staff


In [0]:
print("dim_entities")

# Bronze source for all entities.
raw_entities_df = spark.read.table("anime_warehouse.bronze.raw_entities")

# Silver projection: typed id, entity type as-is, trimmed name, image URL as-is.
entity_columns = [
    col("entity_id").cast("int"),
    col("entity_type"),  # character, voice_actor, studio, etc
    trim(col("name")).alias("name"),
    col("image_url"),
]

# One row per entity_id.
# NOTE(review): the dedupe key ignores entity_type. If ids are only unique
# within a type, rows of different types sharing an id would be collapsed -
# confirm entity_id is globally unique in the Bronze table.
silver_entities_df = (
    raw_entities_df
    .select(*entity_columns)
    .dropDuplicates(["entity_id"])
)

# Save to Silver, replacing both data and schema.
(
    silver_entities_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("anime_warehouse.silver.dim_entities")
)

dim_entities


In [0]:
print("Relationship Bridges")


def write_bridge(source_table, target_table, id_columns, label_column):
    """Build one Silver bridge table from a Bronze relationship table.

    Reads ``source_table``, casts each column in ``id_columns`` to int, trims
    ``label_column`` (keeping its name), drops fully duplicated rows and
    overwrites ``target_table`` as a Delta table.

    Args:
        source_table: Fully qualified name of the Bronze table to read.
        target_table: Fully qualified name of the Silver table to overwrite.
        id_columns: Names of the key columns, in output order, to cast to int.
        label_column: Name of the text column to trim; emitted last.
    """
    id_exprs = [col(name).cast("int") for name in id_columns]
    bridge_df = (
        spark.read.table(source_table)
        .select(*id_exprs, trim(col(label_column)).alias(label_column))
        .dropDuplicates()
    )
    (
        bridge_df.write
        .format("delta")
        .mode("overwrite")
        # Same option as the dim_* writes in this notebook, so a re-run after
        # a column type/shape change replaces the schema instead of failing.
        .option("overwriteSchema", "true")
        .saveAsTable(target_table)
    )


# Voice Actors Bridge
write_bridge(
    "anime_warehouse.bronze.raw_anime_voice_actors",
    "anime_warehouse.silver.bridge_voice_actors",
    ["character_id", "person_id"],
    "language",
)

# Company Relationship Bridge
write_bridge(
    "anime_warehouse.bronze.raw_anime_companies",
    "anime_warehouse.silver.bridge_anime_companies",
    ["anime_id", "company_id"],
    "role",
)

# Character Roles Bridge
write_bridge(
    "anime_warehouse.bronze.raw_anime_characters",
    "anime_warehouse.silver.bridge_anime_characters",
    ["anime_id", "character_id"],
    "role",
)

print("All Silver transformations done")

Relationship Bridges
All Silver transformations done
