In [0]:
from pyspark.sql.functions import udf, when, col, regexp_extract
from pyspark.sql.types import StringType
import uuid

def generate_uuid() -> str:
    """Create a new random (version 4) UUID and return its string form."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
# generate_uuid returns a different value on every call, so the UDF is declared
# non-deterministic. Spark treats UDFs as deterministic by default and may then
# eliminate or repeat invocations during optimization, which would make the
# generated IDs unreliable.
uuid_udf = udf(generate_uuid, StringType()).asNondeterministic()

df_participation = spark.read.table("imdb_dev.bronze.title_principals")

# People whose primaryName is the "\N" placeholder are treated as deleted;
# their rows are removed from df_participation by the anti-join below.
df_person_deleted = spark.read.table("imdb_dev.bronze.name_basics")
df_person_deleted = df_person_deleted.filter(col("primaryName") == "\\N")

# 81% of jobs are empty, so the column is dropped. 51% of characters are empty
# because the field is not applicable; those are set to "Unnamed Background Character".
df_participation = (
    df_participation.drop("job", "ordering")
    # Anti-join before generating IDs so the Python UDF only runs on rows that
    # are kept. The resulting column order is the same as generating IDs first.
    .join(df_person_deleted.select("nconst"), on="nconst", how="left_anti")
    .withColumn("participationId", uuid_udf())
    .withColumnRenamed("tconst", "titleId")
    .withColumnRenamed("nconst", "personId")
    .withColumnRenamed("characters", "characterName")
    # Replace the "\N" placeholder with a default in the same bracketed,
    # quoted form so the extraction below handles every row uniformly.
    .withColumn("characterName", when(col("characterName") == "\\N", '["Unnamed Background Character"]').otherwise(col("characterName")))
    # Keep the text of the first double-quoted entry only. The capture group is
    # reluctant ([^"]+?) so that the trailing \s* actually strips whitespace
    # before the closing quote; a greedy group would swallow it.
    .withColumn("characterName", regexp_extract(col("characterName"), r'"\s*([^"]+?)\s*"', 1))
).cache()  # reused by the category and participation cells further down

In [0]:
# Build the category lookup: one row per distinct category value, each tagged
# with a generated categoryId.
df_category = (
    df_participation
    .select("category")
    .distinct()
    .withColumn("categoryId", uuid_udf())
    # insertInto matches columns by position, not by name, so the order is
    # fixed explicitly here (presumably the target table's column order - verify).
    .select("categoryId", "category")
)

# Overwrite the silver Category table with the freshly generated lookup.
(
    df_category.write
    .format("delta")
    .mode("overwrite")
    .insertInto("imdb_dev.silver.Category")
)

In [0]:
# Load the category lookup from the silver table.
df_category = spark.read.table("imdb_dev.silver.Category")

# Attach categoryId by matching on the category text, then keep only the output
# columns in a fixed order. The category text itself is not selected, so it is
# dropped from the result.
df_participation = (
    df_participation
    .join(df_category, on="category", how="left")
    .select("participationId", "titleId", "personId", "categoryId", "characterName")
)

# Overwrite the silver Participation table; insertInto matches columns by
# position, which is why the select above fixes their order.
(
    df_participation.write
    .format("delta")
    .mode("overwrite")
    .insertInto("imdb_dev.silver.Participation")
)