In [0]:
from pyspark.sql.functions import explode, split, udf, when, col, regexp_extract, mean, stddev, lit
from pyspark.sql.types import StringType, IntegerType
import random
import uuid

def generate_uuid():
    """Return a newly generated random UUID (uuid4) as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
# generate_uuid returns a different value on every call, so the UDF must be
# declared non-deterministic; otherwise Spark is free to duplicate or
# re-evaluate the call during optimisation and hand out inconsistent ids.
uuid_udf = udf(generate_uuid, StringType()).asNondeterministic()

def random_birth_year(mean_val, std_val):
    """Draw a synthetic birth year from a normal distribution.

    Args:
        mean_val: Mean of the distribution, or None.
        std_val: Standard deviation of the distribution, or None.

    Returns:
        The sampled value truncated to an int, or None when either parameter
        is None (e.g. the aggregate statistics passed in were null), so that
        the resulting column value stays null instead of the call raising.
    """
    if mean_val is None or std_val is None:
        return None
    return int(random.normalvariate(mean_val, std_val))
# The wrapped function is random, so mark the UDF non-deterministic to stop
# Spark from assuming repeated evaluations yield the same value.
random_birth_year_udf = udf(random_birth_year, IntegerType()).asNondeterministic()

def random_lifespan(mean_life, std_life, min_lifespan=35):
    """Draw a synthetic lifespan (in years) from a normal distribution.

    Args:
        mean_life: Mean of the distribution, or None.
        std_life: Standard deviation of the distribution, or None.
        min_lifespan: Lower bound applied to the sampled value (default 35).

    Returns:
        The sampled value truncated to an int and raised to at least
        ``min_lifespan``, or None when either distribution parameter is None
        so that the resulting column value stays null instead of raising.
    """
    if mean_life is None or std_life is None:
        return None
    lifespan = int(random.normalvariate(mean_life, std_life))
    return max(lifespan, min_lifespan)
# The wrapped function is random, so mark the UDF non-deterministic to stop
# Spark from assuming repeated evaluations yield the same value.
random_lifespan_udf = udf(random_lifespan, IntegerType()).asNondeterministic()

# Shape the bronze name_basics table for the silver layer:
#   * nconst is exposed as personId,
#   * the comma-separated profession / title strings become arrays,
#   * the year columns are cast to integers,
#   * rows whose primaryName is the literal "\N" placeholder are dropped
#     (53 records removed, per the original author's note).
df = (
    spark.read.table("imdb_dev.bronze.name_basics")
    .withColumnRenamed("nconst", "personId")
    .withColumn("primaryProfession", split(col("primaryProfession"), ","))
    .withColumn("knownForTitles", split(col("knownForTitles"), ","))
    .withColumn("birthYear", col("birthYear").cast(IntegerType()))
    .withColumn("deathYear", col("deathYear").cast(IntegerType()))
    .where(col("primaryName") != "\\N")
)

# Mean and standard deviation of the known birth years; these parameterise
# the normal distribution used later to fill in missing birth years.
stats = df.agg(
    mean("birthYear").alias("mean"),
    stddev("birthYear").alias("stddev"),
).first()
mean_birth, std_birth = stats["mean"], stats["stddev"]

# Lifespan (deathYear - birthYear) for people where both years are known.
df_lifespan = df.where(
    col("birthYear").isNotNull() & col("deathYear").isNotNull()
).withColumn("lifespan", col("deathYear") - col("birthYear"))

# Mean and standard deviation of those lifespans; these parameterise the
# normal distribution used later to fill in missing death years.
stats = df_lifespan.agg(
    mean("lifespan").alias("mean"),
    stddev("lifespan").alias("stddev"),
).first()
mean_lifespan, std_lifespan = stats["mean"], stats["stddev"]

In [0]:
# Build the Profession table: one row per distinct profession found in
# primaryProfession, each given a generated professionId. The "\N"
# placeholder is relabelled "Other" after ids are assigned.
df_professions = (
    df.select(explode(col("primaryProfession")).alias("profession"))
    .distinct()
    .withColumn("professionId", uuid_udf())
    .withColumn(
        "profession",
        when(col("profession") == "\\N", "Other").otherwise(col("profession")),
    )
    # insertInto matches columns by position, so fix the column order here.
    .select("professionId", "profession")
)
(
    df_professions.write
    .format("delta")
    .mode("overwrite")
    .insertInto("imdb_dev.silver.Profession")
)

In [0]:
# Re-read the persisted Profession table so the professionIds used below are
# the ones actually stored in the silver layer.
df_professions = spark.read.table("imdb_dev.silver.Profession")

# One row per (person, profession): explode the profession array, relabel the
# "\N" placeholder as "Other" to match the Profession table, look up the
# professionId by profession name, then drop the name itself.
df_person_primary_professions = (
    df.select("personId", explode(col("primaryProfession")).alias("profession"))
    .withColumn(
        "profession",
        when(col("profession") == "\\N", "Other").otherwise(col("profession")),
    )
    .join(df_professions, "profession", "left")
    .drop("profession")
)
(
    df_person_primary_professions.write
    .format("delta")
    .mode("overwrite")
    .insertInto("imdb_dev.silver.PersonPrimaryProfession")
)

In [0]:
# One row per (person, known-for title): explode the knownForTitles array.
# The bronze data marks missing values with the literal "\N" (the same
# placeholder already handled for primaryName and professions in this
# notebook); those entries are not real title ids, so they are removed rather
# than written to the participation table.
df_known_for_titles = (
    df.select("personId", explode("knownForTitles").alias("titleId"))
    .filter(col("titleId") != "\\N")
)
df_known_for_titles.write.format("delta").mode("overwrite").insertInto("imdb_dev.silver.KnownForParticipation")

In [0]:
# NOTE (original author): 95.5 % of birth years and 98.3 % of death years are
# missing. Normally such columns should be dropped; here they are filled from
# a normal distribution of the existing values, for research purposes.
df_person = (
    # df already exposes the identifier as personId, so only the name needs renaming.
    df.withColumnRenamed("primaryName", "personName")
    # Keep known birth years; draw a random one where the value is missing.
    .withColumn(
        "birthYear",
        when(col("birthYear").isNotNull(), col("birthYear"))
        .otherwise(random_birth_year_udf(lit(mean_birth), lit(std_birth))),
    )
    # Keep known death years; otherwise add a random lifespan to the
    # (possibly imputed) birth year.
    .withColumn(
        "deathYear",
        when(col("deathYear").isNotNull(), col("deathYear"))
        .otherwise(col("birthYear") + random_lifespan_udf(lit(mean_lifespan), lit(std_lifespan))),
    )
    # Any death year later than 2024 is set back to null.
    .withColumn(
        "deathYear",
        when(col("deathYear") > 2024, None).otherwise(col("deathYear")),
    )
    .drop("primaryProfession", "knownForTitles")
)
(
    df_person.write
    .format("delta")
    .mode("overwrite")
    .insertInto("imdb_dev.silver.Person")
)