In [0]:
# ========================================
# STORAGE CONFIGURATION WITH MANAGED IDENTITY
# ========================================
# Configures the Spark session to authenticate against the ADLS Gen2 account
# through OAuth using the MSI token provider, then derives the base paths of
# the silver (input) and gold (output) layers.

storage_account = "adbdatalake01111"

# Every setting and path below shares the same account endpoint host.
account_host = f"{storage_account}.dfs.core.windows.net"

for conf_key, conf_value in (
    (f"fs.azure.account.auth.type.{account_host}", "OAuth"),
    (
        f"fs.azure.account.oauth.provider.type.{account_host}",
        "org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider",
    ),
):
    spark.conf.set(conf_key, conf_value)

# NOTE(review): the output container is named "golden" while the variable is
# "gold_path" — confirm the container name is intentional.
silver_path = f"abfss://silver@{account_host}/movies"
gold_path = f"abfss://golden@{account_host}/movies"


In [0]:
# Load the movies dataset from the silver layer (Parquet).
silver_movies_source = f"{silver_path}/silver_movies"
silver_movies_df = spark.read.parquet(silver_movies_source)


In [0]:
from pyspark.sql.functions import to_date, year

# Derive the release year from the "release_date" column; the date is parsed
# with to_date's default format (no explicit pattern is passed).
release_year_col = year(to_date("release_date"))

# Re-running this cell is safe: withColumn replaces an existing column of the
# same name rather than appending a duplicate.
silver_movies_df = silver_movies_df.withColumn("release_year", release_year_col)


In [0]:
# `count` and `avg` imported here are also used by later aggregation cells.
from pyspark.sql.functions import avg, count

# Global KPIs computed over the whole silver dataset (no grouping):
# number of rows, mean rating and mean runtime.
kpi_aggregates = [
    count("*").alias("total_movies"),
    avg("rating").alias("avg_rating"),
    avg("runtime").alias("avg_runtime"),
]

movies_kpi_df = silver_movies_df.agg(*kpi_aggregates)


In [0]:
# `avg` and `count` are imported here as well so this cell does not depend on
# an earlier cell's import having been executed.
from pyspark.sql.functions import avg, count, explode, split

# Per-genre statistics: the comma-separated "genres" string is split and
# exploded so that each movie contributes one row per genre it belongs to.
# The aggregation must group by the exploded "genre" column; grouping by the
# original "genres" column would instead aggregate per genre *combination*
# and inflate each combination's count by the number of genres it contains.
# NOTE(review): assumes genres are separated by a bare "," with no surrounding
# whitespace — if values look like "Action, Comedy", trim each genre first.
movies_by_genre_df = (
    silver_movies_df
    .withColumn("genre", explode(split("genres", ",")))
    .groupBy("genre")
    .agg(
        count("*").alias("movies_count"),
        avg("rating").alias("avg_rating")
    )
    .orderBy("movies_count", ascending=False)
)


In [0]:
# Top 20 movies by rating, ignoring rows that have no rating.
# NOTE(review): no tie-breaker is specified, so which movies make the cut
# among equal ratings at the boundary is not guaranteed to be stable.
rated_movies_df = silver_movies_df.where(silver_movies_df["rating"].isNotNull())

top_rated_movies_df = (
    rated_movies_df
    .orderBy(rated_movies_df["rating"].desc())
    .limit(20)
)


In [0]:
# Per-year statistics: number of movies and mean rating for each release year,
# sorted chronologically. Rows without a release year are excluded.
movies_with_year_df = silver_movies_df.where(
    silver_movies_df["release_year"].isNotNull()
)

yearly_stats_df = movies_with_year_df.groupBy("release_year").agg(
    count("*").alias("movies_count"),
    avg("rating").alias("avg_rating"),
)

movies_by_year_df = yearly_stats_df.orderBy("release_year")


In [0]:
# Persist every gold-layer dataset as Parquet under gold_path, replacing any
# previous output. Each pair is (DataFrame, target folder name); the writes
# run in the listed order.
gold_outputs = (
    (movies_kpi_df, "movies_kpi"),
    (movies_by_genre_df, "movies_by_genre"),
    (top_rated_movies_df, "top_rated_movies"),
    (movies_by_year_df, "movies_by_year"),
)

for gold_df, gold_folder in gold_outputs:
    gold_df.write.mode("overwrite").parquet(f"{gold_path}/{gold_folder}")
