### Leer todos los datos que son requeridos

In [0]:
# Declare the "p_file_date" text widget (default "2024-12-30") and read its current
# value. v_file_date is used further down to filter the movies and
# productions_countries inputs and to populate the created_date column.
dbutils.widgets.text("p_file_date", "2024-12-30")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.functions import col

# Load the Delta data under {silver_folder_path}/movies and keep only the rows whose
# file_date equals the notebook parameter.
# The parameter is compared as a column expression instead of being interpolated
# into a SQL predicate string, so a value containing quotes cannot break or alter
# the filter.
# NOTE(review): silver_folder_path is presumably defined by ../includes/configuration.
movies_df = (
    spark.read.format("delta")
    .load(f"{silver_folder_path}/movies")
    .filter(col("file_date") == v_file_date)
)

In [0]:
# Load the Delta data under {silver_folder_path}/countries. Unlike the other inputs,
# no file_date filter is applied here.
country_df = (
    spark.read
    .format("delta")
    .load(f"{silver_folder_path}/countries")
)

In [0]:
from pyspark.sql.functions import col

# Load the Delta data under {silver_folder_path}/productions_countries and keep only
# the rows whose file_date equals the notebook parameter.
# The parameter is compared as a column expression instead of being interpolated
# into a SQL predicate string, so a value containing quotes cannot break or alter
# the filter.
production_country_df = (
    spark.read.format("delta")
    .load(f"{silver_folder_path}/productions_countries")
    .filter(col("file_date") == v_file_date)
)

### Join "country" y "production_country"

In [0]:
# Inner-join countries with production countries on country_id and keep only the
# country name and the movie id, which is all the later join against movies needs.
country_prod_con_df = (
    country_df
    .join(
        production_country_df,
        country_df["country_id"] == production_country_df["country_id"],
        "inner",
    )
    .select(
        country_df["country_name"],
        production_country_df["movie_id"],
    )
)

### Join "movies_df" y "country_prod_con_df"

- Filtrar las películas cuyo año de lanzamiento sea mayor o igual a 2015

In [0]:
# Keep only the movies whose year_release_date is 2015 or later.
movies_filter_df = movies_df.where("year_release_date >= 2015")

In [0]:
# Inner-join the filtered movies with the (country_name, movie_id) pairs on movie_id.
results_movies_country_df = (
    movies_filter_df
    .join(
        country_prod_con_df,
        movies_filter_df["movie_id"] == country_prod_con_df["movie_id"],
        "inner",
    )
)

In [0]:
# Project only the columns needed for the per-year, per-country aggregation.
results_df = results_movies_country_df.select(
    "year_release_date",
    "country_name",
    "budget",
    "revenue",
)

In [0]:
from pyspark.sql.functions import sum, desc, dense_rank

In [0]:
results_group_by_df = results_df \
    .groupBy("year_release_date", "country_name") \
    .agg(
        sum("budget").alias("total_budget"),
        sum("revenue").alias("total_revenue")
    )

In [0]:
from pyspark.sql.window import Window

In [0]:
# Window used for ranking: one partition per year_release_date, ordered by
# total_budget descending and then total_revenue descending.
# NOTE: despite the "_df" suffix, this is a window specification, not a DataFrame.
results_dense_rank_df = (
    Window
    .partitionBy("year_release_date")
    .orderBy(desc("total_budget"), desc("total_revenue"))
)

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Add the dense rank computed over the ranking window, and stamp every row with the
# notebook's file date as a literal created_date column.
final_df = (
    results_group_by_df
    .withColumn("rank", dense_rank().over(results_dense_rank_df))
    .withColumn("created_date", lit(v_file_date))
)

### Escribir datos en el Datalake en formato "Delta"

In [0]:
# Match target (tgt) and source (src) rows on year, country and created_date.
merge_condition = (
    "tgt.year_release_date = src.year_release_date AND "
    "tgt.country_name = src.country_name AND "
    "tgt.created_date = src.created_date"
)

# NOTE(review): merge_delta_lake is presumably defined in ../includes/common_functions;
# confirm its parameter order against that notebook.
merge_delta_lake(
    final_df,
    "movie_gold",
    "results_group_movie_country",
    gold_folder_path,
    merge_condition,
    "created_date",
)

In [0]:
%sql 
-- Sanity check: number of rows in the gold table per created_date.
SELECT created_date, COUNT(0) 
FROM movie_gold.results_group_movie_country
GROUP BY created_date;