In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import coalesce
from datetime import datetime

# Resolve the command-line arguments handed to the script; JOB_NAME is the
# only one this job asks for.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])

# Create the Spark context, wrap it in a Glue context and take the Spark
# session from it; `spark` is what the reads further down use.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Initialise the Glue job with its name and the resolved arguments.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Run date (YYYY-MM-DD), read once so that the three output tables of this
# run all land in the same dated folder.
current_date = datetime.now().date().isoformat()

# Destination folders in the Refined layer, one per table of the model.
refined_base = "s3://data-lake-mateus/Refined/PARQUET"
dim_filme_path = f"{refined_base}/dim_filme/{current_date}/"
dim_artista_path = f"{refined_base}/dim_artista/{current_date}/"
fato_filme_path = f"{refined_base}/fato_filme/{current_date}/"


# Trusted-layer inputs. The snapshot dates in the paths are pinned, so the job
# always reads these exact folders regardless of the day it runs.
# No reader options are passed: Parquet files carry their own schema, and
# "header" is a CSV option that has no meaning for the Parquet source.
df_tmdb = spark.read.parquet("s3://data-lake-mateus/Trusted/TMDB/PARQUET/2025-01-07/")
df_imdb = spark.read.parquet("s3://data-lake-mateus/Trusted/Local/PARQUET/Movies/2025-01-05/")


# Match IMDB rows to TMDB rows on title (tituloOriginal vs. titulo) and on
# release year (anoLancamento vs. ano_lancamento).
same_title = df_imdb["tituloOriginal"] == df_tmdb["titulo"]
same_year = df_imdb["anoLancamento"] == df_tmdb["ano_lancamento"]

# Right outer join: every TMDB row is kept; the IMDB columns are null for
# TMDB rows that found no match.
tabelao = df_imdb.join(df_tmdb, on=same_title & same_year, how="right_outer")

# Attributes present in both sources: take the IMDB value and fall back to
# the TMDB one when the IMDB side is null.
shared_columns = [
    coalesce(df_imdb["id"], df_tmdb["id"]).alias("id_filme"),
    coalesce(df_imdb["tituloOriginal"], df_tmdb["titulo"]).alias("tituloFilme"),
    coalesce(df_imdb["anoLancamento"], df_tmdb["ano_lancamento"]).alias("anoDeLancamento"),
    coalesce(df_imdb["genero"], df_tmdb["genero_nome"]).alias("generoFilme"),
]

# Attributes taken from a single source, in the order they appear in the
# output schema.
imdb_columns = [
    df_imdb[name]
    for name in (
        "notaMedia",
        "numeroVotos",
        "generoArtista",
        "nomeArtista",
        "personagem",
        "anoNascimento",
        "anoFalecimento",
    )
]
tmdb_columns = [df_tmdb[name] for name in ("estudio", "orcamento", "arrecadacao")]

# One flat table holding the unified film, artist and financial columns.
tabelao_unificado = tabelao.select(*shared_columns, *imdb_columns, *tmdb_columns)


# Keep only rows whose unified genre mentions "Animation", then reduce the
# result to a single row per (tituloFilme, anoDeLancamento).
# NOTE: dropDuplicates on a column subset keeps one arbitrary row per key, so
# the values of the remaining columns come from whichever row survives.
is_animation = tabelao_unificado["generoFilme"].contains("Animation")
tabelao_final = tabelao_unificado.filter(is_animation).dropDuplicates(
    ["tituloFilme", "anoDeLancamento"]
)




# Film dimension: identifier plus the descriptive attributes of each film.
dim_filme_columns = ["id_filme", "tituloFilme", "generoFilme", "estudio"]
dim_filme = tabelao_final.select(*dim_filme_columns).distinct()

# Replace whatever already exists under today's folder for this table.
dim_filme.write.parquet(dim_filme_path, mode="overwrite")


# Artist dimension: the distinct (film, artist, character) combinations of
# the animation films, together with the artist's gender and birth/death years.
#
# This is deliberately built from tabelao_unificado and not from
# tabelao_final: tabelao_final has already been reduced to one row per
# (tituloFilme, anoDeLancamento), so it retains at most one arbitrarily
# chosen artist per film and would silently drop the rest of the cast.
# The genre predicate below is the same one used to build tabelao_final.
dim_artista = (
    tabelao_unificado
    .filter(tabelao_unificado["generoFilme"].contains("Animation"))
    .select(
        "id_filme",
        "nomeArtista",
        "personagem",
        "generoArtista",
        "anoNascimento",
        "anoFalecimento",
    )
    .distinct()
)

# Replace whatever already exists under today's folder for this table.
dim_artista.write.mode("overwrite").parquet(dim_artista_path)


# Fact table: the film key, its release year and the numeric measures
# (rating, vote count, budget and revenue).
fato_columns = [
    "id_filme",
    "anoDeLancamento",
    "notaMedia",
    "numeroVotos",
    "orcamento",
    "arrecadacao",
]
tabela_fato = tabelao_final.select(*fato_columns).distinct()

# Replace whatever already exists under today's folder for this table.
tabela_fato.write.parquet(fato_filme_path, mode="overwrite")



# Final step of the script: commit the Glue job once all three tables have
# been written (per the AWS Glue Job API, commit is also what persists
# job-bookmark state when bookmarks are enabled).
job.commit()