In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, LongType
from pyspark.sql.functions import format_number, col, explode
from datetime import datetime

# Glue job bootstrap: resolve the job arguments, create the Spark/Glue
# contexts, and initialise the Job object used for the final commit.
# @params: [JOB_NAME]
# Only JOB_NAME is requested from the command-line arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
# SparkSession exposed by the Glue context; used below for spark.read.
spark = glueContext.spark_session
job = Job(glueContext)
# Initialise the job with its name and the full resolved-argument mapping.
job.init(args['JOB_NAME'], args)


# Explicit schema applied when reading the raw TMDB movie JSON.
# Every field is declared nullable; "genero" is an array of strings.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("titulo", StringType(), True)
    .add("ano_lancamento", StringType(), True)
    .add("estudio", StringType(), True)
    .add("genero", ArrayType(StringType()), True)
    .add("diretores", StringType(), True)
    .add("orcamento", LongType(), True)
    .add("arrecadacao", LongType(), True)
)

# Date of this run as YYYY-MM-DD, taken from the host's local clock
# (no timezone is attached).
current_date = datetime.now().date().isoformat()

# S3 prefix holding the raw TMDB JSON files (note the trailing slash).
RAW_ZONE_PATH = "s3://data-lake-mateus/Raw/TMDB/JSON/2024/12/15/"
# S3 prefix under which the Parquet output is written (no trailing slash).
TRUSTED_ZONE_PATH = "s3://data-lake-mateus/Trusted/TMDB/PARQUET"

def process_data(schema, raw_zone_path=None, trusted_zone_path=None, partition_date=None):
    """Convert the raw TMDB JSON files into Parquet in the trusted zone.

    Reads every ``*.json`` file under the raw prefix with the given schema,
    produces one row per (movie, genre), formats the monetary columns as
    strings, and overwrites the dated output folder with Parquet files.

    Args:
        schema: Spark ``StructType`` applied when reading the JSON files.
        raw_zone_path: Prefix containing the input JSON files. Defaults to
            the module-level ``RAW_ZONE_PATH``.
        trusted_zone_path: Prefix under which the dated output folder is
            created. Defaults to the module-level ``TRUSTED_ZONE_PATH``.
        partition_date: Name of the output sub-folder. Defaults to the
            module-level ``current_date``.

    Returns:
        None. The result is written to storage as a side effect.
    """
    # Fall back to the module-level configuration when no override is given.
    raw_zone = RAW_ZONE_PATH if raw_zone_path is None else raw_zone_path
    trusted_zone = TRUSTED_ZONE_PATH if trusted_zone_path is None else trusted_zone_path
    run_date = current_date if partition_date is None else partition_date

    # Strip trailing slashes before joining so that a prefix works whether or
    # not it ends in "/" (the two module constants differ in this respect).
    raw_path = f"{raw_zone.rstrip('/')}/*.json"
    trusted_path = f"{trusted_zone.rstrip('/')}/{run_date}/"

    # "multiline" allows a single JSON document to span several lines.
    movies = spark.read.option("multiline", "true").schema(schema).json(raw_path)

    # One output row per genre of each movie.
    # NOTE(review): explode() emits no row for a movie whose "genero" array is
    # null or empty, so such movies are absent from the output. Switch to
    # explode_outer() if they must be kept — confirm with downstream users.
    per_genre = movies.withColumn("genero_nome", explode(col("genero"))).drop("genero")

    # format_number(..., 0) returns a *string* with thousands separators and
    # no decimals, so both columns are stored as text in the Parquet output.
    formatted = (
        per_genre
        .withColumn("orcamento", format_number(col("orcamento"), 0))
        .withColumn("arrecadacao", format_number(col("arrecadacao"), 0))
    )

    column_order = [
        "id",
        "titulo",
        "ano_lancamento",
        "estudio",
        "diretores",
        "genero_nome",
        "orcamento",
        "arrecadacao",
    ]

    # "overwrite" replaces whatever already exists under the dated folder.
    formatted.select(*column_order).write.mode("overwrite").format("parquet").save(trusted_path)

# Run the transformation using the module-level schema.
process_data(schema)

# Commit the Glue job once processing has finished without raising.
job.commit()
