In [1]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Obtain the Spark session for this job (reuses an active one if present).
spark = (
    SparkSession.builder
    .appName("ETL_02_Silver")
    .getOrCreate()
)

# --- 3. Transformation (Transform) ---
# Explicit schema applied when reading the Bronze JSON files.
# Every field is nullable; each entry below is (field name, Spark type class),
# and a fresh type instance is created per field.
schema = StructType([
    StructField(nome_campo, tipo_campo(), True)
    for nome_campo, tipo_campo in (
        ("Id", IntegerType),
        ("IndicatorCode", StringType),
        ("SpatialDimType", StringType),
        ("SpatialDim", StringType),
        ("ParentLocationCode", StringType),
        ("TimeDimType", StringType),
        ("ParentLocation", StringType),
        ("Dim1Type", StringType),
        ("TimeDim", IntegerType),
        ("Dim1", StringType),
        ("Dim2Type", StringType),
        ("Dim2", StringType),
        ("Dim3Type", StringType),
        ("Dim3", StringType),
        ("DataSourceDimType", StringType),
        ("DataSourceDim", StringType),
        ("Value", StringType),
        ("NumericValue", DoubleType),
        ("Low", DoubleType),
        ("High", DoubleType),
        ("Comments", StringType),
        ("Date", StringType),
        ("TimeDimensionValue", StringType),
        ("TimeDimensionBegin", StringType),
        ("TimeDimensionEnd", StringType),
    )
])

# Adapted function that reads from the HDFS Bronze layer
def processar_silver(
    nome_dataset,
    caminho_base="hdfs://namenode:9000/datalake/bronze",
):
    """Read one Bronze dataset and project it to the Silver columns.

    The dataset is read as JSON using the module-level ``schema`` and the
    module-level ``spark`` session.

    Args:
        nome_dataset: Name of the dataset directory under the Bronze root
            (for example ``"suicidio_b"``).
        caminho_base: Root URI of the Bronze layer. Defaults to the location
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        A DataFrame with four columns: ``Pais`` (from ``SpatialDim``),
        ``Ano`` (from ``TimeDim``), ``Sexo`` (from ``Dim1``) and ``Valor``
        (from ``NumericValue``).
    """
    # Tolerate a trailing slash on the root so the joined path has no "//".
    caminho_bronze = f"{caminho_base.rstrip('/')}/{nome_dataset}"

    # Read the Bronze JSON with the explicit schema (no schema inference).
    # NOTE(review): assumes the Bronze files are line-delimited JSON records
    # whose keys match `schema` -- confirm against the Bronze writer.
    df = spark.read.schema(schema).json(caminho_bronze)

    # Keep only the columns needed downstream, renamed for the Silver layer.
    return df.select(
        col("SpatialDim").alias("Pais"),
        col("TimeDim").alias("Ano"),
        col("Dim1").alias("Sexo"),
        col("NumericValue").alias("Valor"),
    )

# Processing and load (Silver layer)
# Datasets to process; each name is both the Bronze input directory and the
# Silver output directory.
DATASETS = (
    "suicidio_b",
    "suicidio_f",
    "suicidio_m",
    "depressao_b",
    "depressao_f",
    "depressao_m",
)

# HDFS root under which the treated data is saved.
CAMINHO_SILVER = "hdfs://namenode:9000/datalake/silver"

try:
    # Build every DataFrame before the first write, preserving the original
    # ordering (all reads, then all writes).
    dfs_silver = {nome: processar_silver(nome) for nome in DATASETS}

    # --- 4. Load ---
    print("Salvando dados tratados no HDFS (Silver)...")

    for nome, df_silver in dfs_silver.items():
        # "overwrite" replaces any previous output for this dataset.
        df_silver.write.mode("overwrite").parquet(f"{CAMINHO_SILVER}/{nome}")

    print("Dados Silver salvos com sucesso!")
finally:
    # Always release the Spark session, even when a read or write fails.
    spark.stop()

Salvando dados tratados no HDFS (Silver)...
Dados Silver salvos com sucesso!
