In [2]:
import os
import socket

from pyspark.sql import SparkSession

# Resolve this machine's hostname and IP address; both are handed to the
# Spark driver configuration below.
spark_hostname = socket.gethostname()
spark_ip_address = socket.gethostbyname(spark_hostname)

# MinIO reuses the same IP because both services run on the same server.
minio_ip_address = spark_ip_address

print(f"SPARK: {spark_hostname} - {spark_ip_address}")
print(f"MINIO: {minio_ip_address}")

# MinIO credentials are taken from the environment when available so that
# real secrets never have to be written into the notebook. The fallbacks are
# the values this notebook originally hard-coded, so behaviour is unchanged
# when the variables are not set.
# NOTE(review): the fallbacks look like local-development defaults; do not
# rely on them outside a throwaway environment.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")

# Maven coordinates resolved by Spark at session start-up (Delta Lake plus the
# AWS SDK / hadoop-aws jars needed for the s3a:// filesystem).
extra_packages = ",".join(
    (
        "io.delta:delta-core_2.12:2.1.1",
        "com.amazonaws:aws-java-sdk-bundle:1.12.469",
        "org.apache.hadoop:hadoop-aws:3.3.4",
    )
)

spark = (
    SparkSession.builder
    .config("spark.jars.packages", extra_packages)

    # Enable the Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

    # Cap the cores requested by the whole application and the memory given
    # to each executor.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")

    # Driver networking: bind on the hostname, advertise the resolved IP.
    .config("spark.driver.bindAddress", spark_hostname)
    .config("spark.driver.host", spark_ip_address)

    # S3A settings for talking to MinIO: plain HTTP endpoint on port 9000
    # with path-style bucket addressing.
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{minio_ip_address}:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

    # Create (or reuse) the Spark session. The master URL is left disabled;
    # uncomment to attach to a standalone master on this host.
    # .master(f"spark://{spark_ip_address}:7077")
    .appName("rascunho")
    .getOrCreate()
)

# hadoop_base_path = os.getenv("HADOOP_HOME").replace("\\", "/")
# hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
# hadoop_config.set("driver.extraClassPath", f"{hadoop_base_path}/lib/native/hadoop-aws-3.3.1.jar:{hadoop_base_path}/lib/native/aws-java-sdk-1.12.153")

# for key, value in [(k.replace("spark.hadoop.", ""), v) for k, v in spark.sparkContext.getConf().getAll() if k.find("hadoop") != -1]:
#     hadoop_config.set(key, value)

SPARK: note_rns - 192.168.18.118
MINIO: 192.168.18.118


In [3]:
# Load the states CSV with an explicit schema; the first line of the file is
# treated as a header row.
estados = (
    spark.read
    .schema("codigo_uf LONG, uf STRING, nome STRING, latitude FLOAT, longitude FLOAT, regiao STRING")
    .option("header", True)
    .csv("s3a://datalake/raw/municipios_brasileiros/estados.csv")
)

estados.printSchema()

root
 |-- codigo_uf: long (nullable = true)
 |-- uf: string (nullable = true)
 |-- nome: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- regiao: string (nullable = true)



In [None]:
# Preview the states frame (default row limit) without truncating cell values.
estados.show(n=20, truncate=False)

In [4]:
# Load the municipalities CSV with an explicit schema; the first line of the
# file is treated as a header row.
municipios = (
    spark.read
    .schema("codigo_ibge LONG, nome STRING, latitude FLOAT, longitude FLOAT, capital INT, codigo_uf LONG")
    # Alternative, wider schema kept for reference:
    # .schema("codigo_ibge LONG, nome STRING, latitude FLOAT, longitude FLOAT, capital INT, codigo_uf LONG, siafi_id LONG, ddd LONG, fuso_horario STRING")
    .option("header", True)
    .csv("s3a://datalake/raw/municipios_brasileiros/municipios.csv")
)

municipios.printSchema()

root
 |-- codigo_ibge: long (nullable = true)
 |-- nome: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- capital: integer (nullable = true)
 |-- codigo_uf: long (nullable = true)



In [None]:
# Preview the municipalities frame (default row limit) without truncating cell values.
municipios.show(n=20, truncate=False)

In [12]:
# Attach each municipality ("m") to its state ("e") via codigo_uf, then keep
# only the columns of interest:
#   - e_capital: boolean derived from m.capital == 1
#   - nome_estado: the state's name (renamed to avoid clashing with m.nome)
#   - gerado_em: timestamp taken from current_timestamp()
# Duplicate rows are removed at the end.
df = (
    municipios.alias("m")
    .join(estados.alias("e"), municipios.codigo_uf == estados.codigo_uf, "inner")
    .selectExpr(
        "m.codigo_ibge",
        "m.nome",
        "m.latitude",
        "m.longitude",
        "(m.capital == 1) AS e_capital",
        "e.uf",
        "e.nome AS nome_estado",
        "e.regiao",
        "current_timestamp() AS gerado_em",
    )
    .distinct()
)

df.show(truncate=False)

+-----------+-----------------------+--------+---------+---------+---+------------------+------------+-----------------------+
|codigo_ibge|nome                   |latitude|longitude|e_capital|uf |nome_estado       |regiao      |gerado_em              |
+-----------+-----------------------+--------+---------+---------+---+------------------+------------+-----------------------+
|3102902    |Antônio Carlos         |-21.321 |-43.7451 |false    |MG |Minas Gerais      |Sudeste     |2023-05-28 18:22:23.337|
|4301057    |Arroio do Sal          |-29.5439|-49.8895 |false    |RS |Rio Grande do Sul |Sul         |2023-05-28 18:22:23.337|
|4302220    |Boa Vista do Cadeado   |-28.5791|-53.8108 |false    |RS |Rio Grande do Sul |Sul         |2023-05-28 18:22:23.337|
|2201960    |Brasileira             |-4.1337 |-41.7859 |false    |PI |Piauí             |Nordeste    |2023-05-28 18:22:23.337|
|5203939    |Buriti de Goiás        |-16.1792|-50.4302 |false    |GO |Goiás             |Centro-Oeste|2023-05-2

In [14]:
# df.repartition(1).write.parquet("s3a://datalake/trusted/municipios_brasileiros")
# Persist the joined dataset to the trusted path in Delta format, overwriting
# any existing data and allowing the stored schema to be replaced.
df.write.save(
    "s3a://datalake/trusted/municipios_brasileiros",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# SALVE REFINED

In [15]:
# Read the Delta table back from the trusted path.
df = spark.read.format("delta").load("s3a://datalake/trusted/municipios_brasileiros")

# Write it as-is to the refined path in Delta format, overwriting any existing
# data and allowing the stored schema to be replaced.
df.write.save(
    "s3a://datalake/refined/municipios_brasileiros",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)