In [1]:
import socket, os
from pyspark.sql import SparkSession


# Resolve the local machine's name and address; the Spark master and driver
# are both addressed through them below.
spark_hostname = socket.gethostname()
spark_ip_address = socket.gethostbyname(spark_hostname)

# MinIO uses the same IP because both services run on the same server.
minio_ip_address = spark_ip_address

# Credentials are taken from the environment when set, so real secrets never
# have to be committed with the notebook. The fallbacks are the original
# local-development values, which keeps the previous behaviour unchanged.
minio_access_key = os.getenv("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.getenv("MINIO_SECRET_KEY", "minio123")

print(f"SPARK: {spark_hostname} - {spark_ip_address}")
print(f"MINIO: {minio_ip_address}")

spark = (
    SparkSession.builder
    # Delta Lake plus the AWS SDK / hadoop-aws jars needed by the s3a:// filesystem.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.1,com.amazonaws:aws-java-sdk-bundle:1.12.469,org.apache.hadoop:hadoop-aws:3.3.4")

    # Enable the Delta SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

    # Resource limits: total cores requested by this application and memory per executor.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")

    # Driver networking: bind locally by hostname, advertise the IP to the cluster.
    .config("spark.driver.bindAddress", spark_hostname)
    .config("spark.driver.host", spark_ip_address)

    # S3A filesystem pointed at the MinIO endpoint (plain HTTP, path-style buckets).
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{minio_ip_address}:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

    # Create (or reuse) the Spark session against the standalone master.
    .master(f"spark://{spark_ip_address}:7077")
    .appName("rascunho")
    .getOrCreate()
)

# hadoop_base_path = os.getenv("HADOOP_HOME").replace("\\", "/")
# hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
# hadoop_config.set("driver.extraClassPath", f"{hadoop_base_path}/lib/native/hadoop-aws-3.3.1.jar:{hadoop_base_path}/lib/native/aws-java-sdk-1.12.153")

# for key, value in [(k.replace("spark.hadoop.", ""), v) for k, v in spark.sparkContext.getConf().getAll() if k.find("hadoop") != -1]:
#     hadoop_config.set(key, value)

SPARK: note_rns - 192.168.18.118
MINIO: 192.168.18.118


In [2]:
# Read the refined Delta table of Brazilian municipalities from the data lake.
df = (
    spark.read
    .format("delta")
    .load("s3a://datalake/refined/municipios_brasileiros")
)

# Print the column names and types of the loaded table.
df.printSchema()

root
 |-- codigo_ibge: long (nullable = true)
 |-- nome: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- e_capital: boolean (nullable = true)
 |-- uf: string (nullable = true)
 |-- nome_estado: string (nullable = true)
 |-- regiao: string (nullable = true)
 |-- gerado_em: timestamp (nullable = true)



In [4]:
# Preview the first 20 rows (the default) without truncating long column values.
df.show(n=20, truncate=False)

+-----------+--------------------+--------+---------+---------+---+------------------+------------+-----------------------+
|codigo_ibge|nome                |latitude|longitude|e_capital|uf |nome_estado       |regiao      |gerado_em              |
+-----------+--------------------+--------+---------+---------+---+------------------+------------+-----------------------+
|3501509    |Alvinlândia         |-22.4435|-49.7623 |false    |SP |São Paulo         |Sudeste     |2023-05-31 21:59:40.099|
|4300703    |Anta Gorda          |-28.9698|-52.0102 |false    |RS |Rio Grande do Sul |Sul         |2023-05-31 21:59:40.099|
|3300407    |Barra Mansa         |-22.5481|-44.1752 |false    |RJ |Rio de Janeiro    |Sudeste     |2023-05-31 21:59:40.099|
|4202800    |Braço do Norte      |-28.2681|-49.1701 |false    |SC |Santa Catarina    |Sul         |2023-05-31 21:59:40.099|
|5203939    |Buriti de Goiás     |-16.1792|-50.4302 |false    |GO |Goiás             |Centro-Oeste|2023-05-31 21:59:40.099|
|4302907

In [3]:
# Row count per value of the "regiao" column, printed without truncation.
(
    df
    .groupBy("regiao")
    .count()
    .show(truncate=False)
)

+------------+-----+
|regiao      |count|
+------------+-----+
|Nordeste    |1794 |
|Sul         |1191 |
|Sudeste     |1668 |
|Centro-Oeste|467  |
|Norte       |450  |
+------------+-----+



In [8]:
import json
import boto3 
import requests
from uuid import uuid4
from datetime import datetime

# MinIO client for the S3-compatible endpoint. Credentials are read from the
# environment when set; the fallbacks are the original local-development values.
# The client is only needed by the (currently disabled) upload block below.
s3 = boto3.resource(
    's3',
    aws_access_key_id=os.getenv("MINIO_ACCESS_KEY", "minio"),
    aws_secret_access_key=os.getenv("MINIO_SECRET_KEY", "minio123"),
    endpoint_url=f"http://{minio_ip_address}:9000",
    use_ssl=False
)

# Local output layout: one folder of raw payloads per region, plus a folder of
# small marker files recording which municipalities were already saved.
output_root = "C:/Users/Ronildo/Downloads/clima_municipios_brasileiros"
processed_dir = f"{output_root}/processados"
os.makedirs(processed_dir, exist_ok=True)

# (connect, read) timeouts in seconds. Without a timeout a single stalled
# connection would block the whole loop indefinitely.
request_timeout = (10, 300)

for row in df.filter("NOT e_capital").orderBy("regiao", "codigo_ibge").collect():
    # Hourly ERA5 history for the municipality's coordinates, 2000 through 2022.
    resp = requests.get(
        "https://archive-api.open-meteo.com/v1/era5",
        params={
            "latitude": row.latitude,
            "longitude": row.longitude,
            "start_date": "2000-01-01",
            "end_date": "2022-12-31",
            "hourly": "temperature_2m,relativehumidity_2m,windspeed_10m",
            "timezone": "America/Sao_Paulo",
        },
        timeout=request_timeout,
    )
    resp.raise_for_status()
    payload = resp.json()

    # Alternative sink, kept disabled: upload the payload straight to MinIO.
    # (
    #     s3.Object('datalake', f'raw/clima_municipios_brasileiros/2000_2022/{row.regiao}/{datetime.today().strftime("%Y%m%d%H%M%S")}_{uuid4()}.json')
    #     .put(
    #         Body=(bytes(json.dumps({ "responses": resp.json() }).encode('UTF-8')))
    #     )
    # )

    stamp = datetime.today().strftime("%Y%m%d%H%M%S")
    region_dir = f"{output_root}/2000_2022/{row.regiao}"
    os.makedirs(region_dir, exist_ok=True)

    # Write the payload first so that a marker is never left behind for data
    # that failed to reach the disk.
    with open(f'{region_dir}/{row.codigo_ibge}_{stamp}_{uuid4()}.json', "w") as outfile:
        json.dump(payload, outfile)

    with open(f'{processed_dir}/{stamp}_{uuid4()}.json', "w") as file:
        json.dump({ "codigo_ibge": row.codigo_ibge }, file)

    print("salvo")

salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salvo
salv

In [10]:
import gzip
import glob
import shutil

# Everything currently stored in the Nordeste download folder.
files = glob.glob("C:/Users/Ronildo/Downloads/clima_municipios_brasileiros/2000_2022/Nordeste/*")

for file in files:
    # The archives are written next to their sources, so on a re-run the glob
    # also returns them; skip those instead of producing ".gz.gz" files.
    if file.endswith(".gz"):
        continue

    with open(file, 'rb') as f_in, \
        gzip.open(f'{file}.gz', 'wb') as f_out:
        # Stream in fixed-size chunks. Iterating the binary file "by line"
        # would pull a whole single-line JSON dump into memory at once.
        shutil.copyfileobj(f_in, f_out)