In [1]:
from tqdm import tqdm
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, date_format, to_date, create_map, lit, when, lower, regexp_replace, substring, concat_ws
from itertools import chain
import os
from dotenv import load_dotenv
load_dotenv()

from datetime import datetime
def log(msg):
    """Write ``msg`` via ``tqdm.write`` prefixed with the current local timestamp.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] <msg>``.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = "[" + stamp + "] " + f"{msg}"
    tqdm.write(line)

In [2]:
# 📌 Bibliotecas Padrão do Python
import logging
import warnings
import itertools
from datetime import datetime
from typing import Literal

# 📌 Manipulação de Dados
import pandas as pd
import numpy as np

# 📌 Visualização
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime



# 📌 Barras de Progresso
from tqdm import tqdm

# 📌 Configurações Globais para Melhor Exibição dos Dados
# Silence user-level warnings and notices about upcoming API changes for the session.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

# Render floats with two fixed decimals instead of scientific notation.
pd.set_option("display.float_format", "{:.2f}".format)
# Upper bounds on how many columns / rows pandas displays.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 65

# Use plotly for DataFrame.plot and centre-justify column headers.
pd.set_option("plotting.backend", "plotly")
pd.set_option("display.colheader_justify", "center")

In [5]:
import os

from pyspark.sql import SparkSession

# Warehouse roots backing the two Iceberg catalogs ("raw" and "bronze").
# Each can be overridden through an environment variable (e.g. from the .env file
# loaded at the top of the notebook); when the variable is unset or empty, the
# original local path is used, so default behaviour is unchanged.
raw_warehouse_path = (
    os.environ.get("CAGED_RAW_WAREHOUSE")
    or "/home/pedromurta/projects/observatorio/caged/data/observatorio_caged/raw/"
)
bronze_warehouse_path = (
    os.environ.get("CAGED_BRONZE_WAREHOUSE")
    or "/home/pedromurta/projects/observatorio/caged/data/observatorio_caged/bronze/"
)

# NOTE: getOrCreate() returns the already-running session when one exists. In that
# case Spark warns that only runtime SQL configurations take effect, so the memory
# settings below are only guaranteed to apply on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("Salvar CAGED Iceberg")
    # Iceberg catalog "raw" (Hadoop-type, filesystem warehouse)
    .config("spark.sql.catalog.raw", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.raw.type", "hadoop")
    .config("spark.sql.catalog.raw.warehouse", raw_warehouse_path)
    # Iceberg catalog "bronze" (Hadoop-type, filesystem warehouse)
    .config("spark.sql.catalog.bronze", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.bronze.type", "hadoop")
    .config("spark.sql.catalog.bronze.warehouse", bronze_warehouse_path)
    # Shuffle parallelism and memory sizing
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.memory.fraction", "0.6")
    .config("spark.memory.storageFraction", "0.3")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memoryOverhead", "2g")
    .config("spark.executor.memoryOverhead", "2g")
    .getOrCreate()
)


25/05/12 11:07:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [9]:
# Read the two raw Parquet datasets ("mov" and "for") from under the raw warehouse path.
df_mov, df_for = (
    spark.read.parquet(f"{raw_warehouse_path}/parquet/{dataset}/*")
    for dataset in ("mov", "for")
)

In [10]:
# Print the schema of df_mov (column names, types, nullability) as read from Parquet.
df_mov.printSchema()

root
 |-- competencia: integer (nullable = true)
 |-- uf: integer (nullable = true)
 |-- municipio: integer (nullable = true)
 |-- secao: string (nullable = true)
 |-- sub_classe: integer (nullable = true)
 |-- saldo_movimentacao: integer (nullable = true)
 |-- codigo_cbo: integer (nullable = true)
 |-- escolaridade: integer (nullable = true)
 |-- idade: integer (nullable = true)
 |-- etnia: integer (nullable = true)
 |-- sexo: integer (nullable = true)
 |-- salario: double (nullable = true)
 |-- salario_fixo: double (nullable = true)



In [11]:
# Print the schema of df_for (column names, types, nullability) as read from Parquet.
df_for.printSchema()

root
 |-- competencia: integer (nullable = true)
 |-- uf: integer (nullable = true)
 |-- municipio: integer (nullable = true)
 |-- secao: string (nullable = true)
 |-- sub_classe: integer (nullable = true)
 |-- saldo_movimentacao: integer (nullable = true)
 |-- codigo_cbo: integer (nullable = true)
 |-- escolaridade: integer (nullable = true)
 |-- idade: integer (nullable = true)
 |-- etnia: integer (nullable = true)
 |-- sexo: integer (nullable = true)
 |-- salario: double (nullable = true)
 |-- salario_fixo: double (nullable = true)



In [12]:
# Union: stack df_mov and df_for into a single DataFrame, matching columns by name.
df = df_mov.unionByName(df_for)

In [13]:
# Preview the first 5 rows of the combined DataFrame without truncating cell values.
df.show(5, truncate=False)

+-----------+---+---------+-----+----------+------------------+----------+------------+-----+-----+----+-------+------------+
|competencia|uf |municipio|secao|sub_classe|saldo_movimentacao|codigo_cbo|escolaridade|idade|etnia|sexo|salario|salario_fixo|
+-----------+---+---------+-----+----------+------------------+----------+------------+-----+-----+----+-------+------------+
|202502     |41 |410840   |M    |7112000   |-1                |715315    |7           |33   |1    |1   |2752.2 |2752.2      |
|202502     |29 |290320   |G    |4781400   |1                 |521110    |7           |22   |3    |3   |1518.0 |1518.0      |
|202502     |26 |261160   |G    |4781400   |1                 |763015    |7           |55   |1    |3   |2500.0 |2500.0      |
|202502     |33 |330455   |I    |5620102   |1                 |513405    |7           |46   |3    |1   |0.0    |0.0         |
|202502     |35 |350920   |M    |7119701   |1                 |312320    |7           |29   |3    |1   |1800.0 |1800.0

In [14]:
# Report the row count of each source frame and of their union, one entry per line,
# with a dashed rule between consecutive entries.
for label, frame in (("mov", df_mov), ("for", df_for), ("total", df)):
    if label != "mov":
        print('----------------')
    print(f'{label} = {frame.count()}')

                                                                                

mov = 215005373
----------------
for = 7726496
----------------
total = 222731869


                                                                                

In [21]:
# Drop the RAW-layer `caged_mov` table if it already exists, so it can be rebuilt.
catalog_name = "raw"
schema_name = "default"
table_name_mov = "caged_mov"

# Build the identifier from the variables above rather than repeating it by hand,
# so the statement cannot drift from the names used by the write cells.
spark.sql(f"DROP TABLE IF EXISTS {catalog_name}.{schema_name}.{table_name_mov}")

DataFrame[]

In [22]:
# Write df_mov to the RAW layer as an Iceberg table partitioned by `competencia`.
catalog_name = "raw"
schema_name = "default"
table_name_mov = "caged_mov"

# createOrReplace() instead of create(): create() raises when the table already
# exists, which made this cell depend on a separate DROP TABLE cell having been run
# first. Replacing keeps the cell re-runnable on its own.
(
    df_mov.writeTo(f"{catalog_name}.{schema_name}.{table_name_mov}")
    .partitionedBy("competencia")
    .createOrReplace()
)


                                                                                

In [23]:
# Write df_for to the RAW layer as an Iceberg table partitioned by `competencia`.
# Relies on `catalog_name` and `schema_name` set by the preceding RAW-layer cell.
table_name_for = "caged_for"

# createOrReplace() instead of create(): nothing drops this table beforehand, so
# create() would raise on every re-run once the table exists.
(
    df_for.writeTo(f"{catalog_name}.{schema_name}.{table_name_for}")
    .partitionedBy("competencia")
    .createOrReplace()
)


                                                                                

In [24]:
# Write the combined DataFrame (`df`, the union of df_mov and df_for) to the BRONZE
# layer as an Iceberg table partitioned by `competencia`.
catalog_bronze = "bronze"
schema_name_bronze = "default"
table_name_bronze = "caged"

# createOrReplace() instead of create(): nothing drops this table beforehand, so
# create() would raise on every re-run once the table exists.
(
    df.writeTo(f"{catalog_bronze}.{schema_name_bronze}.{table_name_bronze}")
    .partitionedBy("competencia")
    .createOrReplace()
)

                                                                                

In [25]:
# Stop the Spark session once the table writes above are done.
spark.stop()