In [0]:
import requests
from io import BytesIO
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
import re

In [0]:
# Notebook parameter "token": a text widget with an empty default value.
# An empty value means "no token"; later cells then send the request
# without an Authorization header.
dbutils.widgets.text("token", "", "token")
# Strip surrounding whitespace/newlines picked up when the token is pasted, so
# a blank-looking value is treated as "no token" and the header value stays valid.
token = dbutils.widgets.get("token").strip()

In [0]:
# Obtain the Spark session: reuse the active one if present, otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [0]:
%sql
-- Remove the bronze table if it already exists; a later cell in this notebook
-- writes a table with this same name.
DROP TABLE IF EXISTS workspace.projeto_final_bronze.bronze_ibge_censo_2022

In [0]:
# URL of the source CSV (the path includes a commit hash, so the content is pinned)
url_raw = "https://raw.githubusercontent.com/RafaelaSantos92/projeto-final/2d91c9a99aae72fa73f779d77e2f49fbf6a85c5d/raw-data/Populacao%20-%20Censo%202022.csv"

# Send the token only when one was supplied; otherwise make an anonymous request
headers = {'Authorization': f'token {token}'} if token else None

# Download the file. requests applies no timeout by default, so an unresponsive
# server would hang the notebook forever; the (connect, read) timeout bounds
# the connection attempt and each wait for data.
response = requests.get(url_raw, headers=headers, timeout=(10, 60))
# Fail fast on a 4xx/5xx response instead of parsing an error body as CSV
response.raise_for_status()

# Load the downloaded CSV bytes into a pandas DataFrame
pdf = pd.read_csv(BytesIO(response.content))

# Normalize the column names so they are accepted by Delta
def _delta_safe_name(raw_name):
    """Return a lowercase, ASCII-only version of a CSV header.

    Surrounding whitespace is trimmed, spaces and hyphens become underscores,
    and commas, parentheses and dots are removed. Any remaining non-ASCII
    character (e.g. an accented letter) is dropped, not transliterated.
    """
    cleaned = raw_name.strip().lower()
    substitutions = (
        (" ", "_"),
        ("-", "_"),
        (",", ""),
        ("(", ""),
        (")", ""),
        (".", ""),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.encode('ascii', errors='ignore').decode()


pdf.columns = [_delta_safe_name(col) for col in pdf.columns]

# Convert the pandas frame to a Spark DataFrame and attach the Bronze-layer
# metadata columns: where the data came from and when it was ingested.
df = (
    spark.createDataFrame(pdf)
    .withColumn("source_file", lit(url_raw))
    .withColumn("ingestion_time", current_timestamp())
)

# Show 5 rows as a quick sanity check
df.show(5)

# Save as a managed Delta table in the Bronze schema, replacing existing data
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.projeto_final_bronze.bronze_ibge_censo_2022")
)
