In [0]:
!pip install openpyxl

In [0]:
import requests
from io import BytesIO
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
import re

In [0]:
# Notebook parameter that carries the access token; it defaults to an empty string.
dbutils.widgets.text("token", "", "token")
# Strip stray spaces/newlines that tend to come along when a token is pasted,
# so they never end up inside an HTTP header built from this value.
token = dbutils.widgets.get("token").strip()

In [0]:
# Obtain the Spark session: reuse the active one, or create it if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [0]:
%sql
-- Remove any previous copy of the Bronze table; IF EXISTS makes this a no-op when the table is absent.
drop table if exists workspace.projeto_final_bronze.bronze_panorama_eja

In [0]:
# GitHub "contents" endpoint for the workbook, pinned to the main branch.
url = "https://api.github.com/repos/RafaelaSantos92/projeto-final/contents/raw-data/panorama-da-eja-no-brasil.xlsx?ref=main"

headers = {
    "Authorization": f"token {token}",
    # Raw media type: ask the API for the file bytes rather than JSON metadata.
    "Accept": "application/vnd.github.v3.raw"
}

# requests waits forever by default; bound the (connect, read) wait in seconds
# so a stalled connection fails this cell instead of hanging the notebook.
response = requests.get(url, headers=headers, timeout=(10, 120))
response.raise_for_status()  # abort on any 4xx/5xx response

# Load only the rows that hold data: header=6 is zero-based, i.e. the column
# titles sit on the 7th spreadsheet row. The final row is then dropped
# (presumably a footer/total line -- confirm against the workbook).
df = pd.read_excel(BytesIO(response.content), header=6)
df = df.iloc[:-1]

In [0]:
# Cast every column to string (presumably so each column has one uniform type
# before the frame is handed to Spark -- confirm).
# NOTE(review): astype(str) also turns missing cells (NaN) into the literal text
# "nan", so nulls are not preserved downstream -- confirm this is intended.
df = df.astype(str)

# Helper that turns raw header labels into safe, unique column names
def clean_columns(cols):
    """Return sanitised, unique column names for the given header labels.

    Each label is converted to text and trimmed; every character outside
    ``[0-9a-zA-Z_]`` becomes ``_``; runs of ``_`` are collapsed; leading and
    trailing ``_`` are removed; the result is lower-cased. A name that has
    already been produced gets a numeric suffix (``name``, ``name_1``,
    ``name_2``, ...), skipping any suffix that would clash with another column.

    Args:
        cols: Iterable of header labels. Labels do not have to be strings
            (spreadsheet headers may be numbers or dates); they are passed
            through ``str()`` first.

    Returns:
        list: One cleaned name per input label, in the same order, with no
        duplicates and no empty names.
    """
    suffix_counter = {}  # base name -> last numeric suffix handed out
    taken = set()        # every name already emitted
    new_cols = []
    for c in cols:
        # str() first: a non-string label (e.g. the int 2020) has no .strip()
        name = re.sub(r'[^0-9a-zA-Z_]', '_', str(c).strip())
        # Collapse repeated underscores and trim them from both ends
        name = re.sub(r'_+', '_', name).strip('_').lower()
        # A label made only of symbols (e.g. "%") would otherwise become ""
        if not name:
            name = "col"
        # Keep bumping the suffix until the candidate is really unused, so that
        # inputs like ["a", "a", "a_1"] cannot yield "a_1" twice
        candidate = name
        while candidate in taken:
            suffix_counter[name] = suffix_counter.get(name, 0) + 1
            candidate = f"{name}_{suffix_counter[name]}"
        taken.add(candidate)
        new_cols.append(candidate)
    return new_cols

# Replace the raw header labels with the names produced by clean_columns
df.columns = clean_columns(df.columns)

# Hand the pandas frame over to Spark
spark = SparkSession.builder.getOrCreate()
df_spark = spark.createDataFrame(df)

# Tag every row with the source file name and the load timestamp
df_spark = (
    df_spark
    .withColumn("source_file", lit("panorama-da-eja-no-brasil.xlsx"))
    .withColumn("ingestion_time", current_timestamp())
)

# Persist as a Delta table in the Bronze schema, overwriting earlier contents
bronze_writer = df_spark.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("workspace.projeto_final_bronze.bronze_panorama_eja")


In [0]:
# Render the Spark DataFrame that was saved above, as a visual check of the load.
df_spark.display()