# Bronze para Silver

## Arquitetura Medallion - Camada Silver

Este script realiza a transformação dos dados da camada Bronze para a camada Silver.

### Operações realizadas:
- Limpeza e tratamento de valores nulos/inválidos
- Normalização de colunas e padronização de nomes
- Transformações e enriquecimento dos dados
- Criação do modelo relacional e joins

### Regras da camada Silver:
- Dados limpos e estruturados
- Chaves estrangeiras consistentes
- Valores normalizados e padronizados
- Pronta para análises relacionais

# Importação de bibliotecas

In [1]:
import os
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
import unicodedata

# Configuração de Diretórios

In [2]:
# Project layout: the parent of the current working directory is taken as the
# project root, with one sub-folder per medallion layer.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
BRONZE_DIR = os.path.join(BASE_DIR, 'bronze')
SILVER_DIR = os.path.join(BASE_DIR, 'silver')

# exist_ok=True creates the folder only when missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(SILVER_DIR, exist_ok=True)

# Carregamento dos dados da camada Bronze

In [3]:
# Discover which tables exist in the Bronze SQLite database.
bronze_db_path = os.path.join(BRONZE_DIR, 'bronze_layer.db')
conn = sqlite3.connect(bronze_db_path)
try:
    cursor = conn.cursor()
    # Skip SQLite's own bookkeeping tables (their names start with "sqlite_")
    # so they are never treated as Bronze datasets.
    cursor.execute(
        "SELECT name FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    )
    tables = [row[0] for row in cursor.fetchall()]
finally:
    # Release the connection even when the catalog query fails.
    conn.close()

print(f"Tabelas disponíveis na camada Bronze: {tables}")

Tabelas disponíveis na camada Bronze: ['olist_customers_dataset', 'olist_geolocation_dataset', 'olist_orders_dataset', 'olist_order_items_dataset', 'olist_order_payments_dataset', 'olist_order_reviews_dataset', 'olist_products_dataset', 'olist_sellers_dataset', 'product_category_name_translation']


# Carregar Tabelas Bronze

In [4]:
# Load every Bronze table into memory, keyed by its table name.
bronze_dfs = {}

# One shared connection for all reads, always closed afterwards; opening a new
# connection per table and never closing it leaks file handles.
bronze_conn = sqlite3.connect(bronze_db_path)
try:
    for table in tables:
        # Quote the identifier so table names containing special characters
        # cannot break (or alter) the generated statement.
        quoted_table = '"' + table.replace('"', '""') + '"'
        pd_df = pd.read_sql_query(f"SELECT * FROM {quoted_table}", bronze_conn)
        bronze_dfs[table] = pd_df
        print(f"Carregada tabela {table}: {len(pd_df)} linhas, {len(pd_df.columns)} colunas")
finally:
    bronze_conn.close()


Carregada tabela olist_customers_dataset: 99441 linhas, 8 colunas
Carregada tabela olist_geolocation_dataset: 1000163 linhas, 8 colunas
Carregada tabela olist_orders_dataset: 99441 linhas, 11 colunas
Carregada tabela olist_order_items_dataset: 112650 linhas, 10 colunas
Carregada tabela olist_order_payments_dataset: 103886 linhas, 8 colunas
Carregada tabela olist_order_reviews_dataset: 99224 linhas, 10 colunas
Carregada tabela olist_products_dataset: 32951 linhas, 12 colunas
Carregada tabela olist_sellers_dataset: 3095 linhas, 7 colunas
Carregada tabela product_category_name_translation: 71 linhas, 5 colunas


# Funções de tratamento para camada Silver

In [5]:
def clean_column_names(df):
    """Return ``df`` with lower-cased column labels and spaces turned into underscores.

    Only lower-casing and space replacement are applied; other characters in
    the labels are left as they are. The input DataFrame is not modified.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A new DataFrame with the renamed columns.
    """
    label_map = {label: label.lower().replace(' ', '_') for label in df.columns}
    return df.rename(columns=label_map)

def remove_metadados_bronze(df):
    """Drop the Bronze-layer metadata columns that are present in ``df``.

    The columns considered are ``_ingestion_timestamp``, ``_source_file`` and
    ``_source_system``.

    Args:
        df: DataFrame possibly carrying Bronze metadata columns.

    Returns:
        A new DataFrame without those columns, or ``df`` itself (same object)
        when none of them is present.
    """
    bronze_meta = ('_ingestion_timestamp', '_source_file', '_source_system')
    present = [name for name in bronze_meta if name in df.columns]
    return df.drop(columns=present) if present else df

def fix_datetime_columns(df, date_columns):
    """Convert the given columns to pandas datetime values.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``). Names in
    ``date_columns`` that are not columns of ``df`` are ignored.

    Args:
        df: Source DataFrame. It is left untouched.
        date_columns: Iterable of column names to convert.

    Returns:
        A copy of ``df`` with the requested columns converted.
    """
    # Work on a copy so the caller's frame is never mutated, matching the
    # other cleaning helpers in this notebook.
    result = df.copy()
    for col in date_columns:
        if col in result.columns:
            result[col] = pd.to_datetime(result[col], errors='coerce')
    return result

def normalize_text_columns(df, text_columns):
    """Strip accents from and lower-case the given text columns.

    Each non-missing value is converted to ``str``, decomposed with Unicode
    NFKD, reduced to its ASCII characters (anything else is discarded) and
    lower-cased. Missing values are passed through unchanged. Names in
    ``text_columns`` that are not columns of ``df`` are ignored.

    Args:
        df: Source DataFrame. It is not modified.
        text_columns: Iterable of column names to normalize.

    Returns:
        A copy of ``df`` with the requested columns normalized.
    """
    def _to_plain_ascii(value):
        # Leave NaN/None untouched so missing data stays recognizable.
        if not pd.notna(value):
            return value
        decomposed = unicodedata.normalize('NFKD', str(value))
        return decomposed.encode('ASCII', 'ignore').decode('ASCII').lower()

    normalized = df.copy()
    for name in text_columns:
        if name not in normalized.columns:
            continue
        normalized[name] = normalized[name].apply(_to_plain_ascii)
    return normalized

def handle_missing_values(df, numeric_strategy='mean', categorical_strategy='unknown'):
    """Fill missing values in numeric and categorical columns.

    Args:
        df: Source DataFrame. It is not modified.
        numeric_strategy: ``'mean'``, ``'median'`` or ``'zero'``. Any other
            value leaves numeric columns untouched.
        categorical_strategy: Label written into missing cells of ``object``
            and ``category`` columns.

    Returns:
        A copy of ``df`` with the missing values filled.
    """
    result = df.copy()

    numeric_cols = result.select_dtypes(include=['number']).columns
    categorical_cols = result.select_dtypes(include=['object', 'category']).columns

    # Each strategy maps a column to the scalar used to fill its gaps.
    fill_value_for = {
        'mean': lambda series: series.mean(),
        'median': lambda series: series.median(),
        'zero': lambda series: 0,
    }.get(numeric_strategy)

    if fill_value_for is not None:
        for col in numeric_cols:
            result[col] = result[col].fillna(fill_value_for(result[col]))

    for col in categorical_cols:
        series = result[col]
        needs_new_category = (
            isinstance(series.dtype, pd.CategoricalDtype)
            and series.isna().any()
            and categorical_strategy not in series.cat.categories
        )
        if needs_new_category:
            # fillna() on a categorical column rejects labels that are not
            # already categories, so register the fill label first.
            series = series.cat.add_categories([categorical_strategy])
        result[col] = series.fillna(categorical_strategy)

    return result

# Processamento dos Datasets para Silver Layer

In [6]:
# Registry of the processed Silver DataFrames, keyed by Silver table name.
silver_dfs = dict()

# SQLite database that receives the Silver tables; the connection stays open
# for the processing cells that follow.
silver_db_path = os.path.join(SILVER_DIR, 'silver_layer.db')
silver_conn = sqlite3.connect(silver_db_path)

# 1. Processamento de Customers

In [7]:
if 'olist_customers_dataset' in bronze_dfs:
    # Customers: standardize column names, drop Bronze metadata and
    # normalize the city/state text columns.
    customers_df = (
        bronze_dfs['olist_customers_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
        .pipe(normalize_text_columns, ['customer_city', 'customer_state'])
    )

    # Register the result and persist it, replacing any previous table.
    silver_dfs['customers'] = customers_df
    customers_df.to_sql('customers', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela customers: {len(customers_df)} linhas")

Processada tabela customers: 99441 linhas


# 2. Processamento de Orders

In [8]:
if 'olist_orders_dataset' in bronze_dfs:
    # Orders: standardize column names and drop Bronze metadata.
    orders_df = (
        bronze_dfs['olist_orders_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
    )

    # Convert every column whose name contains 'date' to datetime.
    # NOTE(review): date/time columns named differently (no 'date' in the
    # name) are left as loaded - confirm this is intended.
    date_columns = [name for name in orders_df.columns if 'date' in name]
    orders_df = fix_datetime_columns(orders_df, date_columns)

    # Register the result and persist it, replacing any previous table.
    silver_dfs['orders'] = orders_df
    orders_df.to_sql('orders', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela orders: {len(orders_df)} linhas")

Processada tabela orders: 99441 linhas


# 3. Processamento de Products

In [9]:
if 'olist_products_dataset' in bronze_dfs:
    # Products: standardize column names and drop Bronze metadata.
    products_df = (
        bronze_dfs['olist_products_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
    )

    if 'product_category_name_translation' in bronze_dfs:
        # Enrich products with the English category name, looked up from the
        # translation table by the original category name.
        translation_df = (
            bronze_dfs['product_category_name_translation']
            .pipe(clean_column_names)
            .pipe(remove_metadados_bronze)
        )

        original_names = translation_df['product_category_name']
        english_names = translation_df['product_category_name_english']
        translation_dict = dict(zip(original_names, english_names))

        products_df = products_df.assign(
            product_category_name_english=products_df['product_category_name'].map(translation_dict)
        )

    # Numeric gaps get the column median; text gaps get the 'unknown' label.
    products_df = handle_missing_values(
        products_df,
        numeric_strategy='median',
        categorical_strategy='unknown',
    )

    # Register the result and persist it, replacing any previous table.
    silver_dfs['products'] = products_df
    products_df.to_sql('products', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela products: {len(products_df)} linhas")

Processada tabela products: 32951 linhas


# 4. Processamento de Order Items

In [10]:
if 'olist_order_items_dataset' in bronze_dfs:
    # Order items: standardize column names and drop Bronze metadata.
    order_items_df = (
        bronze_dfs['olist_order_items_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
    )

    # Convert every column whose name contains 'date' to datetime.
    date_columns = [name for name in order_items_df.columns if 'date' in name]
    order_items_df = fix_datetime_columns(order_items_df, date_columns)

    # Register the result and persist it, replacing any previous table.
    silver_dfs['order_items'] = order_items_df
    order_items_df.to_sql('order_items', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela order_items: {len(order_items_df)} linhas")

Processada tabela order_items: 112650 linhas


# 5. Processamento de Order Payments

In [11]:
if 'olist_order_payments_dataset' in bronze_dfs:
    # Order payments: only column-name standardization and Bronze metadata
    # removal are applied in this cell.
    payments_df = (
        bronze_dfs['olist_order_payments_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
    )

    # Register the result and persist it, replacing any previous table.
    silver_dfs['order_payments'] = payments_df
    payments_df.to_sql('order_payments', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela order_payments: {len(payments_df)} linhas")

Processada tabela order_payments: 103886 linhas


# 6. Processamento de Order Reviews

In [12]:
if 'olist_order_reviews_dataset' in bronze_dfs:
    # Order reviews: standardize column names and drop Bronze metadata.
    reviews_df = (
        bronze_dfs['olist_order_reviews_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
    )

    # Convert every column whose name contains 'date' to datetime.
    date_columns = [name for name in reviews_df.columns if 'date' in name]
    reviews_df = fix_datetime_columns(reviews_df, date_columns)

    # Strip accents from and lower-case the free-text comment, when present.
    if 'review_comment_message' in reviews_df.columns:
        reviews_df = reviews_df.pipe(normalize_text_columns, ['review_comment_message'])

    # Register the result and persist it, replacing any previous table.
    silver_dfs['order_reviews'] = reviews_df
    reviews_df.to_sql('order_reviews', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela order_reviews: {len(reviews_df)} linhas")

Processada tabela order_reviews: 99224 linhas


# 7. Processamento de Sellers

In [13]:
if 'olist_sellers_dataset' in bronze_dfs:
    # Sellers: standardize column names, drop Bronze metadata and normalize
    # the city/state text columns.
    sellers_df = (
        bronze_dfs['olist_sellers_dataset']
        .pipe(clean_column_names)
        .pipe(remove_metadados_bronze)
        .pipe(normalize_text_columns, ['seller_city', 'seller_state'])
    )

    # Register the result and persist it, replacing any previous table.
    silver_dfs['sellers'] = sellers_df
    sellers_df.to_sql('sellers', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela sellers: {len(sellers_df)} linhas")

Processada tabela sellers: 3095 linhas


# 8. Processamento de Geolocation

In [14]:
def _most_frequent(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    # Compute the mode once per group instead of once for the emptiness test
    # and again for the lookup.
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


if 'olist_geolocation_dataset' in bronze_dfs:
    # Geolocation: standardize column names, drop Bronze metadata and
    # normalize the city/state text columns.
    geo_df = bronze_dfs['olist_geolocation_dataset']
    geo_df = clean_column_names(geo_df)
    geo_df = remove_metadados_bronze(geo_df)
    geo_df = normalize_text_columns(geo_df, ['geolocation_city', 'geolocation_state'])

    # Collapse to one row per zip-code prefix: average the coordinates and
    # keep the most frequent city and state.
    geo_grouped = geo_df.groupby('geolocation_zip_code_prefix').agg({
        'geolocation_lat': 'mean',
        'geolocation_lng': 'mean',
        'geolocation_city': _most_frequent,
        'geolocation_state': _most_frequent,
    }).reset_index()

    # Register the result and persist it, replacing any previous table.
    silver_dfs['geolocation'] = geo_grouped
    geo_grouped.to_sql('geolocation', silver_conn, if_exists='replace', index=False)
    print(f"Processada tabela geolocation: {len(geo_grouped)} linhas (agrupada por CEP)")

# All Silver tables have been written; release the database connection.
silver_conn.close()

print("\nProcessamento para camada Silver concluído!")
print(f"Total de tabelas processadas: {len(silver_dfs)}")

Processada tabela geolocation: 19015 linhas (agrupada por CEP)

Processamento para camada Silver concluído!
Total de tabelas processadas: 8


# Salvar metadados da camada Silver

In [15]:
# Imported before the file is opened, so the output is not truncated if the
# import fails.
import json

# Summary of this run: which Silver tables were produced, how many rows each
# one has, and when the processing happened (local time, ISO 8601).
silver_metadata = {
    'tables': list(silver_dfs.keys()),
    'row_counts': {table: len(df) for table, df in silver_dfs.items()},
    'processing_date': datetime.now().isoformat()
}

# An explicit encoding keeps the file identical across platforms whose default
# text encoding differs.
metadata_path = os.path.join(SILVER_DIR, 'silver_metadata.json')
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(silver_metadata, f, indent=2)

print("Metadados salvos com sucesso!")

Metadados salvos com sucesso!
