# ETL Silver → Gold
Popula o Star Schema no PostgreSQL a partir dos dados limpos do Silver Layer.

## 1. Imports e configurações

In [1]:
import os

import pandas as pd
import psycopg2
from psycopg2.extras import execute_batch

# Connection settings come from the standard libpq environment variables so
# that no credential is stored in the notebook. Non-secret settings keep the
# previous local-development values as defaults.
# NOTE(review): when PGPASSWORD is unset the password is None; psycopg2 is
# expected to drop None parameters so libpq can fall back to its own password
# sources (e.g. ~/.pgpass) - confirm for your environment.
DB_CONFIG = {
    "host": os.getenv("PGHOST", "localhost"),
    "port": int(os.getenv("PGPORT", "5432")),
    "database": os.getenv("PGDATABASE", "fake-real-job"),
    "user": os.getenv("PGUSER", "postgres"),
    "password": os.getenv("PGPASSWORD"),
}

# A single connection and cursor are shared by every cell below.
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()

print("=" * 60)
print("DIAGNOSTICO PRE-ETL (SILVER_JOBS -> GOLD)")
print("=" * 60)

# Row counts of the silver table, taken before anything is written to gold.
cur.execute("SELECT COUNT(*) FROM public.silver_jobs;")
total = cur.fetchone()[0]

cur.execute("SELECT COUNT(*) FROM public.silver_jobs WHERE salary_avg IS NOT NULL;")
com_salario = cur.fetchone()[0]

cur.execute("SELECT COUNT(*) FROM public.silver_jobs WHERE is_fake IS TRUE;")
fake = cur.fetchone()[0]

print(f"Total vagas: {total:,}")
print(f"Com salario: {com_salario:,}")
print(f"Vagas fake: {fake:,}")

print("=" * 60)


DIAGNOSTICO PRE-ETL (SILVER_JOBS -> GOLD)
Total vagas: 2,197
Com salario: 2,197
Vagas fake: 1,085


## 2. Carregamento da base de dados

In [2]:
# Pull the whole silver table into memory; every dimension and the fact table
# below are derived from this frame.
# NOTE(review): `conn` is a raw psycopg2 connection, for which pandas may emit
# a UserWarning (it only officially supports SQLAlchemy/sqlite3 connections).
silver_query = "SELECT * FROM public.silver_jobs"
df = pd.read_sql(sql=silver_query, con=conn)
df.head()


  df = pd.read_sql("SELECT * FROM public.silver_jobs", conn)


Unnamed: 0,job_id,job_title,company_name,required_experience_years,is_fake,fraud_reason,posting_timestamp,application_deadline_timestamp,salary_min,salary_max,salary_avg,remote,country,state
0,1,Software Engineer,Company_543,8,False,,2023-11-24,2024-09-16,60000,80000,70000,False,Toronto,Canada
1,2,Content Writer,Company_192,10,False,,2023-03-03,2024-10-18,40000,60000,50000,False,Toronto,Canada
2,3,Customer Support Specialist,Anônimo,3,True,Suspicious email,2023-07-31,2024-01-13,60000,80000,70000,True,,
3,5,Graphic Designer,Anônimo,7,True,No salary info,2023-04-22,2024-08-26,60000,80000,70000,False,London,UK
4,6,Marketing Manager,Company_473,4,False,,2023-02-03,2024-07-24,40000,60000,50000,False,London,UK


## 3. Limpando Tabelas

In [3]:
# Empty the fact table and every dimension so the load starts from a clean
# slate. The statement is committed right away, so the gold tables remain
# empty if any later cell fails.
truncate_gold_sql = """
    TRUNCATE TABLE
      gold.fact_job_posting,
      gold.dim_company,
      gold.dim_job_title,
      gold.dim_location,
      gold.dim_fraud_reason
    CASCADE;
"""
cur.execute(truncate_gold_sql)
conn.commit()
print("Tabelas GOLD limpas")


Tabelas GOLD limpas


## 4. Dimensão Company (dim_company)

In [4]:
# Distinct, non-blank company names for gold.dim_company.
# Names are stripped BEFORE de-duplication: stripping afterwards (as was done
# previously) lets values that differ only by surrounding whitespace be
# inserted as duplicate dimension rows. `astype(str)` keeps the strip from
# failing on a non-string value.
companies = (
    df[["company_name"]]
    .dropna()
    .assign(company_name=lambda frame: frame["company_name"].astype(str).str.strip())
    .loc[lambda frame: frame["company_name"] != ""]
    .drop_duplicates()
)

execute_batch(
    cur,
    "INSERT INTO gold.dim_company (company_name) VALUES (%s)",
    [(name,) for name in companies["company_name"]],
    page_size=5000
)
conn.commit()


## 5. Dimensão Job Title (dim_job_title)

In [5]:
# Distinct, non-blank job titles for gold.dim_job_title.
# Titles are stripped BEFORE de-duplication: stripping afterwards (as was done
# previously) lets values that differ only by surrounding whitespace be
# inserted as duplicate dimension rows. `astype(str)` keeps the strip from
# failing on a non-string value.
titles = (
    df[["job_title"]]
    .dropna()
    .assign(job_title=lambda frame: frame["job_title"].astype(str).str.strip())
    .loc[lambda frame: frame["job_title"] != ""]
    .drop_duplicates()
)

execute_batch(
    cur,
    "INSERT INTO gold.dim_job_title (job_title) VALUES (%s)",
    [(title,) for title in titles["job_title"]],
    page_size=5000
)
conn.commit()


## 6. Dimensão Location (dim_location)

In [6]:
# Distinct (country, state) pairs for gold.dim_location.
locations = df[["country", "state"]].drop_duplicates()

# Keep a pair when at least one side is present. Any missing side is sent as
# None so the driver writes SQL NULL rather than a NaN marker, and so the
# stored value matches the None that comes back when the dimension is re-read.
location_rows = [
    (
        None if pd.isna(country) else country,
        None if pd.isna(state) else state,
    )
    for country, state in locations.itertuples(index=False, name=None)
    if pd.notna(country) or pd.notna(state)
]

execute_batch(
    cur,
    "INSERT INTO gold.dim_location (country, state) VALUES (%s, %s)",
    location_rows,
    page_size=5000
)
conn.commit()


## 7. Dimensão Fraud Reason (dim_fraud_reason)

In [7]:
# Distinct, non-blank fraud reasons for gold.dim_fraud_reason.
# Reasons are stripped BEFORE de-duplication: stripping afterwards (as was done
# previously) lets values that differ only by surrounding whitespace be
# inserted as duplicate dimension rows. `astype(str)` keeps the strip from
# failing on a non-string value.
reasons = (
    df[["fraud_reason"]]
    .dropna()
    .assign(fraud_reason=lambda frame: frame["fraud_reason"].astype(str).str.strip())
    .loc[lambda frame: frame["fraud_reason"] != ""]
    .drop_duplicates()
)

execute_batch(
    cur,
    "INSERT INTO gold.dim_fraud_reason (fraud_reason) VALUES (%s)",
    [(reason,) for reason in reasons["fraud_reason"]],
    page_size=5000
)
conn.commit()


## 8. Mapeamento das chaves surrogate das dimensões
Nesta etapa, as dimensões da camada Gold são carregadas em memória e
transformadas em dicionários de mapeamento, permitindo converter valores
descritivos (empresa, cargo, localização e motivo de fraude) em chaves
surrogate (_sk) que serão utilizadas como chaves estrangeiras na tabela fato.


In [8]:
# Read each dimension back and index its surrogate key by natural key.

# company_name -> company_sk
cur.execute("SELECT company_name, company_sk FROM gold.dim_company")
company_map = {name: company_sk for name, company_sk in cur.fetchall()}

# job_title -> job_title_sk
cur.execute("SELECT job_title, job_title_sk FROM gold.dim_job_title")
title_map = {title: title_sk for title, title_sk in cur.fetchall()}

# (country, state) -> location_sk; the natural key is the pair of columns.
cur.execute("SELECT country, state, location_sk FROM gold.dim_location")
location_map = {}
for country, state, location_sk in cur.fetchall():
    location_map[(country, state)] = location_sk

# fraud_reason -> fraud_reason_sk
cur.execute("SELECT fraud_reason, fraud_reason_sk FROM gold.dim_fraud_reason")
fraud_map = {reason: reason_sk for reason, reason_sk in cur.fetchall()}


## 9. Carga da tabela fato (fact_job_posting)
Nesta etapa, os registros da camada Silver são transformados em linhas da
tabela fato da camada Gold, substituindo atributos textuais por chaves
surrogate (_srk) das dimensões, consolidando métricas e atributos analíticos
em um modelo estrela.


In [9]:
def _null_if_missing(value):
    """Return None for a missing value (None/NaN/NA), else the value unchanged."""
    return None if pd.isna(value) else value


def _text_dimension_key(value):
    """Return the lookup key for a text dimension: the stripped text, or None.

    The company, job-title and fraud-reason dimensions store stripped text, so
    the lookup has to strip as well; otherwise a value padded with whitespace
    would silently resolve to a NULL foreign key.
    """
    if pd.isna(value):
        return None
    return str(value).strip()


# Only the columns needed by the fact table, in the order unpacked below.
fact_source_columns = [
    "job_id",
    "company_name",
    "job_title",
    "country",
    "state",
    "fraud_reason",
    "salary_avg",
    "remote",
    "is_fake",
]

# One tuple per silver row: text attributes are replaced by surrogate keys
# (None when there is no matching dimension row) and missing values become
# None so they are written as SQL NULL.
fact_rows = [
    (
        int(job_id),
        company_map.get(_text_dimension_key(company_name)),
        title_map.get(_text_dimension_key(job_title)),
        # dim_location stores country/state as-is, so no stripping here.
        location_map.get((_null_if_missing(country), _null_if_missing(state))),
        fraud_map.get(_text_dimension_key(fraud_reason)),
        _null_if_missing(salary_avg),
        _null_if_missing(remote),
        _null_if_missing(is_fake),
    )
    for (
        job_id,
        company_name,
        job_title,
        country,
        state,
        fraud_reason,
        salary_avg,
        remote,
        is_fake,
    ) in df[fact_source_columns].itertuples(index=False, name=None)
]

execute_batch(
    cur,
    """
    INSERT INTO gold.fact_job_posting (
        job_id,
        company_srk,
        job_title_srk,
        location_srk,
        fraud_reason_srk,
        salary_avg,
        remote,
        is_fake
    )
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
    """,
    fact_rows,
    page_size=5000
)

conn.commit()
print("FACT carregada com sucesso")


FACT carregada com sucesso


## 10. Validação da carga da camada Gold
Esta etapa realiza uma verificação quantitativa das tabelas da camada Gold
após a execução do ETL, garantindo que as dimensões e a tabela fato foram
populadas corretamente antes do consumo analítico.


In [10]:
# Report the row count of every gold table after the load.
gold_tables = (
    "gold.dim_company",
    "gold.dim_job_title",
    "gold.dim_location",
    "gold.dim_fraud_reason",
    "gold.fact_job_posting",
)

print("\nGOLD (apos ETL):")
for table_name in gold_tables:
    # Table names come from the fixed tuple above, not from user input.
    cur.execute(f"SELECT COUNT(*) FROM {table_name}")
    (row_count,) = cur.fetchone()
    print(f"  {table_name}: {row_count:,}")



GOLD (apos ETL):
  gold.dim_company: 731
  gold.dim_job_title: 10
  gold.dim_location: 4
  gold.dim_fraud_reason: 3
  gold.fact_job_posting: 2,197


## 11. Encerramento do processo ETL
Nesta etapa, os recursos de banco de dados são liberados com o fechamento da
conexão e do cursor, indicando a conclusão bem-sucedida do processo ETL e
garantindo boas práticas de uso de recursos.


In [12]:
# Release the cursor first, then the connection it belongs to.
for db_resource in (cur, conn):
    db_resource.close()
print("\nETL finalizado com sucesso!")


ETL finalizado com sucesso
