# In [5]:  (Jupyter cell prompt kept from the notebook export)
# ╔══════════════════════════════════════════════════════════════════╗
# ║ 00_CAPTURE_DRIVE — Google Drive → MinIO (bucket raw/<projeto>)  ║
# ╠══════════════════════════════════════════════════════════════════╣
# ║ • Cópia recursiva (stream 10 MB)                                ║
# ║ • Deduplicação nome + SHA-256 ⇒ raw_unicos/                     ║
# ║ • Registro em PostgreSQL (tabela arquivos_raw)                  ║
# ║ • Log detalhado em logs/drive2minio_<data>.log                  ║
# ╚══════════════════════════════════════════════════════════════════╝

from pathlib import PurePosixPath, Path
import io, os, hashlib, logging, datetime
from tqdm.auto import tqdm

# ─── Entradas ───────────────────────────────────────────────────────
# Prompt for the two run parameters; surrounding whitespace is stripped.
PROJETO   = input("📝 Nome do projeto: ").strip()
FOLDER_ID = input("🔑 ID da pasta-raiz no Google Drive: ").strip()

# Fail fast on blank answers, before any connection is opened: an empty
# project name would yield the object prefix "/" and an empty folder ID would
# be sent to Drive as the query "'' in parents".
if not PROJETO:
    raise ValueError("O nome do projeto não pode ser vazio.")
if not FOLDER_ID:
    raise ValueError("O ID da pasta-raiz do Google Drive não pode ser vazio.")

# ─── Environment variable for WIF ───────────────────────────────────
# Set before the credentials file is loaded further down the script.
# NOTE(review): presumably needed because wif-credentials.json uses an
# executable credential source — confirm against the credentials file.
os.environ["GOOGLE_EXTERNAL_ACCOUNT_ALLOW_EXECUTABLES"] = "1"

# ─── Config Google Drive (Workload Identity Federation) ────────────
from google.auth import load_credentials_from_file
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

# Read-only Drive access is all the capture needs.
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
# Credentials file, resolved relative to the current working directory.
CREDS_FN = "wif-credentials.json"

# load_credentials_from_file returns a 2-tuple; only the credentials object
# is used here.
creds, _ = load_credentials_from_file(CREDS_FN, scopes=SCOPES)

# Drive v3 client with discovery-document caching turned off.
drive = build(
    serviceName="drive",
    version="v3",
    credentials=creds,
    cache_discovery=False,
)

# ─── Config MinIO ───────────────────────────────────────────────────
from minio import Minio
# Connection settings are taken from the environment when present; the
# fallbacks are the values this script originally hard-coded, so existing
# setups keep working unchanged.
# NOTE(review): the fallback credentials still live in source — set
# MINIO_ACCESS_KEY / MINIO_SECRET_KEY for any non-local deployment.
MINIO = Minio(
    os.environ.get("MINIO_ENDPOINT", "minio:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "admin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "senhasegura"),
    secure=False,  # plain HTTP, as in the original configuration
)
BUCKET = "raw"
# Create the target bucket on first run.
if not MINIO.bucket_exists(BUCKET):
    MINIO.make_bucket(BUCKET)

RAW_PREFIX      = f"{PROJETO}/"   # key prefix for this project's raw copies
RAW_UNICOS_PREF = "raw_unicos/"   # key prefix for the deduplicated copies

# ─── Config PostgreSQL ──────────────────────────────────────────────
import psycopg2, psycopg2.extras
# Connection parameters come from the standard libpq environment variables
# when they are set; the fallbacks are the values originally hard-coded here.
# NOTE(review): the fallback password still lives in source — set PGPASSWORD
# for any non-local deployment.
PG_CONN = psycopg2.connect(
    host=os.environ.get("PGHOST", "postgres_db"),
    port=int(os.environ.get("PGPORT", "5432")),
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "senhasegura"),
)
# Every statement commits on its own; no explicit transaction handling below.
PG_CONN.autocommit = True

# Create the metadata table on first run. One row per (projeto, caminho, nome).
with PG_CONN.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS arquivos_raw(
            id SERIAL PRIMARY KEY,
            projeto TEXT NOT NULL,
            caminho TEXT NOT NULL,
            nome TEXT NOT NULL,
            hash_sha256 CHAR(64) NOT NULL,
            tamanho BIGINT NOT NULL,
            criado_em TIMESTAMPTZ DEFAULT NOW(),
            UNIQUE (projeto, caminho, nome)
        );
    """)

# ─── Config Log ─────────────────────────────────────────────────────
# One timestamped log file per run, under ./logs.
Path("logs").mkdir(exist_ok=True)
log_fn = f"logs/drive2minio_{datetime.datetime.now():%Y%m%d_%H%M%S}.log"
logging.basicConfig(
    # Explicit UTF-8: logged paths may contain non-ASCII characters, and the
    # locale's default encoding is not guaranteed to represent them.
    # delay=True opens the file only on the first record, so nothing is
    # created if basicConfig is a no-op (root logger already configured).
    handlers=[logging.FileHandler(log_fn, encoding="utf-8", delay=True)],
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logging.info("START capture project=%s folder_id=%s", PROJETO, FOLDER_ID)

# ─── Funções auxiliares ─────────────────────────────────────────────
def walk_drive(folder_id, base=""):
    """Yield ``(file_id, rel_path)`` for each downloadable file under a folder.

    Lists the non-trashed children of ``folder_id`` page by page through the
    module-level ``drive`` client and recurses into sub-folders.

    Args:
        folder_id: Drive ID of the folder to list.
        base: POSIX-style path prefix prepended to every yielded path
            (empty for the root call).

    Yields:
        Tuples ``(file_id, rel_path)`` where ``rel_path`` is ``base`` joined
        with the file name using ``/``.

    Google-native items (MIME type ``application/vnd.google-apps.*`` other
    than folders, e.g. Docs, Sheets, shortcuts) have no binary content that
    ``files().get_media`` can fetch, so they are logged as a warning and
    skipped rather than yielded.
    """
    folder_mime = "application/vnd.google-apps.folder"
    native_prefix = "application/vnd.google-apps."

    # Drive query strings use backslash escapes for backslashes and quotes.
    safe_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")
    query = f"'{safe_id}' in parents and trashed=false"

    page_token = None
    while True:
        resp = drive.files().list(
            q=query,
            fields="nextPageToken, files(id,name,mimeType)",
            pageToken=page_token,
        ).execute()
        for f in resp.get("files", []):
            rel = (PurePosixPath(base) / f["name"]).as_posix()
            mime = f["mimeType"]
            if mime == folder_mime:
                yield from walk_drive(f["id"], rel)
            elif mime.startswith(native_prefix):
                logging.warning(
                    "SKIP %s: %s has no binary content to download", rel, mime
                )
            else:
                yield f["id"], rel
        # No nextPageToken in the response means this was the last page.
        page_token = resp.get("nextPageToken")
        if page_token is None:
            break

# ─── Transferência ──────────────────────────────────────────────────
# Upsert rather than "DO NOTHING": when a file that was captured before has
# changed, the object under raw/<projeto>/ is overwritten below, so the stored
# hash and size must follow it instead of silently keeping the old values.
UPSERT_SQL = """
    INSERT INTO arquivos_raw (projeto,caminho,nome,hash_sha256,tamanho)
    VALUES (%s,%s,%s,%s,%s)
    ON CONFLICT (projeto,caminho,nome)
    DO UPDATE SET hash_sha256 = EXCLUDED.hash_sha256,
                  tamanho     = EXCLUDED.tamanho
"""

try:
    # Materialise the listing first so the progress bar knows the total.
    files = list(walk_drive(FOLDER_ID))
    for file_id, rel_path in tqdm(files, desc="📤 Transferindo"):
        obj_raw = RAW_PREFIX + rel_path

        # --- download in 10 MB chunks into an in-memory buffer ---
        # The whole file ends up in memory; only the HTTP transfer is chunked.
        buf = io.BytesIO()
        downloader = MediaIoBaseDownload(
            buf,
            drive.files().get_media(fileId=file_id),
            chunksize=10 * 1024 * 1024,
        )
        done = False
        while not done:
            _, done = downloader.next_chunk()

        data      = buf.getvalue()
        tamanho   = len(data)
        hash_sha  = hashlib.sha256(data).hexdigest()
        rel       = PurePosixPath(rel_path)
        file_name = rel.name
        caminho   = str(rel.parent)   # "." for files directly in the root

        # --- upload to raw/<projeto>/… ---
        MINIO.put_object(BUCKET, obj_raw, io.BytesIO(data), tamanho)

        with PG_CONN.cursor() as cur:
            # --- deduplicate into raw_unicos/ ---
            # A hash not yet recorded (in any project) means new content.
            cur.execute(
                "SELECT 1 FROM arquivos_raw WHERE hash_sha256=%s LIMIT 1",
                (hash_sha,),
            )
            if cur.fetchone() is None:
                obj_unico = f"{RAW_UNICOS_PREF}{hash_sha}_{file_name}"
                MINIO.put_object(BUCKET, obj_unico, io.BytesIO(data), tamanho)

            # --- record metadata (after the dedup check, which reads it) ---
            cur.execute(
                UPSERT_SQL,
                (PROJETO, caminho, file_name, hash_sha, tamanho),
            )

        logging.info("OK %s size=%s hash=%s", rel_path, tamanho, hash_sha)

    print("✅ Transferência concluída sem erros.")
    logging.info("END success")

except Exception:
    # Record the traceback in the run log, then abort on the first failure.
    logging.exception("ABORTED due to error")
    raise


# Output captured from the notebook run (wif-credentials.json is malformed JSON):
# DefaultCredentialsError: ('File wif-credentials.json is not a valid json file.', JSONDecodeError("Expecting ',' delimiter: line 13 column 56 (char 626)"))