# Imports

In [None]:
import logging
import os
import tempfile
import zipfile
from io import BytesIO

import requests
from azure.storage.filedatalake import DataLakeServiceClient

# Configuration du logging

In [None]:
# Configure the root logger: INFO level, lines formatted as
# "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# Module-level logger used by the functions and the driver loop in this file.
logger = logging.getLogger(__name__)

# Config

In [None]:
# Storage account name and filesystem (container) name are read from the
# environment; a missing variable raises KeyError right here.
STORAGE_ACCOUNT_NAME = os.environ["STORAGE_ACCOUNT_NAME"]
FILESYSTEM_NAME = os.environ["CONTAINER_NAME"]
# Secret scope and key name handed to dbutils.secrets.get (through
# get_adls_service_client) to fetch the storage account key.
SECRET_SCOPE_NAME = "datalake-scope"
SECRET_KEY_NAME = "storage-account-key"

In [None]:
# Maps the destination folder name in ADLS (key) to the download URL of the
# ZIP archive whose contents are uploaded into that folder (value).
ZIP_URLS = {
    "dis-2025-dept": "https://www.data.gouv.fr/api/1/datasets/r/6994a9f1-3f4b-4e15-a4dc-0e358a6aac13",
    "dis-2024-dept": "https://www.data.gouv.fr/api/1/datasets/r/c0350599-a041-4724-9942-ad4c2ba9a7b3",
    "dis-2023-dept": "https://www.data.gouv.fr/api/1/datasets/r/96452cf0-329a-4908-8adb-8f061adcca4c",
    "dis-2022-dept": "https://www.data.gouv.fr/api/1/datasets/r/77d3151a-739e-4aab-8c34-7a15d7fea55d",
    "dis-2021-dept": "https://www.data.gouv.fr/api/1/datasets/r/3c5ebbd9-f6b5-4837-a194-12bfeda7f38e",

}

# Fonctions utilitaires

In [None]:
def get_adls_service_client(storage_account, secret_scope, secret_key):
    """
    Build a DataLakeServiceClient for an ADLS Gen2 storage account.

    The account key is fetched with ``dbutils.secrets.get`` (``dbutils`` is
    not defined in this file; presumably the notebook runtime provides it)
    and embedded in a connection string.

    Args:
        storage_account: Name of the storage account.
        secret_scope: Secret scope that holds the account key.
        secret_key: Name of the secret inside that scope.

    Returns:
        The client created by ``DataLakeServiceClient.from_connection_string``.
    """
    account_key = dbutils.secrets.get(scope=secret_scope, key=secret_key)
    connection_parts = (
        "DefaultEndpointsProtocol=https",
        f"AccountName={storage_account}",
        f"AccountKey={account_key}",
        "EndpointSuffix=core.windows.net",
    )
    connection_string = ";".join(connection_parts)
    return DataLakeServiceClient.from_connection_string(connection_string)

In [None]:
def upload_file_to_adls(filesystem_client, local_path, target_path):
    """
    Upload a local file to ``target_path`` in the given filesystem.

    The remote file is written with ``overwrite=True``, and a success message
    is logged once the upload call returns.

    Args:
        filesystem_client: Object exposing ``get_file_client(path)``.
        local_path: Path of the local file to read, opened in binary mode.
        target_path: Destination path inside the filesystem.
    """
    remote_file = filesystem_client.get_file_client(target_path)
    with open(local_path, mode="rb") as source:
        remote_file.upload_data(source, overwrite=True)
    logger.info(f"Uploaded {target_path} to ADLS")

In [None]:
def process_zip_from_url(url, folder_name, filesystem_client, *, timeout=(10, 120)):
    """
    Download a ZIP archive and upload each file it contains to ADLS.

    Every regular file in the archive is extracted into a temporary directory
    (removed when the function returns) and uploaded to
    ``<folder_name>/<name inside the archive>``. A failed upload is logged and
    skipped so the remaining files are still processed.

    Args:
        url: URL of the ZIP archive to download.
        folder_name: Destination sub-folder in the ADLS filesystem.
        filesystem_client: Client forwarded to ``upload_file_to_adls``.
        timeout: Timeout forwarded to ``requests.get``; a
            ``(connect, read)`` tuple in seconds. Without it a stalled
            server would block the job forever.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an HTTP error status.
        zipfile.BadZipFile: If the downloaded payload is not a ZIP archive.
    """
    logger.info("Downloading %s...", url)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    archive = zipfile.ZipFile(BytesIO(response.content))
    with archive as zf, tempfile.TemporaryDirectory() as tmp_dir:
        for member in zf.infolist():
            # Directory entries (names ending in "/") carry no data; trying to
            # open them as files would abort the whole archive.
            if member.is_dir():
                continue

            file_name = member.filename
            logger.info("Processing %s...", file_name)

            # ZipFile.extract strips absolute prefixes and ".." components,
            # so a crafted member name cannot escape tmp_dir.
            local_path = zf.extract(member, path=tmp_dir)

            # Upload to ADLS under the folder_name sub-folder
            target_path = f"{folder_name}/{file_name}"
            try:
                upload_file_to_adls(filesystem_client, local_path, target_path)
            except Exception as e:
                # Best effort: keep going with the remaining files.
                logger.error("Failed to upload %s: %s", file_name, e)

# Lance le processus

In [None]:
# Build the ADLS filesystem client from the configured account and secret
filesystem_client = (
    get_adls_service_client(STORAGE_ACCOUNT_NAME, SECRET_SCOPE_NAME, SECRET_KEY_NAME)
    .get_file_system_client(FILESYSTEM_NAME)
)

# Process every ZIP archive; a failure is logged and the loop moves on to the
# next archive
for folder_name, url in ZIP_URLS.items():
    try:
        process_zip_from_url(url, folder_name, filesystem_client)
    except Exception as exc:
        logger.error(f"Failed to process {url}: {exc}")