Notebook appelé depuis un pipeline Azure Data Factory afin de charger les tables d'un répertoire de bronze vers silver.

Source: source des données à charger (QAD / GROUP_REFERENTIAL / DELTA_MASTER / PACKAGER / ORDER_MANAGEMENT).

Initialization: true / false, permet de faire un init / run du chargement. Un init efface le paramètre qui stocke le timestamp du dernier chargement réussi et la table destination dans silver, avant de faire le chargement.

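Tables to import: liste des tables à charger (sélection multiple possible) ; la valeur « all », proposée par défaut, charge toutes les tables du fichier de configuration.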

Pipeline Name: Nom du pipeline exécutant le notebook.

Pipeline Run ID: ID du run du pipeline.

Data Factory Name: Nom de la ressource Data Factory exécutant le pipeline.

In [0]:
%run ./Utils/COMMON

In [0]:
import csv
import io
import os

# Source selector: its value picks the configuration folder read below.
dbutils.widgets.dropdown("source", "GROUP_REFERENTIAL", ["QAD", "GROUP_REFERENTIAL", "DELTA_MASTER", "PACKAGER","ORDER_MANAGEMENT"], "Source")
source: str = dbutils.widgets.get('source')

# Per-source configuration files: tables to load, and fields to clean.
params_file: str = f"dbfs:/mnt/dlk/{CONFIG_CONTAINER}/{source}/params.csv"
clean_file: str = f"dbfs:/mnt/dlk/{CONFIG_CONTAINER}/{source}/clean.csv"

# Build the choices of the table widget from the 'output' column of
# params.csv, plus the special value 'all'.
# NOTE(review): dbutils.fs.head only returns the beginning of the file
# (bounded by its max-bytes argument) - confirm params.csv always fits.
tables = csv.DictReader(io.StringIO(dbutils.fs.head(params_file)), delimiter=';')
tables_list = [table.get('output') for table in tables]
tables_list.append('all')

dbutils.widgets.text('init', 'false', 'Initialization')
dbutils.widgets.text('pipeline_run_id', '0', 'Pipeline Run ID')
dbutils.widgets.text('pipeline_name', '', 'Pipeline Name')
dbutils.widgets.text('data_factory_name', '', 'Data Factory Name')
dbutils.widgets.multiselect(
    name='tables_to_import',
    defaultValue='all',
    choices=tables_list,
    label='Tables to import (multi selection allowed)'
)

# List of the tables to import. The widget value is a single string with the
# selected entries separated by commas; str.split already returns a
# one-element list when there is no comma, so no special case is needed.
tables_to_import = dbutils.widgets.get('tables_to_import').split(",")

# Compare case-insensitively so that 'True' / 'TRUE' (e.g. a stringified
# boolean) also enables init mode instead of being silently ignored.
init: bool = (dbutils.widgets.get('init').strip().lower() == 'true')
pipeline_run_id: str = dbutils.widgets.get('pipeline_run_id')
pipeline_name: str = dbutils.widgets.get('pipeline_name')
data_factory_name: str = dbutils.widgets.get('data_factory_name')

errors_count = 0

if init:
    # Create the parameters table if it does not exist yet.
    LIN_create_table_if_not_exists(os.getenv('STORAGE_ACCOUNT'), PARAMETERS_TABLE)
    print("init mode enabled")

In [0]:
from typing import List

# Read the configuration file: each row gives an 'input' folder and an
# 'output' table name.
tables = csv.DictReader(io.StringIO(dbutils.fs.head(params_file)), delimiter=';')

# Read the cleaning configuration once (instead of once per table) and group
# the 'field' values by lower-cased 'table' name, keeping the file order.
to_clean = csv.DictReader(io.StringIO(dbutils.fs.head(clean_file)), delimiter=';')
fields_by_table = {}
for clean_row in to_clean:
    fields_by_table.setdefault(clean_row['table'].lower(), []).append(clean_row.get('field'))

# Case-insensitive selection; 'all' selects every table of the file.
requested_tables = {t.lower() for t in tables_to_import}
import_all = 'all' in requested_tables

# One parameter dictionary per selected table, passed to the CSV_TO_DELTA job.
params = []
for table in tables:
    folder_name = table.get('input')
    table_name = table.get('output')
    table_key = table_name.lower()

    if not import_all and table_key not in requested_tables:
        continue

    # Fields to clean for this table (empty when none are configured).
    fields_to_clean = fields_by_table.get(table_key, [])

    bronze_location: str = f"dbfs:/mnt/dlk/{BRONZE_CONTAINER}/{folder_name}/{table_name}/"
    silver_path: str = f"dbfs:/mnt/dlk/{SILVER_CONTAINER}/{folder_name}/{table_name}/"
    schema_path: str = f"dbfs:/mnt/dlk/{CONFIG_CONTAINER}/{folder_name}/TableSchemas/{table_name}.json"

    params.append({
        'input_file_path': bronze_location,
        'output_delta_table': silver_path,
        'schema_config_file_path': schema_path,
        'pool_name': source,
        'init': init,
        'folder_partition': 'year=*/month=*/day=*/time=*',
        'pipeline_run_id': pipeline_run_id,
        'pipeline_name': pipeline_name,
        'data_factory_name': data_factory_name,
        'delimiter': ';',
        'header': 'true',
        'encoding': 'UTF-8',
        'multiLine': 'True',
        'fields_to_clean': ','.join(fields_to_clean),
    })

print("tables_to_import: ", tables_to_import)
print(params)
print(len(params))

In [0]:
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

# Run the CSV_TO_DELTA notebook once per entry of params, with at most 9 runs
# in flight. The context manager releases the worker threads once map() has
# returned every result.
with ThreadPool(9) as pool:
    results = pool.map(lambda param: LIN_run_notebook('./Jobs/CSV_TO_DELTA', param), params)
errors_count = sum(results)

# If any load failed, raise so that the notebook itself ends in error.
# NOTE(review): assumes LIN_run_notebook returns 0 on success and a non-zero
# value on failure - confirm against its definition in Utils/COMMON.
if errors_count != 0:
    raise ValueError("Unable to load all files.")

In [0]:
# Display errors_count (initialised to 0, then set to the sum of the job results).
print(errors_count)