Notebook permettant de charger des fichiers csv dans une table delta.

Storage Account Name: compte de stockage contenant la table de paramétrage (lu depuis la variable d'environnement STORAGE_ACCOUNT).

Initialization: true / false, permet de faire un init / run du chargement. Un init efface le paramètre qui stocke le timestamp du dernier chargement réussi et la table destination dans silver, avant de faire le chargement.

Input Path: chemin abfss du fichier source.

Output Path: chemin abfss de la table delta destination.

Pool: nom du pool utilisé par spark. Les pools sont définis au niveau du cluster dans Advanced Options -> Spark Config (= spark.scheduler.allocation.file /dbfs/FileStore/fairscheduler.xml).

CSV Delimiter: séparateur du fichier csv.

CSV Header: indique si le fichier csv a un en-tête.

Input File Encoding: format d'encodage du fichier csv.

Input File Multiline: indique si le fichier csv est multiligne.

Pipeline Name: Nom du pipeline exécutant le notebook.

Pipeline Run ID: ID du run du pipeline.

Data Factory Name: Nom de la ressource Data Factory exécutant le pipeline. 

Fields To Clean: Liste des champs à nettoyer.

In [0]:
%run ../Utils/COMMON

In [0]:
from datetime import datetime
import os

# Declare the notebook widgets (name, default value, label).
# Creation order is kept identical to the original declaration order.

# Free-text widgets: paths, scheduler pool, run mode and pipeline context.
for widget_name, default_value, label in (
    ('input_file_path', 'dbfs:/mnt/dlk/bronze/QAD/bom_mstr/', 'Input Path'),
    ('output_delta_table', 'dbfs:/mnt/dlk/silver/QAD/bom_mstr/', 'Output Path'),
    ('schema_config_file_path', 'dbfs:/mnt/dlk/config-files/QAD/TableSchemas/bom_mstr.json', 'Schema Config File Path'),
    ('pool_name', 'byod', 'Pool'),
    ('init', 'false', 'Initialization'),
    ('pipeline_run_id', '0', 'Pipeline Run ID'),
    ('pipeline_name', '', 'Pipeline Name'),
    ('data_factory_name', '', 'Data Factory Name'),
    ('folder_partition', 'year=*/month=*/day=*/time=*', 'Folder Structure'),
):
    dbutils.widgets.text(widget_name, default_value, label)

# Dropdown widgets: CSV parsing options (name, default value, choices, label).
for widget_name, default_value, choices, label in (
    ("delimiter", ";", [",", ";"], "CSV Delimiter"),
    ("header", "true", ["false", "true"], "CSV Header"),
    ("encoding", "UTF-8", ["UTF-8", "ISO-8859-1"], "Input File Encoding"),
    ("multiLine", "True", ["False", "True"], "Input File Multiline"),
):
    dbutils.widgets.dropdown(widget_name, default_value, choices, label)

# Comma-separated list of columns to clean (empty by default).
dbutils.widgets.text('fields_to_clean', '', 'Fields To Clean')

# Notebook start time (naive UTC); stored later as the "last successful load"
# parameter once the load completes.
start_date = datetime.utcnow()

# Short alias for reading widget values.
get_widget = dbutils.widgets.get

# Run mode: the 'init' widget is a string, only the exact value 'true' enables it.
init: bool = get_widget('init') == 'true'

# Source, destination and schema locations.
input_full_path: str = get_widget('input_file_path')
output_full_path: str = get_widget('output_delta_table')
schema_full_path: str = get_widget('schema_config_file_path')
folder_partition: str = get_widget('folder_partition')

# Spark scheduler pool and calling pipeline context.
pool_name: str = get_widget('pool_name')
pipeline_run_id: str = get_widget('pipeline_run_id')
pipeline_name: str = get_widget('pipeline_name')
data_factory_name: str = get_widget('data_factory_name')

# CSV reader options; kept as strings and passed as-is to the reader options.
delimiter: str = get_widget('delimiter')
header: str = get_widget('header')
encoding: str = get_widget('encoding')
multiLine: str = get_widget('multiLine')

# Storage account name comes from the environment (None when the variable is unset).
storage_account_name = os.environ.get('STORAGE_ACCOUNT')

# Parse the comma-separated 'Fields To Clean' widget into a list of column names.
# Each name is stripped and blank entries are dropped, so inputs such as
# "a, b" or "a,b," do not produce names (" b", "") that match no column and
# would make the later withColumn/regexp_replace step fail.
# An empty or whitespace-only widget value yields an empty list.
fields_to_clean = [
    field_name.strip()
    for field_name in dbutils.widgets.get('fields_to_clean').split(',')
    if field_name.strip()
]

#monitor= LIN_Monitor(f'CSV_TO_DELTA', data_factory_name, pipeline_run_id, pipeline_name, LIN_get_source(input_full_path), LIN_Stage.SILVER)

In [0]:
if init:
  # Init mode: reset the destination before a full reload.
  print(output_full_path)

  # Probe the destination folder. Only this existence check is best-effort:
  # any failure of the listing is reported as "folder doesn't exist" (as before).
  try:
    dbutils.fs.ls(output_full_path)
  except Exception:
    print(f"folder {output_full_path} doesn't exist")
  else:
    # The folder exists: delete the delta table recursively.
    # A failure here is no longer swallowed. Otherwise the loading parameter
    # below would be cleared and the full reload appended onto the old table.
    dbutils.fs.rm(output_full_path, True)

  # Delete the last-load parameter so that every source folder is reloaded.
  LIN_delete_loading_parameter(storage_account_name, input_full_path)

In [0]:
import json
from pyspark.sql.types import StructType

# Build the source schema from its JSON description: read the schema config
# file as text, parse the JSON, and convert it to a Spark StructType.
# NOTE(review): dbutils.fs.head returns at most a fixed number of bytes
# (Databricks documents a 65536-byte default) - confirm schema files stay below it.
schema_json: str = dbutils.fs.head(schema_full_path)
schema: StructType = StructType.fromJson(json.loads(schema_json))

In [0]:
from pyspark.sql.readwriter import DataFrameReader
from pyspark.sql.functions import regexp_replace

# Load the CSV files found under the partition folders into the delta table,
# then record the start time of this run as the last successful load.
# Any error is re-raised unchanged so the calling pipeline sees the failure.
try:
    # CSV reader configured from the widget values. basePath is the input root;
    # the year/month/day/time columns used below come from the folder names.
    df_reader: DataFrameReader = (
        spark.read.format("csv")
        .option("multiLine", multiLine)
        .option('basePath', input_full_path)
        .option('header', header)
        .option('delimiter', delimiter)
        .option('encoding', encoding)
    )

    # folderDatetime is rebuilt from the folder columns:
    # 'yyyyMMdd' from year/month/day, 'HHmmss' from time * 100 padded to 6 digits.
    folder_day = (f.col('year') * 10000 + f.col('month') * 100 + f.col('day')).cast('string')
    folder_time = f.format_string("%06d", f.col('time') * 100)
    folder_datetime = f.to_timestamp(f.concat(folder_day, f.lit(' '), folder_time), 'yyyyMMdd HHmmss')

    # Read the files and add the filePath and folderDatetime columns.
    source_glob = input_full_path + folder_partition + '/*.csv'
    df = (
        df_reader.schema(schema)
        .load(source_glob)
        .withColumn('filePath', f.input_file_name())
        .withColumn('folderDatetime', folder_datetime)
    )

    # Double every backslash in the fields to clean.
    for field in fields_to_clean:
        df = df.withColumn(field, regexp_replace(field, '\\\\', '\\\\\\\\'))

    # Keep only folders newer than the last successful load, when one is recorded.
    last_load = LIN_get_loading_parameter(storage_account_name, input_full_path)
    if last_load:
        last_load_text = last_load.strftime('%Y-%m-%d %H:%M:%S')
        df = df.filter(f.col('folderDatetime') > last_load_text)

    # Select the scheduler pool for the jobs launched from this thread.
    spark.sparkContext.setLocalProperty("spark.scheduler.pool", pool_name)

    # Drop the technical columns, add the metadata columns, then remove spaces
    # from the column names.
    data_to_load = df.drop('filePath', 'year', 'month', 'day', 'time', 'folderDatetime')
    data_to_load = LIN_add_metadata_cols(data_to_load)
    data_to_load = data_to_load.select(
        [f.col(column_name).alias(column_name.replace(' ', '')) for column_name in data_to_load.columns]
    )

    # Count rows with an accumulator while they flow through the write,
    # instead of running a separate count.
    accumulator_count = spark.sparkContext.accumulator(0)

    def LIN_add_to_accu(x):
        """Increment the row accumulator and return the row unchanged."""
        accumulator_count.add(1)
        return x

    rdd = data_to_load.rdd.map(LIN_add_to_accu)
    currated_df = sqlContext.createDataFrame(rdd, data_to_load.schema)

    # Set the pool again right before the write, then append to the delta table
    # partitioned by IngestionDate.
    spark.sparkContext.setLocalProperty("spark.scheduler.pool", pool_name)
    delta_writer = currated_df.write.format("delta").mode("append").partitionBy("IngestionDate")
    delta_writer.save(output_full_path)

    # The load succeeded: store the start time of this run as the new last-load parameter.
    LIN_set_loading_parameter(storage_account_name, input_full_path, start_date)

    # Success metrics logging is currently disabled:
    #monitor.log_loading_metrics(LIN_Status.SUCCESS, input_full_path, output_full_path, start_date, datetime.utcnow(), '', accumulator_count.value, input_file_size=input_files_size, output_file_size=(after_write_table_size-before_write_table_size))

except Exception:
    # Failure metrics logging is currently disabled:
    #monitor.log_loading_metrics(LIN_Status.FAILURE, input_full_path, output_full_path, start_date, datetime.utcnow(), LIN_get_error_message())
    raise