Notebook permettant de charger la source de données WEB.

Initialization (widget `init`) : true / false, permet de choisir entre un init et un run du chargement. Un init efface le paramètre qui stocke le timestamp du dernier chargement réussi ainsi que la table de destination dans silver, avant de faire le chargement.

In [0]:
%run ./Utils/COMMON

In [0]:
import os
from datetime import datetime

# Notebook parameters and constants shared by the cells below.

# Declare the 'init' text widget (default 'false', label 'Initialization').
dbutils.widgets.text('init', 'false', 'Initialization')

# True requests a full reload. The widget value is free text, so normalise
# whitespace and casing: 'True', 'TRUE' or ' true ' must not silently fall
# back to a non-init run.
init: bool = dbutils.widgets.get('init').strip().lower() == 'true'

# Naive UTC timestamp captured before any data is read; it is what gets stored
# as the last-load marker once the write has succeeded.
start_date = datetime.utcnow()

# Bronze source folder and silver destination of the eurofxref feed.
input_path = "dbfs:/mnt/dlk/bronze/WEB/eurofxref/"
output_path = "dbfs:/mnt/dlk/silver/WEB/eurofxref"

# Storage account name from the environment; None when STORAGE_ACCOUNT is unset.
storage_account_name = os.getenv('STORAGE_ACCOUNT')

In [0]:
if init:
    # Init mode: remove the silver output so the feed is rebuilt from scratch.
    print(output_path)
    try:
        # Existence probe only; listing a missing folder raises.
        dbutils.fs.ls(output_path)
    except Exception:
        # NOTE(review): any listing failure is reported as a missing folder,
        # as before; narrow further once the exact exception type is confirmed.
        print(f"folder {output_path} doesn't exist")
    else:
        # Kept outside the try on purpose: a failed delete must abort the run
        # instead of being reported as "folder doesn't exist", otherwise the
        # reload would append on top of the old data.
        dbutils.fs.rm(output_path, True)

    # Clear the stored last-load parameter so everything gets reloaded.
    LIN_delete_loading_parameter(storage_account_name, input_path)

In [0]:
from pyspark.sql.readwriter import DataFrameReader

# JSON reader rooted at the bronze folder; basePath keeps the year=/month=/
# day=/time= path segments available as columns.
df_reader: DataFrameReader = spark.read.option('basePath', input_path).format("json")

# Rebuild a timestamp from the folder columns:
#   year*10000 + month*100 + day -> 'yyyyMMdd'
#   time*100, zero-padded to 6   -> 'HHmmss' (presumably `time` holds HHmm - TODO confirm)
_folder_day = (f.col('year') * 10000 + f.col('month') * 100 + f.col('day')).cast('string')
_folder_time = f.format_string("%06d", f.col('time') * 100)
_folder_datetime = f.to_timestamp(
    f.concat(_folder_day, f.lit(' '), _folder_time),
    'yyyyMMdd HHmmss',
)

df = df_reader.load(
    f"{input_path}/year=*/month=*/day=*/time=*/eurofxref.json",
    multiLine=True,
)
df = df.withColumn('filePath', f.input_file_name())
df = df.withColumn('folderDatetime', _folder_datetime)

# Fetch the last load date and keep only the folders that are newer than it.
last_load = LIN_get_loading_parameter(storage_account_name, input_path)
print(last_load)
if last_load:
    df = df.filter(f.col('folderDatetime') > last_load.strftime('%Y-%m-%d %H:%M:%S'))

# Schema dump for debugging, dev environment only.
if os.getenv('ENVIRONMENT') == 'dev':
    df.printSchema()

In [0]:
from pyspark.sql.functions import *

# Flatten the nested envelope into one row per (date, currency) rate.
# The generator alias "cb" is produced in one projection and its fields are
# read in the next one: referencing "cb" inside the very select that creates
# it depends on lateral alias resolution and is not portable across Spark
# versions. Selecting only the three output columns makes drop("cb") unnecessary.
df = (
    df.select(explode("gesmes:Envelope.cube.cube").alias("c"))
    # One row per entry of the inner cube array, carrying the parent's date.
    .select(
        to_date(col("c.@time"), "yyyy-MM-dd").alias("Date"),
        explode("c.cube").alias("cb"),
    )
    # Final columns: Date, Currencies, Rate.
    .select(
        col("Date"),
        col("cb.@currency").alias("Currencies"),
        col("cb.@rate").alias("Rate"),
    )
)
# Helper defined outside this cell; it adds the metadata columns.
df = LIN_add_metadata_cols(df)

# Preview for debugging, dev environment only.
if os.getenv('ENVIRONMENT') == 'dev':
    display(df)

Date,Currencies,Rate,BronzeFileNameSource,BronzeFolderSource,IngestionDate


In [0]:
# Append the new rows to the silver delta output, partitioned by IngestionDate.
(
    df.write
    .format("delta")
    .mode("append")
    .partitionBy("IngestionDate")
    .save(output_path)
)

# Reached only if the write above raised no error: record this run's start
# time as the new last-load parameter.
LIN_set_loading_parameter(storage_account_name, input_path, start_date)