In [0]:
# import section
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name 'orders'.
spark = (
    SparkSession.builder
    .appName('orders')
    .getOrCreate()
)


In [0]:
%run /Workspace/apache-spark/databricks-project-fmcg-sports/utils/utilities

In [0]:
# Notebook parameters and storage paths.
# Two text widgets are declared: the catalog (widget name 'catlog', default 'fmcg')
# and the data source (default 'orders'). The widget name spelling is kept as-is
# because renaming it would change the parameter name the notebook exposes.
for widget_name, default_value in (('catlog', 'fmcg'), ('datasource', 'orders')):
    # text(name, default value, label) - the label repeats the widget name
    dbutils.widgets.text(widget_name, default_value, widget_name)

catlog = dbutils.widgets.get('catlog')
datasource = dbutils.widgets.get('datasource')

# Storage layout: <s3_bucket>/<datasource>/landing and <s3_bucket>/<datasource>/processed.
# NOTE(review): s3_bucket is not defined in this notebook - presumably it comes from
# the utilities notebook pulled in via %run; confirm.
base_path = f'{s3_bucket}/{datasource}'
landing_path = f'{base_path}/landing'
processed_path = f'{base_path}/processed'

In [0]:
# Read the orders full load: every CSV file currently in the landing folder.
# The first line of each file is treated as the header and column types are inferred.
orders_bronze = (
    spark.read.format('csv')
    .option('header','true')
    .option('inferSchema','true')
    .load(f'{landing_path}/*.csv')
    .withColumns({
        # Audit columns: load time plus the source file's name and size.
        'load_timestamp': f.current_timestamp(),
        'file_name': f.col('_metadata.file_name'),
        'file_size': f.col('_metadata.file_size'),
        # Date embedded in the file name: extract yyyy_mm_dd, turn '_' into '-', cast to date.
        # The name is read from _metadata directly instead of the 'file_name' column
        # created in this same withColumns call, so the expression does not rely on
        # the engine resolving a column alias defined alongside it.
        'file_date': f.regexp_replace(
                f.regexp_extract(f.col('_metadata.file_name'),r"(\d{4}_\d{2}_\d{2})",1),
                '_',
                '-'
        ).cast('date')
    })
)
# Show the row count explicitly; a bare mid-cell expression would compute it and discard it.
print(f'orders_bronze rows: {orders_bronze.count()}')
orders_bronze.limit(10).display()

In [0]:
# Save the bronze DataFrame as a Delta table, overwriting any previous load.
bronze_table = 'fmcg.bronze.orders_bronze'

(
    orders_bronze.write
    .mode('overwrite')
    .format('delta')
    # Delta table properties passed as writer options need the 'delta.' prefix;
    # without it the option is not stored as a table property and change data feed
    # stays disabled.
    .option('delta.enableChangeDataFeed','true')
    .saveAsTable(bronze_table)
)

# Row count of the table just written. The fully qualified name is used so the query
# does not depend on the session's current catalog and schema.
spark.sql(f'select count(*) from {bronze_table}').display()

In [0]:
# Move the ingested files from the landing folder to the processed folder.
# Only '.csv' entries are moved, matching the '*.csv' pattern used by the bronze read,
# so files that were never ingested (other extensions, sub-folders) stay in landing.
# NOTE(review): a CSV that arrives between the read and this step would be moved
# without having been loaded - confirm the upstream delivery schedule rules this out.
files = [file for file in dbutils.fs.ls(landing_path) if file.name.endswith('.csv')]
if not files:
    print('no files to move')
else:
    for file in files:
        dbutils.fs.mv(file.path, f"{processed_path}/{file.name}", True)


### Silver transformation

In [0]:
# Start the silver layer from the bronze table, keeping only the records
# whose order_qty is not null.
orders_silver = (
    spark.read.table('fmcg.bronze.orders_bronze')
    .where(f.col('order_qty').isNotNull())
)

orders_silver.display()

In [0]:
# Verify that no rows with a null order_qty remain in orders_silver.
# The count is printed because a bare mid-cell expression would be computed and discarded.
null_qty_rows = orders_silver.filter(f.col("order_qty").isNull()).count()
print(f'rows with null order_qty: {null_qty_rows}')

# Preview the product_id clean-up: values made up only of digits are cast to int,
# anything else is replaced by the placeholder 99999.
# The pattern uses '+' so ids of any length match; '^[0-9]$' accepted single-digit
# ids only and sent every longer id to the placeholder.
product_id_is_numeric = f.col('product_id').rlike('^[0-9]+$')
orders_silver.withColumn(
    'product_id',
    f.when(product_id_is_numeric, f.col('product_id').cast('int')).otherwise(99999).cast('int')
).display()

In [0]:
dis