In [0]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
from delta.tables import DeltaTable
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below; getOrCreate() hands back
# the already-active session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName('price-procissing')
    .getOrCreate()
)

In [0]:
%run /Workspace/apache-spark/databricks-project-fmcg-sports/utils/utilities

In [0]:
# Load the raw gross-price CSV from the S3 bucket (bronze layer).
# `s3_bucket` is not defined in this notebook — presumably it comes from the
# utilities notebook executed above; verify there.
gross_price_csv_path = f'{s3_bucket}/gross_price/gross_price.csv'

# The first CSV line is the header; column types are inferred from the data.
csv_reader = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
)

# Stamp every row with the time it was read.
price_bronze = csv_reader.load(gross_price_csv_path).withColumn(
    'readTimeStamp', f.current_timestamp()
)


In [0]:
# Profile the raw data: summary statistics for each column.
price_summary = price_bronze.summary()
price_summary.display()

In [0]:
# Ad-hoc profiling checks, kept for reference:
#price_bronze.select('product_id').distinct().count()
#price_bronze.groupBy('product_id').count().show()
price_bronze.display()
# Findings from the preview:
# - The `month` column is not consistent: dates arrive in several different
#   formats, while the gold table keeps only the year.
# - The `gross_price` column contains negative numbers as well as
#   non-numeric ("unknown") values.

In [0]:
# Clean the two dirty columns found while profiling the raw data.
#
# month:       parsed against each known date layout in turn; the first layout
#              that parses wins, and a value matching none of them becomes NULL.
# gross_price: a plain decimal number (optional leading minus) is cast to
#              double and made positive; anything else (text, NULL) becomes 0.0.
month_formats = ["yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd", "dd-MM-yyyy"]

# True only for strings such as "12", "-12" or "12.50"; NULL or non-numeric
# text does not match and therefore falls through to the 0.0 default below.
gross_price_is_numeric = f.col("gross_price").rlike(r'^-?\d+(\.\d+)?$')

# NOTE: the cleaned frame is deliberately stored back under `price_bronze`
# because the cell that builds the gold table reads it under that name.
price_bronze = price_bronze.withColumns({
    'month': f.coalesce(
        *[f.try_to_date(f.col("month"), date_format) for date_format in month_formats]
    ),
    'gross_price': f.when(
        gross_price_is_numeric,
        # abs() replaces the hand-written "multiply negatives by -1" branch.
        f.abs(f.col("gross_price").cast('double'))
    ).otherwise(f.lit(0.0))  # double literal so both branches share one type
})
# price_bronze.limit(4).display()

# The product table carries the product_key column that downstream tables
# rely on, so every cleaned price row is enriched with it via an inner join
# on product_id (price rows without a matching product are dropped).
product_lookup = spark.read.table(f'{catalog}.{silver_schema}.product')
price_with_key = (
    price_bronze
    .join(product_lookup, on='product_id', how='inner')
    .select("product_id", "product_key", "month", "gross_price", "readTimeStamp")
)

# Persist the enriched prices as a Delta table in the silver schema,
# replacing whatever the table held before.
(
    price_with_key.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f'{catalog}.{silver_schema}.gross_price')
)

In [0]:
# Build the gold staging table for gross prices.
# The product table carries the product_key column that the gold layer keys
# on, so the cleaned prices are inner-joined to it on product_id and only the
# columns needed downstream are kept.
product_silver = spark.read.table(f'{catalog}.{silver_schema}.product')
sb_gross_price_gold = (
    price_bronze
    .join(product_silver, on='product_id', how='inner')
    .select("product_key", "month", "gross_price")
)

# Overwrite the staging table in the gold schema.
# The change-data-feed option must carry the `delta.` prefix: Delta only
# treats writer options with that prefix as table properties, so the bare
# key `enableChangeDataFeed` was silently ignored. This also matches the
# option used when the silver gross_price table is written.
(
    sb_gross_price_gold.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f'{catalog}.{gold_schema}.dim_sb_gross_price')
)

In [0]:
# The staging table holds several rows per product (one per month).
# The business requirement is exactly one row per product, so only the row
# with the latest month is kept for each product_key.

# Read the staging gross-price table.
sb_gross_price = spark.read.table(f'{catalog}.{gold_schema}.dim_sb_gross_price')

# row_number() is used instead of rank(): rank() assigns 1 to every row tied
# on the latest month, which would leave several source rows per product and
# make the merge below fail on multiple matches. The secondary sort on
# gross_price makes the pick deterministic when months tie
# (NOTE(review): highest price wins on a tie — confirm the business rule).
latest_first = Window.partitionBy(f.col('product_key')).orderBy(
    f.col('month').desc(),
    f.col('gross_price').desc()
)
sb_gross_price = (
    sb_gross_price
    .withColumns({
        "row_num": f.row_number().over(latest_first),
        "year": f.year(f.col('month'))
    })
    .filter(f.col('row_num') == 1)
    .select("product_key", "year", "gross_price")
)

# Merge the latest staging prices into the master gross-price dimension.
# Read the target table as a Delta table.
dim_gross_price = DeltaTable.forName(spark, f'{catalog}.{gold_schema}.dim_gross_price')
dim_gross_price.alias('t').merge(
    sb_gross_price.alias('s'),
    f.col('t.product_code') == f.col('s.product_key')
).whenMatchedUpdate(
    # product_code is the merge key and already equals s.product_key on a
    # match, so only the year and price need refreshing.
    set={
        'year': f.col('s.year'),
        'price_inr': f.col('s.gross_price')
    }
).whenNotMatchedInsert(
    values={
        'year': f.col('s.year'),
        'price_inr': f.col('s.gross_price'),
        'product_code': f.col('s.product_key')
    }
).execute()


In [0]:
%sql
-- Inspect the gross-price dimension after the merge.
-- NOTE(review): the table name is hard-coded here, whereas the Python cells
-- build it from the catalog / gold_schema variables; confirm both resolve to
-- the same table.
SELECT * FROM fmcg.gold.dim_gross_price;