In [0]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
import logging
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
# getOrCreate() hands back the already-running session when there is one;
# otherwise a new session with the application name 'fmcg' is started.
spark = (
    SparkSession.builder
    .appName('fmcg')
    .getOrCreate()
)

In [0]:
%run /Workspace/apache-spark/databricks-project-fmcg-sports/utils/utilities

In [0]:
# Load the raw customers CSV into the bronze DataFrame.
# `s3_bucket` and `data_source` are not defined in this notebook
# (presumably they come from the utilities notebook run above).
file_path = f'{s3_bucket}/{data_source}/customers.csv'

# First row is the header; column types are inferred from the data.
csv_options = {'header': 'true', 'inferSchema': 'true'}
df_cust_bronze = (
    spark.read
    .format('csv')
    .options(**csv_options)
    .load(file_path)
)

# Visual check of what was read.
df_cust_bronze.display()


### Issues observed in the imported data
0. Duplicate records.
1. `customer_id` is an integer; it should be a string.
2. `customer_name` is inconsistent: it has leading spaces, mixed capitalisation, etc.
3. City names have spelling mistakes.


In [0]:
# --- Duplicates ---
# List every customer_id that occurs more than once in the bronze data.
(
    df_cust_bronze
    .groupBy('customer_id')
    .count()
    .where(f.col('count') > 1)
    .display()
)

# Keep one row per customer_id; this is the starting point of the silver DataFrame.
df_cust_silver = df_cust_bronze.drop_duplicates(subset=['customer_id'])

In [0]:
# Build a lookup DataFrame (raw city -> cleaned city) from `clean_city`, which
# is not defined in this notebook (presumably supplied by the utilities notebook).
df_city = spark.createDataFrame(clean_city, schema=["city", "clean_city"])

# Customer name with surrounding whitespace removed and each word capitalised.
normalised_name = f.initcap(f.trim(f.col('customer_name')))

# Replace each raw city with its cleaned spelling and normalise the name.
# NOTE(review): a city with no entry in the lookup comes out of the left join
# as null — confirm the lookup also lists the correctly spelled cities.
df_cust_silver = (
    df_cust_silver
    .join(df_city, on="city", how='left')
    .select(
        "customer_id",
        normalised_name.alias('customer_name'),
        f.col('clean_city').alias('city'),
    )
)

# Distinct cities that remain after cleaning.
df_cust_silver.select('city').distinct().show()

In [0]:
# A few customers still have a null city. Other rows with the same customer
# name carry a city, so list every row that shares a name with a null-city row
# to see the candidate cities.
df_null_city = df_cust_silver.filter(f.col("city").isNull())
df_null_city.show()

# A left-semi join keeps the rows of df_cust_silver whose customer_name occurs
# among the null-city rows, without adding columns or duplicating rows.
# It replaces Column.isin(<DataFrame>), which PySpark 3.x rejects because a
# DataFrame cannot be turned into a literal. The earlier inner join whose result
# was never used or displayed is dropped for the same reason: this one query
# answers the question.
(
    df_cust_silver
    .join(
        df_null_city.select('customer_name').distinct(),
        on="customer_name",
        how='left_semi',
    )
    .select(*df_cust_silver.columns)  # joining on a column name moves it first; restore the order
    .orderBy("customer_name")
    .show()
)

In [0]:
# Manually supplied cities for the customers whose city is still missing.
# Kept as (customer_id, city) pairs rather than a dict literal: a dict literal
# silently keeps only the last value of a repeated key, which is how the
# repeated id below went unnoticed.
missing_city_pairs = [
    ('789521', 'Bengalore'),  # NOTE(review): spelling looks like one of the typos being cleaned — confirm
    ('789603', 'Hyderabad'),
    ('789521', 'New Delhi'),  # FIXME: 789521 is listed twice; one of the two ids is presumably a typo
    ('789403', 'Hyderabad'),
    ('789420', 'New Delhi'),
]

# Report any customer_id that is listed more than once so the data gets fixed
# instead of one of its cities being discarded without a trace.
listed_ids = [cust_id for cust_id, _ in missing_city_pairs]
duplicate_ids = sorted({cust_id for cust_id in listed_ids if listed_ids.count(cust_id) > 1})
if duplicate_ids:
    logging.warning(
        "missing_city lists customer_id(s) %s more than once; only the last city given for each is used",
        ", ".join(duplicate_ids),
    )

# Last entry wins for a repeated id, exactly as the former dict literal behaved.
missing_city = dict(missing_city_pairs)

# Turn the manual fixes into a two-column DataFrame: customer_id, fixed_city.
df_missing_city = spark.createDataFrame(
    list(missing_city.items()),
    schema=['customer_id', 'fixed_city'],
)

# Left-join the fixes onto the customers, fill only the cities that are still
# null, then discard the helper column.
# NOTE(review): the keys of missing_city are strings, while customer_id is only
# cast to string in a later cell — confirm the join key types line up.
df_cust_silver = (
    df_cust_silver
    .join(df_missing_city, on='customer_id', how='left')
    .withColumn('city', f.coalesce(f.col("city"), f.col('fixed_city')))
    .drop('fixed_city')
)


In [0]:
# Final shaping before saving:
#   * customer_name becomes "<name> - <city>", with 'Unknown' when the city is null
#   * customer_id is cast to string
#   * three constant columns required by the gold schema are added:
#     market = India, platform = Sport Bar, channel = Acquisition
name_with_city = f.concat_ws(
    ' - ',
    f.col('customer_name'),
    f.coalesce(f.col("city"), f.lit('Unknown')),
)
gold_columns = {
    'customer_name': name_with_city,
    'customer_id': f.col("customer_id").cast("string"),
    'market': f.lit('India'),
    'platform': f.lit('Sport Bar'),
    'channel': f.lit('Acquisition'),
}
df_cust_silver = df_cust_silver.withColumns(gold_columns)

In [0]:
# Persist the cleaned customers as a Delta table in the silver layer,
# overwriting whatever the table held before.
silver_table = f'{catalog}.{silver_schema}.{data_source}'
(
    df_cust_silver.write
    .format('delta')
    .option('delta.enableChangeDataFeed','true')
    .option('mergeSchema','true')
    .mode('overwrite')
    .saveAsTable(silver_table)
)

In [0]:
# Preview the silver table. The name is built from the same catalog / schema /
# data_source variables used when the table is written, so the preview always
# shows the table that was saved (the former SQL cell hard-coded
# fmcg.silver.customers and could drift from it).
spark.table(f'{catalog}.{silver_schema}.{data_source}').display()

In [0]:
# No further transformation is required for this table, so the same DataFrame
# is also saved to the gold layer as sp_dim_<data_source>, overwriting any
# previous contents.
gold_sp_dim_table = f'{catalog}.{gold_schema}.sp_dim_{data_source}'
(
    df_cust_silver.write
    .format('delta')
    .option('mergeSchema','true')
    .mode('overwrite')
    .saveAsTable(gold_sp_dim_table)
)

In [0]:
# Merge the sp_dim customers into the gold dim_customers table.
# Table names are built from catalog / gold_schema / data_source, the same
# variables the other cells use to write sp_dim_<data_source> and to read
# dim_customers; hard-coding 'fmcg.gold...' here could merge from a different
# table than the one this notebook writes.
# The merge target must be a Delta table.
parent_cust = DeltaTable.forName(spark, f'{catalog}.{gold_schema}.dim_customers')

# Source rows, with customer_id / customer_name renamed to customer_code /
# customer (presumably the column names dim_customers uses — the merge below
# matches on customer_code).
child_cust_table = spark.table(f'{catalog}.{gold_schema}.sp_dim_{data_source}').select(
    f.col('customer_id').alias('customer_code'),
    f.col('customer_name').alias('customer'),
    'city',
    'market',
    'platform',
    'channel'
)

# Upsert on customer_code: matched rows get every column updated from the
# source, unmatched source rows are inserted.
(
    parent_cust.alias('p')
    .merge(
        source=child_cust_table.alias('c'),
        condition='p.customer_code = c.customer_code',
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Row count per platform in the gold dim_customers table.
(
    spark.table(f'{catalog}.{gold_schema}.dim_customers')
    .groupBy('platform')
    .count()
    .show()
)