In [7]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim, when
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import datetime

In [2]:
# Build (or reuse, if one is already active) the Spark session for this notebook.
# "local[4]" runs Spark in-process with four worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("sales_transactions")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session above.
sc = spark.sparkContext

In [3]:
# Take a single timestamp so the date part and the hour part always agree.
now = datetime.now()
date_str = f"{now:%Y%m%d}"  # year, month, day as eight digits
hour_str = f"{now:%H}"      # zero-padded hour on the 24-hour clock

In [4]:
# Location of the raw (bronze) CSV for the current date/hour.
bronze_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_transactions_SS_raw_{date_str}_{hour_str}.csv"

# The first row is used as the header; no schema is supplied to the reader.
# NOTE(review): `input` shadows the Python builtin of the same name. The name is
# kept because a later cell reads the raw frame through it.
input = spark.read.csv(bronze_path, header='true')
input.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|  transaction_id|customer_id|customer_fname|cusomter_lname|      cusomter_email|sales_agent_id|branch_id|product_id|   product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|      2023-10-18|trx-237976222990|      85517|           Mia|         Jones|mia.jones@outlook...|             6|        1|        10|        Sandals|        Footwear|   null|   null|   null|   null|

In [5]:
# One or more of the listed junk characters, plus whatever follows them up to the
# end of the value.
junk_tail_pattern = r'[;*#()&}$\[\]\{&"\'\/>:\%\^|<\\]+.*$'
# A trailing ".comm" (doubled "m").
double_m_pattern = r'\.comm$'

# "cusomter_email" is the (misspelled) column name as it arrives in the raw file.
email_without_junk = trim(regexp_replace(col("cusomter_email"), junk_tail_pattern, ''))

df_cleaned = (
    input
    # Step 1: drop the junk tail and surrounding whitespace.
    .withColumn("cleaned_email", email_without_junk)
    # Step 2: repair the ".comm" ending and store under the correctly spelled name.
    .withColumn("customer_email", regexp_replace(col("cleaned_email"), double_m_pattern, '.com'))
)
df_cleaned.select("customer_email").show(truncate=False)

+--------------------------+
|customer_email            |
+--------------------------+
|mia.jones@outlook.com     |
|michael.williams@yahoo.com|
|james.johnson@hotmail.com |
|william.moore@outlook.com |
|olivia.johnson@yahoo.com  |
|william.taylor@hotmail.com|
|john.wilson@yahoo.com     |
|james.miller@gmail.com    |
|sophia.moore@gmail.com    |
|john.wilson@yahoo.com     |
|sophia.wilson@yahoo.com   |
|emma.jones@hotmail.com    |
|emma.moore@gmail.com      |
|james.smith@yahoo.com     |
|mia.miller@hotmail.com    |
|john.johnson@hotmail.com  |
|james.smith@hotmail.com   |
|emma.moore@gmail.com      |
|emma.moore@hotmail.com    |
|alexander.moore@gmail.com |
+--------------------------+
only showing top 20 rows



In [8]:
# (offer column, discount percentage) pairs, checked in this order: the first
# offer column equal to the string "True" decides the row's discount.
offer_discounts = [
    ("offer_1", 5),
    ("offer_2", 10),
    ("offer_3", 15),
    ("offer_4", 20),
    ("offer_5", 25),
]

# Fold the table into a single when(...).when(...)...otherwise(0) expression.
(first_offer, first_pct), *remaining_offers = offer_discounts
discount_expr = reduce(
    lambda chain, offer: chain.when(col(offer[0]) == "True", offer[1]),
    remaining_offers,
    when(col(first_offer) == "True", first_pct),
).otherwise(0)  # rows with no offer flagged get 0

df_cleaned = df_cleaned.withColumn("discount_perc", discount_expr)

# Drop the intermediate and the misspelled source email column; "customer_email"
# carries the cleaned value from here on.
# NOTE(review): "cusomter_lname" is still misspelled in the output — confirm
# whether downstream consumers expect it to be renamed as well.
df_modified = df_cleaned.drop("cleaned_email", "cusomter_email")

In [9]:
# Target location in the silver layer for the current date/hour. Spark writes a
# directory of part files at this path rather than a single CSV file.
silver_path = f"hdfs:///data/retail_silver/{date_str}/{hour_str}/sales_transactions_SS_cleaned_{date_str}_{hour_str}.csv"

# Without an explicit save mode Spark raises if the path already exists, so
# re-running this cell (or the whole notebook) within the same hour would fail.
# Overwriting makes the write idempotent for a given date/hour.
df_modified.write.mode("overwrite").csv(silver_path, header='true')

In [10]:
# Bare last expression: lets the notebook display df_modified as this cell's output.
df_modified

transaction_date,transaction_id,customer_id,customer_fname,cusomter_lname,sales_agent_id,branch_id,product_id,product_name,product_category,offer_1,offer_2,offer_3,offer_4,offer_5,units,unit_price,is_online,payment_method,shipping_address,customer_email,discount_perc
2023-10-18,trx-237976222990,85517,Mia,Jones,6,1,10,Sandals,Footwear,,,,,True,1,39.99,no,Cash,,mia.jones@outlook...,25
2022-5-8,trx-891084445200,85497,Michael,Williams,9,2,27,Iron,Appliances,,,True,,,1,29.99,no,Credit Card,,michael.williams@...,15
2022-11-20,trx-911377237189,85485,James,Johnson,6,5,9,Boots,Footwear,,,,,,8,129.99,no,Cash,,james.johnson@hot...,0
2023-5-18,trx-121263458494,85468,William,Moore,9,6,1,Laptop,Electronics,True,,,,,4,999.99,no,Credit Card,,william.moore@out...,5
2022-2-7,trx-617255070261,85535,Olivia,Johnson,4,4,30,Electric Kettle,Appliances,,,,,,3,24.99,no,Cash,,olivia.johnson@ya...,0
2022-2-4,trx-110203063855,85545,William,Taylor,5,2,9,Boots,Footwear,,,,,,7,129.99,no,Cash,,william.taylor@ho...,0
2022-9-8,trx-013322439991,85552,John,Wilson,8,5,29,Hair Straightener,Appliances,True,,,,,4,39.99,no,Credit Card,,john.wilson@yahoo...,5
2022-11-2,trx-443629484380,85515,James,Miller,6,5,28,Hair Dryer,Appliances,,True,,,,5,19.99,no,Credit Card,,james.miller@gmai...,10
2023-10-12,trx-922479550468,85494,Sophia,Moore,11,2,7,Dress,Clothing,,,True,,,5,59.99,no,Credit Card,,sophia.moore@gmai...,15
2022-11-5,trx-954632465852,85552,John,Wilson,2,2,24,Blender,Appliances,,,,,,3,49.99,no,Credit Card,,john.wilson@yahoo...,0


In [11]:
# Stop the Spark session; cells that use `spark` cannot run after this
# without re-creating the session.
spark.stop()