In [155]:
import re
from datetime import datetime
from functools import reduce

from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, trim, when

In [156]:
# Build (or reuse) the Spark session for this notebook: local master with
# 4 worker threads, app name "sales_transactions".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("sales_transactions")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session above.
sc = spark.sparkContext

In [157]:
# Capture the run time once so the input and output paths built from it
# always refer to the same date/hour partition.
now = datetime.now()
date_str = f"{now:%Y%m%d}"  # e.g. "20230804"
hour_str = f"{now:%H}"      # zero-padded 24-hour clock, "00".."23"

In [158]:
# Location of the raw (bronze) extract for the current date/hour partition.
raw_csv_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_transactions_SS_raw_{date_str}_{hour_str}.csv"

# Read the raw file, treating the first line as the header row.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because the email-cleaning cell reads from it.
input = spark.read.csv(raw_csv_path, header='true')

# Preview the first five rows.
input.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|  transaction_id|customer_id|customer_fname|cusomter_lname|      cusomter_email|sales_agent_id|branch_id|product_id|     product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+-----------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|      2022-12-13|trx-675189967400|      85552|         James|         Smith|james.smith@hotma...|             8|        3|        20|            Heels|        Footwear|   null|   null|   True|

In [159]:
# Matches the first run of unwanted punctuation and everything after it,
# up to the end of the value.
junk_suffix_pattern = r'[;*#()&}$\[\]\{&"\'\/>:\%\^|<\\]+.*$'

# Raw email with the junk suffix removed and surrounding whitespace trimmed.
# The source column is spelled "cusomter_email" in the raw file's header.
email_without_junk = trim(regexp_replace(col("cusomter_email"), junk_suffix_pattern, ''))

df_cleaned = (
    input
    .withColumn("cleaned_email", email_without_junk)
    # Repair addresses that end in ".comm" and store the result under the
    # correctly spelled column name.
    .withColumn("customer_email", regexp_replace(col("cleaned_email"), r'\.comm$', '.com'))
)

# Show the cleaned addresses in full (no column truncation).
df_cleaned.select("customer_email").show(truncate=False)

+---------------------------+
|customer_email             |
+---------------------------+
|james.smith@hotmail.com    |
|john.davis@gmail.com       |
|sophia.brown@hotmail.com   |
|michael.davis@hotmail.com  |
|sophia.taylor@gmail.com    |
|michael.smith@outlook.com  |
|john.johnson@outlook.com   |
|mia.johnson@gmail.com      |
|james.williams@gmail.com   |
|emma.wilson@outlook.com    |
|mia.jones@gmail.com        |
|alexander.smith@outlook.com|
|emma.miller@gmail.com      |
|ava.johnson@gmail.com      |
|michael.brown@gmail.com    |
|william.miller@yahoo.com   |
|ava.smith@gmail.com        |
|michael.davis@outlook.com  |
|ava.smith@gmail.com        |
|william.miller@hotmail.com |
+---------------------------+
only showing top 20 rows



In [160]:
# Derive the discount percentage from the offer flags. The branches are
# evaluated in order and the first flag equal to "True" wins, so a row with
# several offers set gets the lowest-numbered offer's percentage; rows with
# no offer set get 0.
# The flags are compared against the string "True" (the CSV is read without
# an explicit schema, so presumably they are strings; confirm if a schema is
# ever added).
# `when` must be imported from pyspark.sql.functions in the imports cell.
discount_perc_expr = (
    when(col("offer_1") == "True", 5)
    .when(col("offer_2") == "True", 10)
    .when(col("offer_3") == "True", 15)
    .when(col("offer_4") == "True", 20)
    .when(col("offer_5") == "True", 25)
    .otherwise(0)
)
df_cleaned = df_cleaned.withColumn("discount_perc", discount_perc_expr)

# Drop the intermediate email column and the original misspelled one; the
# cleaned value lives on in "customer_email".
df_modified = df_cleaned.drop("cleaned_email", "cusomter_email")

In [161]:
# Destination of the cleaned (silver) data for the same date/hour partition
# the raw file was read from.
cleaned_csv_path = f"hdfs:///data/retail_silver/{date_str}/{hour_str}/sales_transactions_SS_cleaned_{date_str}_{hour_str}.csv"

# Write the cleaned rows as CSV with a header line.
# NOTE(review): no save mode is given, so Spark's default applies. Re-running
# this cell within the same hour may fail if the target already exists;
# confirm whether an explicit mode (e.g. overwrite) is wanted.
df_modified.write.csv(cleaned_csv_path, header='true')

In [162]:
# Bare expression as the last line of the cell: shows the cleaned DataFrame
# as this cell's output.
df_modified

transaction_date,transaction_id,customer_id,customer_fname,cusomter_lname,sales_agent_id,branch_id,product_id,product_name,product_category,offer_1,offer_2,offer_3,offer_4,offer_5,units,unit_price,is_online,payment_method,shipping_address,customer_email,discount_perc
2022-12-13,trx-675189967400,85552,James,Smith,8,3,20,Heels,Footwear,,,True,,,10,59.99,no,Credit Card,,james.smith@hotma...,15
2023-2-22,trx-854274530672,85514,John,Davis,7,5,25,Washing Machine,Appliances,True,,,,,2,499.99,no,Cash,,john.davis@gmail.com,5
2022-8-11,trx-133048404295,85464,Sophia,Brown,8,5,29,Hair Straightener,Appliances,,,,,,4,39.99,no,Credit Card,,sophia.brown@hotm...,0
2023-7-8,trx-945140296146,85545,Michael,Davis,7,1,1,Laptop,Electronics,,,,True,,4,999.99,no,Credit Card,,michael.davis@hot...,20
2023-1-7,trx-309013678277,85558,Sophia,Taylor,10,3,22,Coffee Maker,Appliances,,True,,,,6,79.99,no,Cash,,sophia.taylor@gma...,10
2023-8-4,trx-304375793435,85463,Michael,Smith,9,1,9,Boots,Footwear,,,,,,5,129.99,no,Cash,,michael.smith@out...,0
2022-6-20,trx-561475327270,85484,John,Johnson,2,4,17,Blouse,Clothing,,,,True,,7,29.99,no,Credit Card,,john.johnson@outl...,20
2023-3-4,trx-443211888905,85497,Mia,Johnson,4,4,13,Printer,Electronics,,,,,,4,149.99,no,Credit Card,,mia.johnson@gmail...,0
2023-3-19,trx-128107621213,85530,James,Williams,9,1,27,Iron,Appliances,,True,,,,6,29.99,no,Cash,,james.williams@gm...,10
2023-8-1,trx-960122701579,85547,Emma,Wilson,9,5,13,Printer,Electronics,,,,,,3,149.99,no,Credit Card,,emma.wilson@outlo...,0


In [163]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()