In [1]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim, when
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import datetime
import subprocess
from py4j.java_gateway import java_import
import os


In [2]:
# Build (or reuse) the Spark session for this job: master "local[4]",
# application name "sales_transactions".
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("sales_transactions")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [3]:
# Capture the wall-clock time once so every path built from it in this run
# uses the same date/hour pair.
now = datetime.now()
date_str = f"{now:%Y%m%d}"  # e.g. "20240709"
hour_str = f"{now:%H}"      # zero-padded 24-hour clock, e.g. "17"

In [4]:
# Location of the raw parquet extract for the current date/hour.
bronze_path = (
    f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/"
    f"sales_transactions_SS_raw_{date_str}_{hour_str}.parquet"
)

# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because a later cell reads it.
input = spark.read.parquet(bronze_path)
input.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|transaction_date|  transaction_id|customer_id|customer_fname|cusomter_lname|      cusomter_email|sales_agent_id|branch_id|product_id|   product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+
|       2023-5-20|trx-152546429674|      85469|     Alexander|         Brown|alexander.brown@g...|           1.0|      2.0|        22|   Coffee Maker|      Appliances|   null|   null|   null|   null|

In [5]:
# Step 1: cut the address at the first run of junk characters (dropping
# everything after it) and trim surrounding whitespace.
# The source column really is spelled "cusomter_email" in the input schema.
junk_tail_pattern = r'[;*#()&}$\[\]\{&"\'\/>:\%\^|<\\]+.*$'
email_without_junk = trim(regexp_replace(col("cusomter_email"), junk_tail_pattern, ''))

df_cleaned = (
    input
    .withColumn("cleaned_email", email_without_junk)
    # Step 2: repair a trailing ".comm" typo into ".com".
    .withColumn("customer_email", regexp_replace(col("cleaned_email"), r'\.comm$', '.com'))
)
df_cleaned.select("customer_email").show(truncate=False)

+----------------------------+
|customer_email              |
+----------------------------+
|alexander.brown@gmail.com   |
|william.brown@gmail.com     |
|john.williams@gmail.com     |
|alexander.miller@yahoo.com  |
|john.brown@hotmail.com      |
|sophia.wilson@hotmail.com   |
|alexander.moore@yahoo.com   |
|alexander.wilson@hotmail.com|
|michael.miller@yahoo.com    |
|michael.brown@yahoo.com     |
|john.taylor@yahoo.com       |
|alexander.davis@yahoo.com   |
|michael.brown@hotmail.com   |
|james.smith@yahoo.com       |
|emma.johnson@hotmail.com    |
|john.johnson@yahoo.com      |
|james.davis@yahoo.com       |
|ava.miller@outlook.com      |
|olivia.brown@outlook.com    |
|sophia.brown@gmail.com      |
+----------------------------+
only showing top 20 rows



In [6]:
# Offer flag column -> discount percentage, in evaluation order. The first
# flag whose value equals the string "True" decides the discount.
offer_discounts = [
    ("offer_1", 5),
    ("offer_2", 10),
    ("offer_3", 15),
    ("offer_4", 20),
    ("offer_5", 25),
]

(first_offer, first_pct), *remaining_offers = offer_discounts
discount_expr = when(col(first_offer) == "True", first_pct)
for offer_col, pct in remaining_offers:
    discount_expr = discount_expr.when(col(offer_col) == "True", pct)

# Rows with no matching offer flag get a 0 discount.
df_cleaned = df_cleaned.withColumn("discount_perc", discount_expr.otherwise(0))

# Drop the intermediate and misspelled e-mail columns, and fix the
# misspelled last-name column.
df_modified = (
    df_cleaned
    .drop("cleaned_email", "cusomter_email")
    .withColumnRenamed("cusomter_lname", "customer_lname")
)

In [7]:
# Write the cleaned frame as a single partition into this hour's silver
# directory, overwriting whatever is already there.
silver_dir = f"hdfs:///data/retail_silver/{date_str}/{hour_str}"
single_partition_df = df_modified.repartition(1)
single_partition_df.write.mode('overwrite').parquet(silver_dir)

In [8]:
# List the silver output directory with `hadoop fs -ls -C`, capturing stdout
# and stderr so a later cell can inspect the result.
silver_listing_cmd = ["hadoop", "fs", "-ls", "-C", f"hdfs:///data/retail_silver/{date_str}/{hour_str}/"]
list_files_process = subprocess.run(
    silver_listing_cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

In [9]:
# Rename the auto-generated part file in the silver directory to the fixed
# "sales_transactions_SS_cleaned_<date>_<hour>.parquet" name.
silver_dir = f"hdfs:///data/retail_silver/{date_str}/{hour_str}"

if list_files_process.returncode != 0:
    print(f"Error listing files in {silver_dir}: {list_files_process.stderr.decode()}")
else:
    file_list = list_files_process.stdout.decode().splitlines()
    # Select the data file by name instead of by position: indexing the
    # listing (e.g. file_list[1]) only works while a marker file such as
    # _SUCCESS happens to sort first, and raises IndexError when it is absent.
    part_files = [
        path for path in file_list
        if path.rsplit("/", 1)[-1].startswith("part-") and path.endswith(".parquet")
    ]
    if len(part_files) != 1:
        # Zero or several candidates: renaming would be a guess, so report it.
        print(f"Expected exactly one part file in {silver_dir}, found {len(part_files)}: {file_list}")
    else:
        file_to_rename = part_files[0]
        print(file_to_rename)
        new_filename = f"{silver_dir}/sales_transactions_SS_cleaned_{date_str}_{hour_str}.parquet"
        move_process = subprocess.run(
            ["hadoop", "fs", "-mv", file_to_rename, new_filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Surface a failed rename instead of silently ignoring the exit code.
        if move_process.returncode != 0:
            print(f"Error renaming {file_to_rename} to {new_filename}: {move_process.stderr.decode()}")


hdfs:///data/retail_silver/20240709/17/part-00000-649f7d29-e290-4222-89fd-a6c6250999d9-c000.snappy.parquet


In [10]:
# Bare expression as the cell's last line so the notebook displays df_modified.
df_modified

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,sales_agent_id,branch_id,product_id,product_name,product_category,offer_1,offer_2,offer_3,offer_4,offer_5,units,unit_price,is_online,payment_method,shipping_address,customer_email,discount_perc
2023-10-25,trx-072037549384,85550,Emma,Wilson,2.0,3.0,3,Tablet,Electronics,,,,,,7,299.99,no,Cash,,emma.wilson@outlo...,0
2022-5-8,trx-125208155197,85512,Olivia,Miller,9.0,5.0,11,TV,Electronics,,,,,,3,899.99,no,Credit Card,,olivia.miller@out...,0
2023-5-3,trx-667682345565,85512,Olivia,Miller,7.0,2.0,10,Sandals,Footwear,,,,True,,3,39.99,no,Credit Card,,olivia.miller@out...,20
2022-1-9,trx-706068352444,85526,Sophia,Smith,2.0,4.0,2,Smartphone,Electronics,,,,,True,7,699.99,no,Credit Card,,sophia.smith@hotm...,25
2022-12-10,trx-040134528974,85535,Michael,Davis,9.0,3.0,16,Skirt,Clothing,,,,True,,7,39.99,no,Cash,,michael.davis@out...,20
2023-5-17,trx-197763314500,85555,John,Johnson,1.0,4.0,6,Jeans,Clothing,,,,,,7,49.99,no,Credit Card,,john.johnson@hotm...,0
2023-10-17,trx-390556792430,85486,Olivia,Smith,7.0,5.0,27,Iron,Appliances,,,,,,5,29.99,no,Cash,,olivia.smith@outl...,0
2022-2-16,trx-589236970173,85484,Olivia,Brown,2.0,1.0,21,Microwave,Appliances,,,,,,1,129.99,no,Cash,,olivia.brown@hotm...,0
2023-6-26,trx-741827755865,85531,Michael,Taylor,2.0,1.0,19,Sandals,Footwear,True,,,,,2,29.99,no,Cash,,michael.taylor@ho...,5
2023-8-1,trx-769118815563,85535,Michael,Davis,7.0,2.0,3,Tablet,Electronics,,,,,,7,299.99,no,Credit Card,,michael.davis@out...,0


In [11]:
# Stop the Spark session created at the top of the notebook.
spark.stop()