In [1]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim, when
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import datetime
import subprocess
from py4j.java_gateway import java_import
import os


In [2]:
# Build (or reuse, via getOrCreate) the Spark session for this notebook.
# Master is "local[4]", the app is named "sales_transactions", and the
# event-log block-update setting is switched on.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("sales_transactions")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Keep the session's underlying SparkContext under the short name `sc`.
sc = spark.sparkContext

In [3]:
# Capture the run timestamp once; the date (YYYYMMDD) and hour (HH) strings
# derived from it are interpolated into the HDFS paths used by later cells.
now = datetime.now()
date_str, hour_str = (now.strftime(fmt) for fmt in ("%Y%m%d", "%H"))

In [4]:
# Read this run's raw sales-transactions parquet file from the bronze area.
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because the email-cleaning cell below reads the DataFrame under this name.
bronze_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_transactions_SS_raw_{date_str}_{hour_str}.parquet"
input = spark.read.parquet(bronze_path)

# Preview the first five rows.
input.show(n=5)

+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+----+------+
|transaction_date|  transaction_id|customer_id|customer_fname|cusomter_lname|      cusomter_email|sales_agent_id|branch_id|product_id|product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|logs|source|
+----------------+----------------+-----------+--------------+--------------+--------------------+--------------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+----+------+
|       2022-7-19|trx-878108770002|      85513|     Alexander|       Johnson|alexander.johnson...|          10.0|      6.0|        27|        Iron|      Appliances|   null|

In [5]:
# Matches the first run of disallowed characters and everything after it, so
# replacing the match with '' cuts the value off at that point.
junk_suffix_pattern = r'[;*#()&}$\[\]\{&"\'\/>:\%\^|<\\]+.*$'

# Step 1: strip the junk suffix from the (misspelled) source column
# "cusomter_email" and trim surrounding whitespace.
email_trimmed = trim(regexp_replace(col("cusomter_email"), junk_suffix_pattern, ''))

# Step 2: repair a trailing ".comm" into ".com" on the intermediate column.
email_fixed = regexp_replace(col("cleaned_email"), r'\.comm$', '.com')

df_cleaned = (
    input
    .withColumn("cleaned_email", email_trimmed)
    .withColumn("customer_email", email_fixed)
)

# Show the cleaned addresses in full (no column truncation).
df_cleaned.select("customer_email").show(truncate=False)

+---------------------------+
|customer_email             |
+---------------------------+
|alexander.johnson@yahoo.com|
|ava.smith@gmail.com        |
|mia.brown@hotmail.com      |
|olivia.taylor@outlook.com  |
|john.moore@gmail.com       |
|john.brown@hotmail.com     |
|emma.smith@gmail.com       |
|emma.miller@outlook.com    |
|mia.williams@yahoo.com     |
|william.brown@hotmail.com  |
|olivia.smith@yahoo.com     |
|sophia.davis@yahoo.com     |
|michael.johnson@hotmail.com|
|emma.davis@gmail.com       |
|william.wilson@hotmail.com |
|john.miller@hotmail.com    |
|william.moore@outlook.com  |
|emma.brown@gmail.com       |
|mia.davis@outlook.com      |
|william.moore@gmail.com    |
+---------------------------+
only showing top 20 rows



In [6]:
# (offer flag column, discount percentage) pairs in precedence order: the
# conditions are chained, so the first offer whose flag equals the string
# "True" decides the discount.
offer_discounts = [
    ("offer_1", 5),
    ("offer_2", 10),
    ("offer_3", 15),
    ("offer_4", 20),
    ("offer_5", 25),
]

# Build the chained when(...).when(...) expression from the table above.
(first_offer, first_pct), *remaining_offers = offer_discounts
discount_expr = when(col(first_offer) == "True", first_pct)
for offer_col, pct in remaining_offers:
    discount_expr = discount_expr.when(col(offer_col) == "True", pct)

# Rows with no offer flag set to "True" get a discount of 0.
df_cleaned = df_cleaned.withColumn("discount_perc", discount_expr.otherwise(0))

# Drop the intermediate and misspelled email columns, then fix the misspelled
# last-name column.
df_modified = (
    df_cleaned
    .drop("cleaned_email", "cusomter_email")
    .withColumnRenamed("cusomter_lname", "customer_lname")
)

In [7]:
# Write the cleaned data to this run's silver directory as parquet.
# repartition(1) puts all rows into one partition before writing, and
# 'overwrite' mode replaces any existing output at that path.
silver_dir = f"hdfs:///data/retail_silver/{date_str}/{hour_str}"
silver_writer = df_modified.repartition(1).write.mode('overwrite')
silver_writer.parquet(silver_dir)

In [8]:
# List the silver output directory through the Hadoop CLI, capturing stdout
# and stderr so the next cell can inspect them.
# NOTE(review): `-C` is expected to print paths only, one per line — confirm
# against the Hadoop version in use.
ls_command = ["hadoop", "fs", "-ls", "-C", f"hdfs:///data/retail_silver/{date_str}/{hour_str}/"]
list_files_process = subprocess.run(
    ls_command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

In [9]:
# Rename the part file written into the silver directory to the fixed,
# timestamped filename sales_transactions_SS_cleaned_<date>_<hour>.parquet.
# Failures are reported with print() rather than raised, matching the original
# best-effort handling of the listing error.
if list_files_process.returncode != 0:
    print(f"Error listing files in hdfs:///data/retail_silver/{date_str}/{hour_str}: {list_files_process.stderr.decode()}")
else:
    stdout_str = list_files_process.stdout.decode()
    file_list = stdout_str.splitlines()

    # Pick the data file by name rather than by position. Indexing file_list[1]
    # only works when the listing is exactly a marker file followed by one part
    # file; it raises IndexError or renames the wrong entry otherwise.
    part_files = [
        path for path in file_list
        if path.rsplit("/", 1)[-1].startswith("part-") and path.endswith(".parquet")
    ]

    if not part_files:
        print(f"No part-*.parquet file found in hdfs:///data/retail_silver/{date_str}/{hour_str}")
    else:
        file_to_rename = part_files[0]
        print(file_to_rename)
        new_filename = f"hdfs:///data/retail_silver/{date_str}/{hour_str}/sales_transactions_SS_cleaned_{date_str}_{hour_str}.parquet"

        # Capture the result so a failed move is visible in the notebook
        # instead of being silently ignored.
        mv_process = subprocess.run(
            ["hadoop", "fs", "-mv", file_to_rename, new_filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if mv_process.returncode != 0:
            print(f"Error renaming {file_to_rename} to {new_filename}: {mv_process.stderr.decode()}")


hdfs:///data/retail_silver/20240712/08/part-00000-3ad39dc3-d81b-4df8-a1ab-3cf256199969-c000.snappy.parquet


In [10]:
# Bare expression as the last line of the cell, so the notebook shows the
# final DataFrame (renamed columns plus customer_email and discount_perc) as
# the cell's output.
df_modified

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,sales_agent_id,branch_id,product_id,product_name,product_category,offer_1,offer_2,offer_3,offer_4,offer_5,units,unit_price,is_online,payment_method,shipping_address,logs,source,customer_email,discount_perc
2022-7-19,trx-878108770002,85513,Alexander,Johnson,10.0,6.0,27,Iron,Appliances,,,,,,7,29.99,no,Cash,,,,alexander.johnson...,0
2023-8-6,trx-349443438637,85510,Ava,Smith,2.0,6.0,28,Hair Dryer,Appliances,,True,,,,10,19.99,no,Cash,,,,ava.smith@gmail.com,10
2022-12-28,trx-045891300294,85553,Mia,Brown,6.0,4.0,13,Printer,Electronics,,,True,,,2,149.99,no,Credit Card,,,,mia.brown@hotmail...,15
2023-6-28,trx-756996252944,85520,Olivia,Taylor,3.0,2.0,12,Monitor,Electronics,,,,,,6,299.99,no,Credit Card,,,,olivia.taylor@out...,0
2023-9-5,trx-491216466700,85539,John,Moore,8.0,3.0,5,T-Shirt,Clothing,True,,,,,1,19.99,no,Credit Card,,,,john.moore@gmail.com,5
2023-3-16,trx-274239612034,85517,John,Brown,7.0,4.0,15,Hoodie,Clothing,,,,True,,3,29.99,no,Credit Card,,,,john.brown@hotmai...,20
2022-9-15,trx-054194579945,85496,Emma,Smith,11.0,1.0,19,Sandals,Footwear,,,,,,7,29.99,no,Credit Card,,,,emma.smith@gmail.com,0
2022-9-24,trx-217671445546,85469,Emma,Miller,6.0,3.0,13,Printer,Electronics,,,,,,3,149.99,no,Cash,,,,emma.miller@outlo...,0
2022-9-8,trx-765298457963,85529,Mia,Williams,10.0,3.0,23,Toaster,Appliances,,,,,,9,39.99,no,Credit Card,,,,mia.williams@yaho...,0
2022-12-9,trx-469050698996,85523,William,Brown,7.0,1.0,25,Washing Machine,Appliances,,,,,,1,499.99,no,Cash,,,,william.brown@hot...,0


In [11]:
# Stop the Spark session created at the top of the notebook.
spark.stop()