In [1]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim, when
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import datetime
import subprocess
from py4j.java_gateway import java_import
import os
import sys

In [2]:
# Create (or reuse) the Spark session for this notebook: local master with
# four worker threads, named after the silver-layer job.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Silver_layer_transformation")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# Capture the clock once so the date and hour components always agree.
now = datetime.now()
date_str, hour_str = (now.strftime(fmt) for fmt in ("%Y%m%d", "%H"))

# Bronze-layer input for the current date/hour partition.
path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_transactions_SS_raw_{date_str}_{hour_str}.parquet"

In [4]:
# Load the bronze parquet file for this run, aborting the notebook when the
# file is missing or cannot be read.
try:
    # `hdfs dfs -test -e` exits with status 0 only when the path exists.
    # The command is passed as an argument list (as the other HDFS calls in
    # this notebook do) so the path is never re-parsed by a shell.
    path_check = subprocess.run(["hdfs", "dfs", "-test", "-e", path])
    if path_check.returncode == 0:
        input_df = spark.read.parquet(path)
    else:
        # SystemExit is not an Exception subclass, so it bypasses the handler
        # below and terminates with this message and exit status 1.
        raise SystemExit(f"Path does not exist: {path}")
except Exception as e:
    # Any other failure (e.g. the hdfs client is not installed, or the
    # parquet read fails) is reported and turned into exit status 1.
    print(f"An error occurred: {e}")
    raise SystemExit(1)

In [5]:
# Step 1: cut the raw address (input column `cusomter_email`, name used as-is)
# at the first run of stray punctuation, dropping everything after it, then
# trim surrounding whitespace.
junk_suffix_pattern = r'[;*#()&}$\[\]\{&"\'\/>:\%\^|<\\]+.*$'
truncated_email = regexp_replace(col("cusomter_email"), junk_suffix_pattern, '')

# Step 2: repair a trailing ".comm" into ".com" on the cleaned value.
repaired_email = regexp_replace(col("cleaned_email"), r'\.comm$', '.com')

df_cleaned = (
    input_df
    .withColumn("cleaned_email", trim(truncated_email))
    .withColumn("customer_email", repaired_email)
)

# Preview the cleaned addresses without truncating long values.
df_cleaned.select("customer_email").show(truncate=False)

+----------------------------+
|customer_email              |
+----------------------------+
|alexander.brown@gmail.com   |
|william.brown@gmail.com     |
|john.williams@gmail.com     |
|alexander.miller@yahoo.com  |
|john.brown@hotmail.com      |
|sophia.wilson@hotmail.com   |
|alexander.moore@yahoo.com   |
|alexander.wilson@hotmail.com|
|michael.miller@yahoo.com    |
|michael.brown@yahoo.com     |
|john.taylor@yahoo.com       |
|alexander.davis@yahoo.com   |
|michael.brown@hotmail.com   |
|james.smith@yahoo.com       |
|emma.johnson@hotmail.com    |
|john.johnson@yahoo.com      |
|james.davis@yahoo.com       |
|ava.miller@outlook.com      |
|olivia.brown@outlook.com    |
|sophia.brown@gmail.com      |
+----------------------------+
only showing top 20 rows



In [6]:
# Offer flag column -> discount percentage. Order matters: the first flag
# equal to "True" decides the discount, exactly like a chained when().
offer_discounts = [
    ("offer_1", 5),
    ("offer_2", 10),
    ("offer_3", 15),
    ("offer_4", 20),
    ("offer_5", 25),
]

(first_offer, first_pct), *remaining_offers = offer_discounts
discount_expr = when(col(first_offer) == "True", first_pct)
for offer_name, pct in remaining_offers:
    discount_expr = discount_expr.when(col(offer_name) == "True", pct)

# Rows with no offer flag set get a 0% discount.
df_cleaned = df_cleaned.withColumn("discount_perc", discount_expr.otherwise(0))

# Drop the intermediate and raw email columns, and rename the last-name
# column from `cusomter_lname` to `customer_lname`.
df_modified = (
    df_cleaned
    .drop("cleaned_email", "cusomter_email")
    .withColumnRenamed("cusomter_lname", "customer_lname")
)

In [7]:
# Write the cleaned data to the silver partition for this date/hour as a
# single partition, replacing any previous output in that directory.
silver_output_dir = f"hdfs:///data/retail_silver/{date_str}/{hour_str}"
single_partition_df = df_modified.repartition(1)
single_partition_df.write.mode('overwrite').parquet(silver_output_dir)

In [8]:
# List the silver output directory (paths only, via `-C`), capturing both
# output streams so the next cell can inspect them.
list_silver_cmd = [
    "hadoop", "fs", "-ls", "-C",
    f"hdfs:///data/retail_silver/{date_str}/{hour_str}/",
]
list_files_process = subprocess.run(
    list_silver_cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

In [9]:
# Rename the Spark-generated part file in the silver directory to the
# pipeline's naming convention, using the listing captured by the previous cell.
if list_files_process.returncode != 0:
    print(f"Error listing files in hdfs:///data/retail_silver/{date_str}/{hour_str}: {list_files_process.stderr.decode()}")
else:
    stdout_str = list_files_process.stdout.decode()
    file_list = stdout_str.splitlines()

    # Select the part file by name instead of by position: relying on it
    # being the second listing entry breaks when the directory has no
    # `_SUCCESS` marker or contains any other entry.
    part_files = [
        entry for entry in file_list
        if entry.rsplit("/", 1)[-1].startswith("part-") and entry.endswith(".parquet")
    ]
    if not part_files:
        raise FileNotFoundError(
            f"No part-*.parquet file found in hdfs:///data/retail_silver/{date_str}/{hour_str}"
        )

    file_to_rename = part_files[0]
    print(file_to_rename)
    new_filename = f"hdfs:///data/retail_silver/{date_str}/{hour_str}/sales_transactions_SS_cleaned_{date_str}_{hour_str}.parquet"

    # Capture stderr and check the exit status so a failed rename is reported
    # instead of passing silently.
    move_process = subprocess.run(
        ["hadoop", "fs", "-mv", file_to_rename, new_filename],
        stderr=subprocess.PIPE,
    )
    if move_process.returncode != 0:
        print(f"Error renaming {file_to_rename} to {new_filename}: {move_process.stderr.decode()}")


hdfs:///data/retail_silver/20240713/07/part-00000-c08627f5-09ba-4333-b7fa-2f3e2722953f-c000.snappy.parquet


In [10]:
# Bare last expression: the notebook renders the transformed DataFrame as this cell's output.
df_modified

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,sales_agent_id,branch_id,product_id,product_name,product_category,offer_1,offer_2,offer_3,offer_4,offer_5,units,unit_price,is_online,payment_method,shipping_address,customer_email,discount_perc
2023-5-20,trx-152546429674,85469,Alexander,Brown,1.0,2.0,22,Coffee Maker,Appliances,,,,,,10,79.99,no,Cash,,alexander.brown@g...,0
2022-10-25,trx-291375327542,85512,William,Brown,3.0,1.0,24,Blender,Appliances,,,,True,,5,49.99,no,Cash,,william.brown@gma...,20
2022-2-5,trx-312507679871,85484,John,Williams,10.0,3.0,4,Headphones,Electronics,,,,,,1,99.99,no,Credit Card,,john.williams@gma...,0
2023-10-20,trx-193384855491,85528,Alexander,Miller,7.0,2.0,25,Washing Machine,Appliances,,,,,,8,499.99,no,Cash,,alexander.miller@...,0
2022-11-17,trx-831626097654,85500,John,Brown,5.0,1.0,14,Camera,Electronics,,,True,,,10,399.99,no,Cash,,john.brown@hotmai...,15
2022-9-27,trx-158496122054,85545,Sophia,Wilson,4.0,5.0,14,Camera,Electronics,,,,,True,6,399.99,no,Credit Card,,sophia.wilson@hot...,25
2022-4-21,trx-722817999024,85561,Alexander,Moore,4.0,1.0,30,Electric Kettle,Appliances,,,,True,,6,24.99,no,Credit Card,,alexander.moore@y...,20
2023-4-28,trx-813287633702,85520,Alexander,Wilson,1.0,1.0,26,Vacuum Cleaner,Appliances,,,,,,4,199.99,no,Cash,,alexander.wilson@...,0
2023-3-8,trx-219568257432,85488,Michael,Miller,6.0,2.0,18,Boots,Footwear,,,,,,10,149.99,no,Credit Card,,michael.miller@ya...,0
2023-6-17,trx-352160720823,85466,Michael,Brown,5.0,2.0,16,Skirt,Clothing,,,,,,8,39.99,no,Cash,,michael.brown@yah...,0


In [11]:
# Shut down the Spark session created at the top of the notebook now that the job is done.
spark.stop()