In [51]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim,when ,monotonically_increasing_id,lit,year, month, dayofmonth, weekofyear, dayofweek, date_format,floor,dense_rank,\
substring,concat,split
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import date, datetime, timedelta
import subprocess
from py4j.java_gateway import java_import
import os
from pyspark.sql.types import DateType

In [3]:
spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("Gold_layer_transformations")\
    .config("spark.eventLog.logBlockUpdates.enabled", True)\
    .getOrCreate()

sc = spark.sparkContext

In [4]:
now = datetime.now()
date_str = now.strftime("%Y%m%d")
hour_str = now.strftime("%H")

In [5]:
input_trans = spark.read.csv(f"hdfs:///data/retail_silver/20240704/02/sales_transactions_SS_cleaned_20240704_02.csv", header='true')
#input = spark.read.csv(f"hdfs:///data/retail_silver/{date_str}/12/sales_transactions_SS_cleaned_{date_str}_12.csv", header='true')
input_trans.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+--------------------+-------------+
|transaction_date|  transaction_id|customer_id|customer_fname|cusomter_lname|sales_agent_id|branch_id|product_id|   product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|      customer_email|discount_perc|
+----------------+----------------+-----------+--------------+--------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+--------------------+-------------+
|       2023-5-20|trx-152546429674|      85469|     Alexander|         Brown|             1|        2|        22|   Coffee Maker|      Appliances|   null|   

In [37]:
print(input_trans.columns)


['transaction_date', 'transaction_id', 'customer_id', 'customer_fname', 'cusomter_lname', 'sales_agent_id', 'branch_id', 'product_id', 'product_name', 'product_category', 'offer_1', 'offer_2', 'offer_3', 'offer_4', 'offer_5', 'units', 'unit_price', 'is_online', 'payment_method', 'shipping_address', 'customer_email', 'discount_perc']


In [32]:
#function_to_rename_in_hdfs
def rename_in_hdfs(golden_layer_path,file_extension,name):
    # Run the Hadoop fs -ls command to list files
    list_files_process = subprocess.run(["hadoop", "fs", "-ls", golden_layer_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check for errors
    if list_files_process.returncode != 0:
        print(f"Error listing files in {golden_layer_path}: {list_files_process.stderr.decode()}")
        exit(1)

    # Decode stdout to string format and split lines
    stdout_str = list_files_process.stdout.decode()
    file_list = stdout_str.splitlines()

    # Find the file to rename based on criteria
    file_to_rename = None
    for line in file_list:
        if line.endswith(file_extension):
            file_to_rename = line.split()[-1].strip()
            break

    # Check if a file matching the criteria was found
    if file_to_rename:
        #new_directory = f"{golden_layer_path}/{name}"
        new_filename = f"{golden_layer_path}/{name}_{date_str}_{hour_str}{file_extension}"

        # Create directory if it doesn't exist
        #subprocess.run(["hadoop", "fs", "-mkdir", "-p", new_directory])

        # Move (rename) the file
        subprocess.run(["hadoop", "fs", "-mv", file_to_rename, new_filename])

        print(f"File moved and renamed to: {new_filename}")
    else:
        print("File matching the criteria not found.")

In [38]:
#write customer dim in HDFS

cust_dim = input_trans.dropDuplicates(['customer_id'])
# Add monotonically increasing id column
cust_dim = cust_dim.withColumn('customer_sur_key', monotonically_increasing_id())

#to write cust_dim in one file 
cust_dim=cust_dim.repartition(1)
golden_layer_path="hdfs:///data/golden_layer/cust_dim"
file_extension = ".csv"
name='cust_dim'

#make customer dim 
cust_dim=cust_dim.select('customer_sur_key','customer_id', 'customer_fname', 'cusomter_lname', 'customer_email') 
cust_dim.write.mode('overwrite') \
        .option("header", "true") \
        .format('csv') \
        .save(golden_layer_path)

# Define your directory path and file criteria
#directory_path = customer_dim_path
#golden_layer_path="hdfs:///data/golden_layer"
file_extension = ".csv"
name='cust_dim'
rename_in_hdfs(golden_layer_path,file_extension,name)
cust_dim.take(5)

File moved and renamed to: hdfs:///data/golden_layer/cust_dim/cust_dim_20240706_095920.csv


[Row(customer_sur_key=42949672960, customer_id='85485', customer_fname='Ava', cusomter_lname='Smith', customer_email='ava.smith@hotmail.com'),
 Row(customer_sur_key=51539607552, customer_id='85529', customer_fname='Mia', cusomter_lname='Davis', customer_email='mia.davis@gmail.com'),
 Row(customer_sur_key=68719476736, customer_id='85509', customer_fname='Michael', cusomter_lname='Jones', customer_email='michael.jones@outlook.com'),
 Row(customer_sur_key=85899345920, customer_id='85547', customer_fname='James', cusomter_lname='Wilson', customer_email='james.wilson@outlook.com'),
 Row(customer_sur_key=120259084288, customer_id='85476', customer_fname='James', cusomter_lname='Moore', customer_email='james.moore@yahoo.com')]

In [39]:
#product_dim
product_dim=input_trans.dropDuplicates(['product_id'])
product_dim = product_dim.withColumn('product_sur_key', monotonically_increasing_id())
#to write product_dim in one file 
product_dim=product_dim.repartition(1)


golden_layer_path="hdfs:///data/golden_layer/prodcut_dim"
file_extension = ".csv"
name='product_dim'

#make customer dim 
product_dim=product_dim.select('product_sur_key','product_id', 'product_name', 'product_category') 
product_dim.write.mode('overwrite') \
            .option("header", "true") \
            .format('csv') \
            .save(golden_layer_path)

rename_in_hdfs(golden_layer_path,file_extension,name)



File moved and renamed to: hdfs:///data/golden_layer/prodcut_dim/product_dim_20240706_095952.csv


In [41]:
#branches Dim
# Define the file path for the initial CSV data and the golden layer path on HDFS
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/branches_SS_raw_{date_str}__{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/branches_dim"
manual_path=f"hdfs:///data/retail_bronze/20240704/02/branches_SS_raw_20240703__20240704_02.csv"
file_extension = ".csv"
name="branches_dim"
# Create the "golden layer" directory on HDFS if it doesn't exist
subprocess.run(["hadoop", "fs", "-mkdir", "-p", golden_layer_path])

# Load the CSV data into a PySpark DataFrame
branches_dim = spark.read.option("header", "true").csv(manual_path)

# Convert establish_date to date type if needed
branches_dim = branches_dim.withColumn("establish_date", col("establish_date").cast("date"))

# Add a surrogate key column
branches_dim = branches_dim.withColumn("surrogate_key", monotonically_increasing_id())

# Show the DataFrame with the surrogate key
branches_dim.show()

# Save the DataFrame to the golden layer folder on HDFS in CSV format
branches_dim.write.option("header", "true").mode("overwrite").csv(golden_layer_path)
rename_in_hdfs(golden_layer_path,file_extension,name)

+---------+-----------+--------------+-----+-------------+
|branch_id|   location|establish_date|class|surrogate_key|
+---------+-----------+--------------+-----+-------------+
|        1|   New York|    2017-01-15|    A|            0|
|        2|Los Angeles|    2016-07-28|    B|            1|
|        3|    Chicago|    2015-03-10|    A|            2|
|        4|    Houston|    2016-11-05|    D|            3|
|        5|    Phoenix|    2017-09-20|    C|            4|
+---------+-----------+--------------+-----+-------------+

File moved and renamed to: hdfs:///data/golden_layer/branches_dim/branches_dim_20240706_100246.csv


In [42]:
#agent dim
# Define the file path for the initial CSV data and the golden layer path on HDFS
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_agents_SS_raw_{date_str}__{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/agent_DIM"
manual_path="/data/retail_bronze/20240704/02/sales_agents_SS_raw_20240703__20240704_02.csv"
file_extension = ".csv"
name="agent"
# Create the "golden layer" directory on HDFS if it doesn't exist
subprocess.run(["hadoop", "fs", "-mkdir", "-p", golden_layer_path])

# Load the CSV data into a PySpark DataFrame
agent_dim = spark.read.option("header", "true").csv(manual_path)

# Convert establish_date to date type if needed
agent_dim = agent_dim.withColumn("hire_date", col("hire_date").cast("date"))

# Add a surrogate key column
agent_dim = agent_dim.withColumn("surrogate_key", monotonically_increasing_id())
agent_dim.write.option("header", "true").mode("overwrite").csv(golden_layer_path)


# Show the DataFrame with the surrogate key
agent_dim.show()

rename_in_hdfs(golden_layer_path,file_extension,name)



+---------------+------------------+----------+-------------+
|sales_person_id|              name| hire_date|surrogate_key|
+---------------+------------------+----------+-------------+
|              1|          John Doe|2020-06-03|            0|
|              2|        Jane Smith|2018-05-13|            1|
|              3|   Michael Johnson|2021-10-03|            2|
|              4|       Emily Brown|2020-10-25|            3|
|              5|      David Wilson|2021-04-08|            4|
|              6|       Emma Taylor|2019-03-28|            5|
|              7|Christopher Miller|2020-01-11|            6|
|              8|      Olivia Davis|2021-10-24|            7|
|              9|   Daniel Martinez|2018-10-08|            8|
|             10|      Sophia Moore|2019-05-25|            9|
+---------------+------------------+----------+-------------+

File moved and renamed to: hdfs:///data/golden_layer/agent_DIM/agent_20240706_100624.csv


In [43]:
#create date dimension
# Generate date range

start_date = date(2022, 1, 1)
end_date = date(2024, 12, 31)

date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
date_df = spark.createDataFrame([(d,) for d in date_range], ["date"]).withColumn("date", col("date").cast(DateType()))

# Add date attributes
date_dim = date_df.withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("day", dayofmonth(col("date"))) \
    .withColumn("week", weekofyear(col("date"))) \
    .withColumn("weekday", dayofweek(col("date"))) \
    .withColumn("quarter", floor((month(col("date")) - 1) / 3) + 1) \
    .withColumn("day_name", date_format(col("date"), "EEEE")) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .withColumn("is_weekend", when(col("weekday").isin([1, 7]), lit(1)).otherwise(lit(0)))

# Add surrogate key column
date_dim = date_dim.withColumn("date_sur_key", concat(col('day'),col('month'),col('year')))


# Define the output directory for the date dimension
date_dim_path = "hdfs:///data/golden_layer/date_dim"

try:
    # Write the date dimension to a single CSV file
    date_dim.repartition(1) \
        .write.mode('overwrite') \
        .option("header", "true") \
        .format('csv') \
        .save(date_dim_path)
    print(f"Date dimension table saved to {date_dim_path}")
except Exception as e:
    print(f"An error occurred: {e}")
    
#to rename csv file in date dim
#date_dim_path="hdfs:///data/golden_layer"
file_extension = ".csv"
name="date_dim"
rename_in_hdfs(date_dim_path,file_extension,name)

Date dimension table saved to hdfs:///data/golden_layer/date_dim
File moved and renamed to: hdfs:///data/golden_layer/date_dim/date_dim_20240706_100817.csv


In [45]:
#try to create fact table 
#fact One (offline)
#print(input.columns)
offline_fact=input_trans.filter(col('is_online')=="no")
columns_to_drop=['shipping_address','customer_fname','cusomter_lname','sales_agent_id','offer_1','offer_2',
                'offer_3','offer_4','offer_5','product_name','product_category','customer_email']
offline_fact=offline_fact.drop(*columns_to_drop)
print(offline_fact.columns)
print(offline_fact.take(5))
#print(cust_dim.columns)
offline_fact=offline_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))
final_price=(col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
offline_fact=offline_fact.withColumn("total_price",final_price)
offline_fact=cust_dim.join(offline_fact,on='customer_id',how='left').join(product_dim,on='product_id',how='right').\
                    join(date_dim,date_dim.date == offline_fact.transaction_date,'left')
#offline_fact=product_dim.join(offline_fact,on='product_id',how='left')
offline_fact
offline_fact = offline_fact.select(
    'transaction_id',
    'units',
    'payment_method',
    'discount_perc',
    'total_price',
    'transaction_date',     
    'customer_sur_key',
    'product_sur_key',
    'date_sur_key'
)

fact_off_dim_path="hdfs:///data/golden_layer/facts"
file = ".csv"
offline_fact=offline_fact.repartition(1)
offline_fact.write.mode('overwrite') \
            .option("header", "true") \
            .format('csv') \
            .save(fact_off_dim_path)

name="offline_fact"
rename_in_hdfs(fact_off_dim_path,file,name)

['transaction_date', 'transaction_id', 'customer_id', 'branch_id', 'product_id', 'units', 'unit_price', 'is_online', 'payment_method', 'discount_perc']
[Row(transaction_date='2023-5-20', transaction_id='trx-152546429674', customer_id='85469', branch_id='2', product_id='22', units='10', unit_price='79.99', is_online='no', payment_method='Cash', discount_perc='0'), Row(transaction_date='2022-10-25', transaction_id='trx-291375327542', customer_id='85512', branch_id='1', product_id='24', units='5', unit_price='49.99', is_online='no', payment_method='Cash', discount_perc='20'), Row(transaction_date='2022-2-5', transaction_id='trx-312507679871', customer_id='85484', branch_id='3', product_id='4', units='1', unit_price='99.99', is_online='no', payment_method='Credit Card', discount_perc='0'), Row(transaction_date='2023-10-20', transaction_id='trx-193384855491', customer_id='85528', branch_id='2', product_id='25', units='8', unit_price='499.99', is_online='no', payment_method='Cash', discount_

In [60]:
#online_fact
online_fact=input_trans.filter(col('is_online')=="yes")
columns_to_drop=['customer_fname','cusomter_lname','sales_agent_id','offer_1','offer_2',
                'offer_3','offer_4','offer_5','product_name','product_category','customer_email']
online_fact=online_fact.drop(*columns_to_drop)
#print(online_fact.columns)
#cast transaction_date to date type 
online_fact=online_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))
#calculate the final price 
final_price=(col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
online_fact=online_fact.withColumn("total_price",final_price)
#processing address column
split_address_col=split(col("shipping_address"),'/')
online_fact=online_fact.withColumn('street',split_address_col.getItem(0))\
                        .withColumn('city',split_address_col.getItem(1))\
                        .withColumn('state',split_address_col.getItem(2))\
                        .withColumn('postal_code',split_address_col.getItem(3))
online_fact
#join dataframe togther 
online_fact=cust_dim.join(online_fact,on='customer_id',how='left').join(product_dim,on='product_id',how='right').\
                    join(date_dim,date_dim.date == online_fact.transaction_date,'left')
online_fact = online_fact.select(
    'transaction_id',
    'units',
    'payment_method',
    'discount_perc',
    'total_price',
    'transaction_date',     
    'customer_sur_key',
    'product_sur_key',
    'date_sur_key',
    'street',
    'city',
    'state',
    'postal_code'
)
print(online_fact.columns)
online_fact_path="hdfs:///data/golden_layer/online_fact"
file = ".csv"
online_fact=online_fact.repartition(1)
online_fact.write.mode('overwrite') \
            .option("header", "true") \
            .format('csv') \
            .save(online_fact_path)

name="online_fact"
rename_in_hdfs(online_fact_path,file,name)
online_fact.take(5)

['transaction_id', 'units', 'payment_method', 'discount_perc', 'total_price', 'transaction_date', 'customer_sur_key', 'product_sur_key', 'date_sur_key', 'street', 'city', 'state', 'postal_code']
File moved and renamed to: hdfs:///data/golden_layer/online_fact/online_fact_20240706_114342.csv


[Row(transaction_id='trx-416356251492', units='8', payment_method='Stripe', discount_perc='15', total_price=543.9319999999999, transaction_date=datetime.date(2022, 3, 28), customer_sur_key=481036337152, product_sur_key=249108103168, date_sur_key='2832022', street='5130 Morris Way', city='Fremont', state='CA', postal_code='94536'),
 Row(transaction_id='trx-994649210433', units='3', payment_method='PayPal', discount_perc='0', total_price=89.97, transaction_date=datetime.date(2023, 6, 22), customer_sur_key=652835028992, product_sur_key=60129542144, date_sur_key='2262023', street='814 East 10th Court', city='Panama City', state='FL', postal_code='32401'),
 Row(transaction_id='trx-504070566513', units='4', payment_method='PayPal', discount_perc='0', total_price=119.96, transaction_date=datetime.date(2022, 7, 27), customer_sur_key=1623497637888, product_sur_key=60129542144, date_sur_key='2772022', street='4738 Mallard Common', city='Fremont', state='CA', postal_code='94555'),
 Row(transactio