In [54]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim,when ,monotonically_increasing_id,lit,year, month, dayofmonth, weekofyear, dayofweek, date_format,floor,dense_rank,\
substring,concat,split
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import date, datetime, timedelta
import subprocess
from py4j.java_gateway import java_import
import os
from pyspark.sql.types import DateType

ImportError: cannot import name 'isEmpty'

In [2]:
spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("Gold_layer_transformations")\
    .config("spark.eventLog.logBlockUpdates.enabled", True)\
    .getOrCreate()

sc = spark.sparkContext

In [3]:
now = datetime.now()
date_str = now.strftime("%Y%m%d")
hour_str = now.strftime("%H")

In [4]:
input_trans = spark.read.csv(f"hdfs:///data/retail_silver/20240706/21/sales_transactions_SS_cleaned_20240706_21.csv", header='true')
#input = spark.read.csv(f"hdfs:///data/retail_silver/{date_str}/12/sales_transactions_SS_cleaned_{date_str}_12.csv", header='true')
input_trans.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+----+------+--------------------+-------------+
|transaction_date|  transaction_id|customer_id|customer_fname|cusomter_lname|sales_agent_id|branch_id|product_id|product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|logs|source|      customer_email|discount_perc|
+----------------+----------------+-----------+--------------+--------------+--------------+---------+----------+------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+----+------+--------------------+-------------+
|       2022-7-19|trx-878108770002|      85513|     Alexander|       Johnson|            10|        6|        27|        Iron|    

In [5]:
print(input_trans.columns)


['transaction_date', 'transaction_id', 'customer_id', 'customer_fname', 'cusomter_lname', 'sales_agent_id', 'branch_id', 'product_id', 'product_name', 'product_category', 'offer_1', 'offer_2', 'offer_3', 'offer_4', 'offer_5', 'units', 'unit_price', 'is_online', 'payment_method', 'shipping_address', 'logs', 'source', 'customer_email', 'discount_perc']


In [6]:
#function_to_rename_in_hdfs
def rename_in_hdfs(golden_layer_path,file_extension,name):
    # Run the Hadoop fs -ls command to list files
    list_files_process = subprocess.run(["hadoop", "fs", "-ls", golden_layer_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check for errors
    if list_files_process.returncode != 0:
        print(f"Error listing files in {golden_layer_path}: {list_files_process.stderr.decode()}")
        exit(1)

    # Decode stdout to string format and split lines
    stdout_str = list_files_process.stdout.decode()
    file_list = stdout_str.splitlines()

    # Find the file to rename based on criteria
    file_to_rename = None
    for line in file_list:
        if line.endswith(file_extension):
            file_to_rename = line.split()[-1].strip()
            break

    # Check if a file matching the criteria was found
    if file_to_rename:
        #new_directory = f"{golden_layer_path}/{name}"
        new_filename = f"{golden_layer_path}/{name}{file_extension}"

        # Create directory if it doesn't exist
        #subprocess.run(["hadoop", "fs", "-mkdir", "-p", new_directory])

        # Move (rename) the file
        subprocess.run(["hadoop", "fs", "-mv", file_to_rename, new_filename])

        print(f"File moved and renamed to: {new_filename}")
    else:
        print("File matching the criteria not found.")

In [31]:
def check_if_exists(path):
    jvm = spark._jvm
    jsc = spark._jsc
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jsc.hadoopConfiguration())
    return fs.exists(jvm.org.apache.hadoop.fs.Path(path))


In [16]:
#write customer dim in HDFS

cust_dim = input_trans.dropDuplicates(['customer_id'])
# Add monotonically increasing id column
cust_dim = cust_dim.withColumn('customer_sur_key', monotonically_increasing_id())

#to write cust_dim in one file 
cust_dim=cust_dim.repartition(1)
golden_layer_path="hdfs:///data/golden_layer/cust_dim"
file_extension = ".csv"
name='cust_dim'

#make customer dim 
cust_dim=cust_dim.select('customer_sur_key','customer_id', 'customer_fname', 'cusomter_lname', 'customer_email') 
cust_dim.write.mode('overwrite') \
        .option("header", "true") \
        .format('csv') \
        .save(golden_layer_path)

# Define your directory path and file criteria
#directory_path = customer_dim_path
#golden_layer_path="hdfs:///data/golden_layer"
file_extension = ".csv"
name='cust_dim'
rename_in_hdfs(golden_layer_path,file_extension,name)
cust_dim.take(5)

File moved and renamed to: hdfs:///data/golden_layer/cust_dim/cust_dim.csv


[Row(customer_sur_key=42949672960, customer_id='85485', customer_fname='Emma', cusomter_lname='Williams', customer_email='emma.williams@hotmail.com'),
 Row(customer_sur_key=51539607552, customer_id='85529', customer_fname='Mia', cusomter_lname='Williams', customer_email='mia.williams@yahoo.com'),
 Row(customer_sur_key=68719476736, customer_id='85509', customer_fname='Emma', cusomter_lname='Williams', customer_email='emma.williams@gmail.com'),
 Row(customer_sur_key=85899345920, customer_id='85547', customer_fname='Olivia', cusomter_lname='Smith', customer_email='olivia.smith@outlook.com'),
 Row(customer_sur_key=120259084288, customer_id='85476', customer_fname='Alexander', cusomter_lname='Brown', customer_email='alexander.brown@outlook.com')]

In [8]:
#product_dim
product_dim=input_trans.dropDuplicates(['product_id'])
product_dim = product_dim.withColumn('product_sur_key', monotonically_increasing_id())
#to write product_dim in one file 
product_dim=product_dim.repartition(1)


golden_layer_path="hdfs:///data/golden_layer/prodcut_dim"
file_extension = ".csv"
name='product_dim'

#make customer dim 
product_dim=product_dim.select('product_sur_key','product_id', 'product_name', 'product_category') 
product_dim.write.mode('overwrite') \
            .option("header", "true") \
            .format('csv') \
            .save(golden_layer_path)

rename_in_hdfs(golden_layer_path,file_extension,name)



File moved and renamed to: hdfs:///data/golden_layer/prodcut_dim/product_dim.csv


In [11]:
#branches Dim
# Define the file path for the initial CSV data and the golden layer path on HDFS
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/branches_SS_raw_{date_str}__{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/branches_dim"
manual_path=f"hdfs:///data/retail_bronze/20240706/12/branches_SS_raw_20240706_12.csv"
file_extension = ".csv"
name="branches_dim"
# Create the "golden layer" directory on HDFS if it doesn't exist
subprocess.run(["hadoop", "fs", "-mkdir", "-p", golden_layer_path])

# Load the CSV data into a PySpark DataFrame
branches_dim = spark.read.option("header", "true").csv(manual_path)

# Convert establish_date to date type if needed
branches_dim = branches_dim.withColumn("establish_date", col("establish_date").cast("date"))

# Add a surrogate key column
branches_dim = branches_dim.withColumn("surrogate_key", monotonically_increasing_id())

# Show the DataFrame with the surrogate key
branches_dim.show()

# Save the DataFrame to the golden layer folder on HDFS in CSV format
branches_dim.write.option("header", "true").mode("overwrite").csv(golden_layer_path)
rename_in_hdfs(golden_layer_path,file_extension,name)

+---------+-----------+--------------+-----+-------------+
|branch_id|   location|establish_date|class|surrogate_key|
+---------+-----------+--------------+-----+-------------+
|        1|   New York|    2017-01-15|    A|            0|
|        2|Los Angeles|    2016-07-28|    B|            1|
|        3|    Chicago|    2015-03-10|    A|            2|
|        4|    Houston|    2016-11-05|    D|            3|
|        5|    Phoenix|    2017-09-20|    C|            4|
|        6|   Oklahoma|    2016-09-20|    A|            5|
+---------+-----------+--------------+-----+-------------+

File moved and renamed to: hdfs:///data/golden_layer/branches_dim/branches_dim.csv


In [12]:
#agent dim
# Define the file path for the initial CSV data and the golden layer path on HDFS
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_agents_SS_raw_{date_str}__{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/agent_DIM"
manual_path="/data/retail_bronze/20240706/12/sales_agents_SS_raw_20240706_12.csv"
file_extension = ".csv"
name="agent"
# Create the "golden layer" directory on HDFS if it doesn't exist
subprocess.run(["hadoop", "fs", "-mkdir", "-p", golden_layer_path])

# Load the CSV data into a PySpark DataFrame
agent_dim = spark.read.option("header", "true").csv(manual_path)

# Convert establish_date to date type if needed
agent_dim = agent_dim.withColumn("hire_date", col("hire_date").cast("date"))

# Add a surrogate key column
agent_dim = agent_dim.withColumn("surrogate_key", monotonically_increasing_id())
agent_dim.write.option("header", "true").mode("overwrite").csv(golden_layer_path)


# Show the DataFrame with the surrogate key
agent_dim.show()

rename_in_hdfs(golden_layer_path,file_extension,name)



+---------------+------------------+----------+-------------+
|sales_person_id|              name| hire_date|surrogate_key|
+---------------+------------------+----------+-------------+
|              1|          John Doe|2020-06-10|            0|
|              2|        Jane Smith|2021-06-08|            1|
|              3|   Michael Johnson|2019-07-22|            2|
|              4|       Emily Brown|2018-11-12|            3|
|              5|      David Wilson|2020-06-23|            4|
|              6|       Emma Taylor|2018-08-09|            5|
|              7|Christopher Miller|2018-07-05|            6|
|              8|      Olivia Davis|2019-12-08|            7|
|              9|   Daniel Martinez|2019-07-19|            8|
|             10|      Sophia Moore|2019-11-10|            9|
|             11|         john wick|2018-07-10|           10|
+---------------+------------------+----------+-------------+

File moved and renamed to: hdfs:///data/golden_layer/agent_DIM/agent.

In [13]:
#create date dimension
# Generate date range

start_date = date(2022, 1, 1)
end_date = date(2024, 12, 31)

date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
date_df = spark.createDataFrame([(d,) for d in date_range], ["date"]).withColumn("date", col("date").cast(DateType()))

# Add date attributes
date_dim = date_df.withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("day", dayofmonth(col("date"))) \
    .withColumn("week", weekofyear(col("date"))) \
    .withColumn("weekday", dayofweek(col("date"))) \
    .withColumn("quarter", floor((month(col("date")) - 1) / 3) + 1) \
    .withColumn("day_name", date_format(col("date"), "EEEE")) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .withColumn("is_weekend", when(col("weekday").isin([1, 7]), lit(1)).otherwise(lit(0)))

# Add surrogate key column
date_dim = date_dim.withColumn("date_sur_key", concat(col('day'),col('month'),col('year')))


# Define the output directory for the date dimension
date_dim_path = "hdfs:///data/golden_layer/date_dim"

try:
    # Write the date dimension to a single CSV file
    date_dim.repartition(1) \
        .write.mode('overwrite') \
        .option("header", "true") \
        .format('csv') \
        .save(date_dim_path)
    print(f"Date dimension table saved to {date_dim_path}")
except Exception as e:
    print(f"An error occurred: {e}")
    
#to rename csv file in date dim
#date_dim_path="hdfs:///data/golden_layer"
file_extension = ".csv"
name="date_dim"
rename_in_hdfs(date_dim_path,file_extension,name)

Date dimension table saved to hdfs:///data/golden_layer/date_dim
File moved and renamed to: hdfs:///data/golden_layer/date_dim/date_dim.csv


In [None]:
#try to create fact table 
#fact One (offline)
#print(input.columns)
offline_fact=input_trans.filter(col('is_online')=="no")
columns_to_drop=['shipping_address','customer_fname','cusomter_lname','sales_agent_id','offer_1','offer_2',
                'offer_3','offer_4','offer_5','product_name','product_category','customer_email']
offline_fact=offline_fact.drop(*columns_to_drop)
#print(offline_fact.columns)
#print(offline_fact.take(5))
#print(cust_dim.columns)
offline_fact=offline_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))
final_price=(col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
offline_fact=offline_fact.withColumn("total_price",final_price)
offline_fact=cust_dim.join(offline_fact,on='customer_id',how='left').join(product_dim,on='product_id',how='right').\
                    join(date_dim,date_dim.date == offline_fact.transaction_date,'left')
#offline_fact=product_dim.join(offline_fact,on='product_id',how='left')
offline_fact
offline_fact = offline_fact.select(
    'transaction_id',
    'units',
    'payment_method',
    'discount_perc',
    'total_price',
    'transaction_date',     
    'customer_sur_key',
    'product_sur_key',
    'date_sur_key'
)

fact_off_dim_path="hdfs:///data/golden_layer/offline_fact"
name ="offline_fact"
extension = ".csv"
full_file_path = f"{fact_off_dim_path}/{name}{extension}"
if check_if_exists(full_file_path):
    old_df = spark.read.csv(full_file_path, header = 'true')
    unioned_offline_fact_df = old_df.union(offline_fact)
    new_offline_fact = unioned_offline_fact_df.dropDuplicates(['transaction_id'])
    if new_offline_fact.rdd.isEmpty() == False:
        new_offline_fact=new_offline_fact.repartition(1)
        new_offline_fact.write.mode('overwrite') \
                    .option("header", "true") \
                    .format('csv') \
                    .save(f"{fact_off_dim_path}/tmp")
        rename_in_hdfs(f"{fact_off_dim_path}/tmp",extension,name) 
        subprocess.run(["hadoop", "fs", "-rm", full_file_path])
        subprocess.run(["hadoop", "fs", "-mv", f"{fact_off_dim_path}/tmp/{name}{extension}" ,fact_off_dim_path])
        print("done")
        
    
else:
    offline_fact=offline_fact.repartition(1)
    offline_fact.write.mode('overwrite') \
                .option("header", "true") \
                .format('csv') \
                .save(fact_off_dim_path)

    rename_in_hdfs(fact_off_dim_path,extension,name)

In [None]:
#online_fact
online_fact=input_trans.filter(col('is_online')=="yes")
columns_to_drop=['customer_fname','cusomter_lname','sales_agent_id','offer_1','offer_2',
                'offer_3','offer_4','offer_5','product_name','product_category','customer_email']
online_fact=online_fact.drop(*columns_to_drop)
#print(online_fact.columns)
#cast transaction_date to date type 
online_fact=online_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))
#calculate the final price 
final_price=(col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
online_fact=online_fact.withColumn("total_price",final_price)
#processing address column
split_address_col=split(col("shipping_address"),'/')
online_fact=online_fact.withColumn('street',split_address_col.getItem(0))\
                        .withColumn('city',split_address_col.getItem(1))\
                        .withColumn('state',split_address_col.getItem(2))\
                        .withColumn('postal_code',split_address_col.getItem(3))
online_fact
#join dataframe togther 
online_fact=cust_dim.join(online_fact,on='customer_id',how='left').join(product_dim,on='product_id',how='right').\
                    join(date_dim,date_dim.date == online_fact.transaction_date,'left')
online_fact = online_fact.select(
    'transaction_id',
    'units',
    'payment_method',
    'discount_perc',
    'total_price',
    'transaction_date',     
    'customer_sur_key',
    'product_sur_key',
    'date_sur_key',
    'street',
    'city',
    'state',
    'postal_code'
)
print(online_fact.columns)
online_fact_path="hdfs:///data/golden_layer/online_fact"
name="online_fact"
extension = ".csv"
print(check_if_exists(f"{online_fact}/{name}/{extension}"))
online_fact=online_fact.repartition(1)
online_fact.write.mode('overwrite') \
            .option("header", "true") \
            .format('parquet') \
            .save(online_fact_path)

rename_in_hdfs(online_fact_path,extension,name)
online_fact.take(5)

['transaction_id', 'units', 'payment_method', 'discount_perc', 'total_price', 'transaction_date', 'customer_sur_key', 'product_sur_key', 'date_sur_key', 'street', 'city', 'state', 'postal_code']
False
File matching the criteria not found.


[Row(transaction_id='trx-273956652971', units='8', payment_method='Stripe', discount_perc='15', total_price=407.932, transaction_date=datetime.date(2022, 3, 28), customer_sur_key=1348619730944, product_sur_key=25769803776, date_sur_key='2832022', street='12 Netherclift Way', city='Savannah', state='GA', postal_code='31411'),
 Row(transaction_id='trx-916643027495', units='6', payment_method='PayPal', discount_perc='5', total_price=170.94299999999998, transaction_date=datetime.date(2022, 3, 28), customer_sur_key=1168231104512, product_sur_key=60129542144, date_sur_key='2832022', street='40 Strawberry Lane', city='Manchester', state='CT', postal_code='06040'),
 Row(transaction_id='trx-211274182982', units='7', payment_method='Stripe', discount_perc='5', total_price=199.43349999999998, transaction_date=datetime.date(2022, 3, 28), customer_sur_key=1133871366144, product_sur_key=60129542144, date_sur_key='2832022', street='5921 Ashwood Bluff Drive', city='Louisville', state='KY', postal_code