In [21]:
 import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim,when ,monotonically_increasing_id,lit,year, month, dayofmonth, weekofyear, dayofweek, date_format,floor,dense_rank,\
substring,concat,split, row_number,lpad,lit, current_date
from pyspark.sql.window import Window
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import date, datetime, timedelta
import subprocess
from py4j.java_gateway import java_import
import os
from pyspark.sql.types import DateType

In [2]:
# Create (or reuse) the local, Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("sales_transactions")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [None]:
 # Session-level Hive settings used by the table writes later in the notebook.
 # Non-strict dynamic partitioning: presumably required by the
 # `INSERT ... PARTITION (col)` statements that name no static partition value.
 spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
 # Ask Hive to enforce bucketing on insert.
 # NOTE(review): confirm this setting has the intended effect for inserts issued through Spark.
 spark.sql("set hive.enforce.bucketing=true")

In [3]:
# Select the batch to process: the one stamped three hours before the current time
# (presumably a fixed lag / timezone offset of the upstream layers - TODO confirm).
# The offset is applied to the timestamp itself rather than to the hour number, so the
# hour always stays within 0-23 and the date rolls back to the previous day when the
# notebook runs shortly after midnight.
now = datetime.now()
batch_time = now - timedelta(hours=3)
date_str = batch_time.strftime("%Y%m%d")
# Kept as an int (not zero-padded) because paths interpolate it as-is and a later cell
# compares it with `== 23`.
hour_str = batch_time.hour
print(date_str, hour_str)

20240711 18


In [4]:
# Cleaned sales transactions for the selected date/hour, read from the silver layer.
silver_batch_path = (
    f"hdfs:///data/retail_silver/{date_str}/{hour_str}/"
    f"sales_transactions_SS_cleaned_{date_str}_{hour_str}.parquet"
)
input_df = spark.read.parquet(silver_batch_path)

# Preview the first rows of the batch.
input_df.show(5)

+----------------+----------------+-----------+--------------+--------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+--------------------+-------------+
|transaction_date|  transaction_id|customer_id|customer_fname|customer_lname|sales_agent_id|branch_id|product_id|   product_name|product_category|offer_1|offer_2|offer_3|offer_4|offer_5|units|unit_price|is_online|payment_method|shipping_address|      customer_email|discount_perc|
+----------------+----------------+-----------+--------------+--------------+--------------+---------+----------+---------------+----------------+-------+-------+-------+-------+-------+-----+----------+---------+--------------+----------------+--------------------+-------------+
|      2023-10-18|trx-237976222990|      85517|           Mia|         Jones|           6.0|      1.0|        10|        Sandals|        Footwear|   null|   

In [5]:
# Print column names, types and nullability of the batch read above.
input_df.printSchema()

root
 |-- transaction_date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- sales_agent_id: double (nullable = true)
 |-- branch_id: double (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- offer_1: boolean (nullable = true)
 |-- offer_2: boolean (nullable = true)
 |-- offer_3: boolean (nullable = true)
 |-- offer_4: boolean (nullable = true)
 |-- offer_5: boolean (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- is_online: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_address: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- discount_perc: integer (nullable = true)



In [6]:
#function_to_rename_in_hdfs
def rename_in_hdfs(golden_layer_path,file_extension,name):
    """Give the first ``file_extension`` file in an HDFS directory a fixed name.

    Lists ``golden_layer_path`` with ``hadoop fs -ls``, takes the first listed entry whose
    line ends with ``file_extension`` and moves it to
    ``{golden_layer_path}/{name}{file_extension}``.

    Args:
        golden_layer_path: HDFS directory to look in.
        file_extension: Suffix (e.g. ``".parquet"``) the file to rename must end with.
        name: New base name of the file, without extension.

    Raises:
        RuntimeError: If the directory cannot be listed or the move fails. Raising
            (instead of printing and carrying on) stops the calling cell before it goes
            on to delete or replace existing files.
    """
    listing = subprocess.run(
        ["hadoop", "fs", "-ls", golden_layer_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if listing.returncode != 0:
        raise RuntimeError(
            f"Error listing files in {golden_layer_path}: {listing.stderr.decode()}"
        )

    # Each listing line ends with the entry's full path; keep the first matching one.
    file_to_rename = next(
        (
            line.split()[-1].strip()
            for line in listing.stdout.decode().splitlines()
            if line.endswith(file_extension)
        ),
        None,
    )
    if not file_to_rename:
        print("File matching the criteria not found.")
        return

    target_basename = f"{name}{file_extension}"
    new_filename = f"{golden_layer_path}/{target_basename}"

    # The listing only contains direct children of the directory, so an equal base name
    # means the file already has its final name; moving it onto itself would fail.
    if file_to_rename.rsplit("/", 1)[-1] == target_basename:
        print(f"File already named: {new_filename}")
        return

    move = subprocess.run(
        ["hadoop", "fs", "-mv", file_to_rename, new_filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if move.returncode != 0:
        raise RuntimeError(
            f"Error moving {file_to_rename} to {new_filename}: {move.stderr.decode()}"
        )

    print(f"File moved and renamed to: {new_filename}")

In [7]:
# Function to check if a file exists in HDFS
def check_if_exists(path):
    """Return whether ``path`` exists on the filesystem given by the session's Hadoop config.

    Goes through the Spark session's JVM gateway to call Hadoop's ``FileSystem.exists``.
    """
    jvm_gateway = spark._jvm
    hadoop_conf = spark._jsc.hadoopConfiguration()
    file_system = jvm_gateway.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    hadoop_path = jvm_gateway.org.apache.hadoop.fs.Path(path)
    return file_system.exists(hadoop_path)

A function that writes a DataFrame to a Hive external table.

In [8]:
def write_df_to_table(database_name, table_name, df, partition_columns=None,
                      bucket_column=None, num_buckets=None):
    """Append ``df`` to the Hive table ``database_name.table_name`` via a staging table.

    The DataFrame is first saved as the table ``temp_table`` (overwriting any previous
    one) and then copied with ``INSERT INTO ... SELECT *``. The staging table is dropped
    afterwards whether or not the insert succeeded.

    Args:
        database_name: Database of the target table.
        table_name: Name of the target table.
        df: DataFrame to append.
        partition_columns: Optional column name, or list of names, used both to partition
            the staging table and in the ``PARTITION (...)`` clause of the insert.
        bucket_column: Accepted so that call sites passing it do not fail; not applied
            here. NOTE(review): bucketing is presumably defined by the target table's
            DDL - confirm.
        num_buckets: Accepted for the same reason as ``bucket_column``; not applied here.

    Errors raised by the insert are printed, not re-raised; an error while saving the
    staging table propagates to the caller.
    """
    temp_table = "temp_table"

    # Normalise the partition argument to a list so both steps below share one code path.
    if not partition_columns:
        partition_cols = []
    elif isinstance(partition_columns, (list, tuple)):
        partition_cols = list(partition_columns)
    else:
        partition_cols = [partition_columns]

    writer = df.write.mode("overwrite")
    if partition_cols:
        writer = writer.partitionBy(*partition_cols)

    # Stage the data; `SELECT *` below reads the columns in the staging table's order.
    writer.saveAsTable(temp_table)

    partition_clause = f"PARTITION ({', '.join(partition_cols)})" if partition_cols else ""
    insert_query = f"""
         INSERT INTO TABLE {database_name}.{table_name}
         {partition_clause}
         SELECT * FROM {temp_table}
    """

    try:
        spark.sql(insert_query)
        print(f"Data written successfully to {database_name}.{table_name}")
    except Exception as e:
        print(f"Error in writing to {table_name}: {e}")
    finally:
        # Drop through the catalog so the metastore entry is removed together with the
        # data, instead of deleting only the warehouse directory.
        spark.sql(f"DROP TABLE IF EXISTS {temp_table}")


In [9]:
#write customer dim in HDFS
# Maintains the customer dimension as one parquet file on HDFS plus the
# retail_DWH.Customer_Dim Hive table. Customers already in the dimension keep their
# surrogate key; customers seen for the first time get the next free keys.
customer_columns = ['customer_id', 'customer_fname', 'customer_lname', 'customer_email']
customer_dim_columns = ['customer_sur_key'] + customer_columns

cust_data = input_df.select(*customer_columns).dropDuplicates(['customer_id'])

golden_layer_path="hdfs:///data/retail_gold/customer_dim"
file_extension = ".parquet"
name='customer_dim'
path_to_check = f"hdfs:///data/retail_gold/customer_dim/{name}{file_extension}"

# Check for the dimension file itself (not only its directory) so that a leftover
# directory without the file falls through to the initial load instead of failing on read.
if check_if_exists(path_to_check):
    cust_dim = spark.read.parquet(path_to_check).select(*customer_dim_columns)

    # Anti-join on the business key: comparing whole rows would hand a second surrogate
    # key to a customer whose name/email changed, and the duplicate customer_id would
    # then multiply rows in the fact joins.
    new_customers_data = cust_data.join(cust_dim.select('customer_id'), on='customer_id', how='left_anti')
    new_customers_data.show(5)

    # Highest surrogate key handed out so far (0 when the existing file holds no rows).
    max_sur_key = cust_dim.agg({"customer_sur_key": "max"}).collect()[0][0] or 0
    print(max_sur_key)

    if new_customers_data.take(1):
        window_spec = Window.orderBy("customer_id")
        # New rows continue the key sequence from max_sur_key + 1.
        customers_dim_sk = (
            new_customers_data
            .withColumn('customer_sur_key', (row_number().over(window_spec) + max_sur_key).cast("int"))
            .select(*customer_dim_columns)
        )

        # union() matches columns by position, hence the explicit column order on both sides.
        cust_dim.union(customers_dim_sk) \
                .repartition(1) \
                .write.mode('overwrite') \
                .format('parquet') \
                .save(f"{golden_layer_path}/tmp")
        rename_in_hdfs(f"{golden_layer_path}/tmp", file_extension, name)

        # Load Hive BEFORE swapping the parquet file: customers_dim_sk is lazy and is
        # recomputed from the current dimension file, so after the swap it would be
        # evaluated against the new file and no longer contain the new customers.
        # (Bucketing arguments are not passed; write_df_to_table takes partition columns only.)
        write_df_to_table('retail_DWH', 'Customer_Dim', customers_dim_sk)

        # Replace the old file with the merged one; stop on the first failing step.
        subprocess.run(["hadoop", "fs", "-rm", path_to_check], check=True)
        subprocess.run(["hadoop", "fs", "-mv", f"{golden_layer_path}/tmp/{name}{file_extension}", path_to_check], check=True)

        # Re-read so later cells join against the file that now exists on disk.
        updated_customers_dim = spark.read.parquet(path_to_check)
        print("done")
    else:
        updated_customers_dim = cust_dim
        print("No new Data")
else:
    # Initial load: number all customers sequentially by customer_id.
    window_spec = Window.orderBy("customer_id")
    updated_customers_dim = (
        cust_data
        .withColumn('customer_sur_key', row_number().over(window_spec))
        .select(*customer_dim_columns)
        .repartition(1)  # one output file, so rename_in_hdfs can give it a fixed name
    )
    updated_customers_dim.write.mode('overwrite') \
            .format('parquet') \
            .save(golden_layer_path)
    updated_customers_dim.show(5)
    write_df_to_table('retail_DWH', 'Customer_Dim', updated_customers_dim)
    rename_in_hdfs(golden_layer_path,file_extension,name)

+-----------+--------------+--------------+--------------+
|customer_id|customer_fname|customer_lname|customer_email|
+-----------+--------------+--------------+--------------+
+-----------+--------------+--------------+--------------+

202
No new Data


In [10]:
# Maintains the product dimension as one parquet file on HDFS plus the
# retail_DWH.Product_Dim Hive table. Products already in the dimension keep their
# surrogate key; products seen for the first time get the next free keys.
product_columns = ['product_id', 'product_name', 'product_category']
product_dim_columns = ['product_sur_key'] + product_columns

# One row per product_id from the current batch.
product_dim = input_df.select(*product_columns).dropDuplicates(['product_id'])

# Define the golden layer path and file details
golden_layer_path = "hdfs:///data/retail_gold/product_dim"
file_extension = ".parquet"
name = "product_dim"
path_to_check = f"hdfs:///data/retail_gold/product_dim/{name}{file_extension}"

if check_if_exists(path_to_check):
    existing_product_dim = (
        spark.read.parquet(path_to_check)
        .withColumn("product_sur_key", col("product_sur_key").cast("int"))
        .select(*product_dim_columns)
    )

    # Anti-join on the business key: comparing whole rows would give a renamed or
    # re-categorised product a second surrogate key and duplicate its product_id.
    new_products_data = product_dim.join(existing_product_dim.select('product_id'), on='product_id', how='left_anti')
    new_products_data.show()

    # Highest surrogate key handed out so far (0 when the existing file holds no rows).
    max_sur_key = existing_product_dim.agg({"product_sur_key": "max"}).collect()[0][0] or 0
    print(max_sur_key)

    if new_products_data.take(1):
        window_spec = Window.orderBy("product_id")
        # New rows continue the key sequence from max_sur_key + 1.
        product_dim_sk = (
            new_products_data
            .withColumn('product_sur_key', (row_number().over(window_spec) + max_sur_key).cast("int"))
            .select(*product_dim_columns)
        )

        # union() matches columns by position, hence the explicit column order on both sides.
        existing_product_dim.union(product_dim_sk) \
                    .repartition(1) \
                    .write.mode('overwrite') \
                    .format('parquet') \
                    .save(f"{golden_layer_path}/tmp")
        rename_in_hdfs(f"{golden_layer_path}/tmp", file_extension, name)

        # Load Hive BEFORE swapping the parquet file: product_dim_sk is lazy and is
        # recomputed from the current dimension file, so after the swap it would be
        # evaluated against the new file and no longer contain the new products.
        # (Bucketing arguments are not passed; write_df_to_table takes partition columns only.)
        write_df_to_table('retail_DWH', 'Product_Dim', product_dim_sk)

        # Replace the old file with the merged one; stop on the first failing step.
        subprocess.run(["hadoop", "fs", "-rm", path_to_check], check=True)
        subprocess.run(["hadoop", "fs", "-mv", f"{golden_layer_path}/tmp/{name}{file_extension}", path_to_check], check=True)

        # Re-read so later cells join against the file that now exists on disk.
        updated_product_dim = spark.read.parquet(path_to_check)
    else:
        updated_product_dim = existing_product_dim
        print("No new Data")
else:
    # Initial load: number all products sequentially by product_id.
    window_spec = Window.orderBy("product_id")
    updated_product_dim = (
        product_dim
        .withColumn('product_sur_key', row_number().over(window_spec))
        .select(*product_dim_columns)
        # Repartition the frame that is actually written (the original repartitioned
        # a different variable), so exactly one file is produced for rename_in_hdfs.
        .repartition(1)
    )

    updated_product_dim.show()

    # Write the dimension to HDFS
    updated_product_dim.write.mode('overwrite') \
        .format('parquet') \
        .save(golden_layer_path)

    # Give the single output file its fixed name
    rename_in_hdfs(golden_layer_path, file_extension,name)

    write_df_to_table('retail_DWH', 'Product_Dim', updated_product_dim)


+----------+------------+----------------+
|product_id|product_name|product_category|
+----------+------------+----------------+
+----------+------------+----------------+

30
No new Data


In [11]:
# Maintains the branches dimension as one parquet file on HDFS plus the
# retail_DWH.Branches_Dim Hive table, from the raw branches parquet of the bronze layer.
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/branches_SS_raw_{date_str}_{hour_str}.parquet"
golden_layer_path = "hdfs:///data/retail_gold/branches_dim"
file_extension = ".parquet"
name = "branches_dim"
path_to_check = f"hdfs:///data/retail_gold/branches_dim/{name}{file_extension}"

branch_columns = ['branch_id', 'location', 'establish_date', 'class']
branch_dim_columns = ['branch_sur_key'] + branch_columns

# Load the raw parquet data into a PySpark DataFrame
branches_dim = spark.read.parquet(file_path)

# Convert establish_date to date type if needed
branches_dim = branches_dim.withColumn("establish_date", col("establish_date").cast("date"))
branches_dim.show()

# One row per branch_id
branches_dim = branches_dim.dropDuplicates(['branch_id'])

if check_if_exists(path_to_check):
    existing_branch_dim = spark.read.parquet(path_to_check).select(*branch_dim_columns)

    # Anti-join on the business key. This does not depend on the raw file's column order
    # (subtract() compares by position) and does not give an existing branch a second
    # surrogate key when one of its attributes changes.
    new_branches_data = branches_dim.join(existing_branch_dim.select('branch_id'), on='branch_id', how='left_anti')
    new_branches_data.select(*branch_columns).show()

    # Highest surrogate key handed out so far (0 when the existing file holds no rows).
    max_sur_key = existing_branch_dim.agg({"branch_sur_key": "max"}).collect()[0][0] or 0
    print(max_sur_key)

    if new_branches_data.take(1):
        window_spec = Window.orderBy("branch_id")
        # New rows continue the key sequence from max_sur_key + 1.
        branches_dim_sk = (
            new_branches_data
            .withColumn('branch_sur_key', (row_number().over(window_spec) + max_sur_key).cast("int"))
            .select(*branch_dim_columns)
        )

        # union() matches columns by position, hence the explicit column order on both sides.
        existing_branch_dim.union(branches_dim_sk) \
                    .repartition(1) \
                    .write.mode('overwrite') \
                    .format('parquet') \
                    .save(f"{golden_layer_path}/tmp")
        rename_in_hdfs(f"{golden_layer_path}/tmp", file_extension, name)

        # Load Hive BEFORE swapping the parquet file: branches_dim_sk is lazy and is
        # recomputed from the current dimension file, so after the swap it would be
        # evaluated against the new file and no longer contain the new branches.
        write_df_to_table('retail_DWH', 'Branches_Dim', branches_dim_sk)

        # Replace the old file with the merged one; stop on the first failing step.
        subprocess.run(["hadoop", "fs", "-rm", path_to_check], check=True)
        subprocess.run(["hadoop", "fs", "-mv", f"{golden_layer_path}/tmp/{name}{file_extension}", path_to_check], check=True)

        # Re-read so later cells join against the file that now exists on disk.
        updated_branches_dim = spark.read.parquet(path_to_check)
    else:
        updated_branches_dim = existing_branch_dim
        print("No new Data")
else:
    # Initial load: number all branches sequentially by branch_id.
    window_spec = Window.orderBy("branch_id")
    updated_branches_dim = (
        branches_dim
        .withColumn('branch_sur_key', row_number().over(window_spec))
        .select(*branch_dim_columns)
        .repartition(1)  # one output file, so rename_in_hdfs can give it a fixed name
    )

    updated_branches_dim.show()
    write_df_to_table('retail_DWH', 'Branches_Dim', updated_branches_dim)
    # Write the dimension to HDFS
    updated_branches_dim.write.mode('overwrite') \
        .format('parquet') \
        .save(golden_layer_path)

    rename_in_hdfs(golden_layer_path, file_extension, name)
    


+---------+-----------+--------------+-----+
|branch_id|   location|establish_date|class|
+---------+-----------+--------------+-----+
|        1|   New York|    2017-01-15|    A|
|        2|Los Angeles|    2016-07-28|    B|
|        3|    Chicago|    2015-03-10|    A|
|        4|    Houston|    2016-11-05|    D|
|        5|    Phoenix|    2017-09-20|    C|
|        6|   Oklahoma|    2016-09-20|    A|
+---------+-----------+--------------+-----+

+---------+--------+--------------+-----+
|branch_id|location|establish_date|class|
+---------+--------+--------------+-----+
+---------+--------+--------------+-----+

6
No new Data


In [12]:
# Maintains the sales agent dimension as one parquet file on HDFS plus the
# retail_DWH.sales_agents_Dim Hive table, from the raw agents parquet of the bronze layer.
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_agents_SS_raw_{date_str}_{hour_str}.parquet"
golden_layer_path = "hdfs:///data/retail_gold/sales_agent_dim"
file_extension = ".parquet"
name = "sales_agent_dim"
path_to_check = f"hdfs:///data/retail_gold/sales_agent_dim/{name}{file_extension}"

agent_columns = ['sales_person_id', 'name', 'hire_date']
agent_dim_columns = ['sales_agent_sur_key'] + agent_columns

# Load the raw parquet data into a PySpark DataFrame
agent_dim = spark.read.parquet(file_path)

# Convert hire_date to date type if needed
agent_dim = agent_dim.withColumn("hire_date", col("hire_date").cast("date"))

# One row per sales_person_id
agent_dim = agent_dim.dropDuplicates(['sales_person_id'])

if check_if_exists(path_to_check):
    existing_agent_dim = (
        spark.read.parquet(path_to_check)
        .withColumn('sales_agent_sur_key', col("sales_agent_sur_key").cast("int"))
        .select(*agent_dim_columns)
    )

    # Anti-join on the business key. This does not depend on the raw file's column order
    # (subtract() compares by position) and does not give an existing agent a second
    # surrogate key when one of their attributes changes.
    new_sales_agent_data = agent_dim.join(existing_agent_dim.select('sales_person_id'), on='sales_person_id', how='left_anti')
    existing_agent_dim.show()

    # Highest surrogate key handed out so far (0 when the existing file holds no rows).
    max_sur_key = existing_agent_dim.agg({"sales_agent_sur_key": "max"}).collect()[0][0] or 0
    print(max_sur_key)

    if new_sales_agent_data.take(1):
        # Add surrogate keys to new data starting from max_sur_key + 1
        window_spec = Window.orderBy("sales_person_id")
        # withColumn appends the key as the LAST column while the stored dimension has it
        # FIRST. union() and `INSERT ... SELECT *` both match by position, so the columns
        # are put into the dimension's order before either is used.
        agent_dim_with_sk = (
            new_sales_agent_data
            .withColumn('sales_agent_sur_key', (row_number().over(window_spec) + max_sur_key).cast("int"))
            .select(*agent_dim_columns)
        )

        # Combine existing data with new data and stage it next to the current file
        existing_agent_dim.union(agent_dim_with_sk) \
            .repartition(1) \
            .write.mode('overwrite') \
            .format('parquet') \
            .save(f"{golden_layer_path}/tmp")
        rename_in_hdfs(f"{golden_layer_path}/tmp", file_extension, name)

        # Load Hive BEFORE swapping the parquet file: agent_dim_with_sk is lazy and is
        # recomputed from the current dimension file, so after the swap it would be
        # evaluated against the new file and no longer contain the new agents.
        write_df_to_table('retail_DWH', 'sales_agents_Dim', agent_dim_with_sk)

        # Replace the old file with the merged one; stop on the first failing step.
        subprocess.run(["hadoop", "fs", "-rm", path_to_check], check=True)
        subprocess.run(["hadoop", "fs", "-mv", f"{golden_layer_path}/tmp/{name}{file_extension}", path_to_check], check=True)

        # Re-read so later cells join against the file that now exists on disk.
        updated_agent_dim = spark.read.parquet(path_to_check)
    else:
        updated_agent_dim = existing_agent_dim
        print("No new Data")
else:
    # Initial load: number all agents sequentially by sales_person_id.
    window_spec = Window.orderBy("sales_person_id")
    updated_agent_dim = (
        agent_dim
        .withColumn('sales_agent_sur_key', row_number().over(window_spec))
        .select(*agent_dim_columns)
        .repartition(1)  # one output file, so rename_in_hdfs can give it a fixed name
    )
    updated_agent_dim.show()

    # Write the dimension to HDFS
    updated_agent_dim.write.mode('overwrite') \
        .format('parquet') \
        .save(golden_layer_path)

    rename_in_hdfs(golden_layer_path, file_extension, name)

    write_df_to_table('retail_DWH', 'sales_agents_Dim', updated_agent_dim)

+-------------------+---------------+------------------+----------+
|sales_agent_sur_key|sales_person_id|              name| hire_date|
+-------------------+---------------+------------------+----------+
|                  1|              1|          John Doe|2020-06-10|
|                  2|              2|        Jane Smith|2021-06-08|
|                  3|              3|   Michael Johnson|2019-07-22|
|                  4|              4|       Emily Brown|2018-11-12|
|                  5|              5|      David Wilson|2020-06-23|
|                  6|              6|       Emma Taylor|2018-08-09|
|                  7|              7|Christopher Miller|2018-07-05|
|                  8|              8|      Olivia Davis|2019-12-08|
|                  9|              9|   Daniel Martinez|2019-07-19|
|                 10|             10|      Sophia Moore|2019-11-10|
|                 11|             11|         john wick|2018-07-10|
+-------------------+---------------+-----------

In [13]:
#create date dimension
# Builds the calendar dimension once (one row per day, 2012-01-01 .. 2100-12-31); on later
# runs the existing parquet file is simply loaded.
date_dim_path = "hdfs:///data/retail_gold/date_dim"
name = "date_dim"
extension = ".parquet"
if not check_if_exists(f"{date_dim_path}/{name}{extension}"):
    # Generate date range
    start_date = date(2012, 1, 1)
    end_date = date(2100, 12, 31)

    date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
    date_df = spark.createDataFrame([(d,) for d in date_range], ["datee"]).withColumn("datee", col("datee").cast("date"))

    # Add date attributes
    date_dim = date_df.withColumn("year", year(col("datee"))) \
        .withColumn("month", month(col("datee"))) \
        .withColumn("day", lpad(dayofmonth(col("datee")), 2, "0")) \
        .withColumn("week", weekofyear(col("datee"))) \
        .withColumn("weekday", dayofweek(col("datee"))) \
        .withColumn("quarter", floor((month(col("datee")) - 1) / 3) + 1) \
        .withColumn("day_name", date_format(col("datee"), "EEEE")) \
        .withColumn("month_name", date_format(col("datee"), "MMMM")) \
        .withColumn("is_weekend", when(col("weekday").isin([1, 7]), lit(1)).otherwise(lit(0)))

    # Surrogate key in yyyyMMdd form. Formatting the date directly zero-pads the month;
    # concatenating the year/month/day columns left the month unpadded and produced
    # 7-digit keys for January-September (e.g. 2012101 instead of 20120101).
    # NOTE(review): fact rows already stored with keys in the old format will not match a
    # rebuilt dimension - confirm whether any exist before regenerating.
    date_dim = date_dim.withColumn("date_sur_key", date_format(col("datee"), "yyyyMMdd").cast('long'))

    try:
        # Write the date dimension to a single parquet file
        date_dim.repartition(1) \
            .write.mode('overwrite') \
            .format('parquet') \
            .save(date_dim_path)
        print(f"Date dimension table saved to {date_dim_path}")
        write_df_to_table('retail_DWH', 'date_dim', date_dim, ['year'])
    except Exception as e:
        print(f"An error occurred: {e}")

    # Give the single parquet file its fixed name
    rename_in_hdfs(date_dim_path, extension, name)
else:
    date_dim = spark.read.parquet(f"{date_dim_path}/{name}{extension}")
    print("Date dimension already exists")

Date dimension already exists


In [19]:
# Create the year-partitioned, parquet-backed date table in Hive.
# IF NOT EXISTS keeps the cell re-runnable: without it a second execution fails because
# the table already exists.
# NOTE(review): the date dimension is inserted into `date_dim` elsewhere in this notebook
# while this creates `date_dim_2` - confirm which table is the intended target.
spark.sql("""
    CREATE TABLE IF NOT EXISTS retail_dwh.date_dim_2 (
        datee DATE,
        month INT,
        day INT,
        week INT,
        weekday INT,
        quarter BIGINT,
        day_name STRING,
        month_name STRING,
        is_weekend INT,
        date_sur_key BIGINT
    )
    PARTITIONED BY (year int)
    STORED AS PARQUET
""")

In [34]:
 #fact One (offline)
#print(input.columns)
# Offline (in-branch) sales fact: filter the batch, compute the discounted total, resolve
# every dimension's surrogate key and append the result to the gold layer and to Hive.
offline_fact=input_df.filter(col('is_online')=="no")
# Descriptive columns that live in the dimensions ('customer_lname' was misspelled as
# 'cusomter_lname', which drop() silently ignored).
columns_to_drop=['shipping_address','customer_fname','customer_lname','offer_1','offer_2',
                'offer_3','offer_4','offer_5','product_name','product_category','customer_email']

offline_fact=offline_fact.drop(*columns_to_drop)
offline_fact=offline_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))

# total = units * unit price, reduced by the discount percentage
final_price=(col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
offline_fact=offline_fact.withColumn("total_price",final_price)

# Left joins keep every transaction even when a dimension row is missing.
offline_fact=offline_fact.join(updated_customers_dim, on='customer_id', how='left') \
                       .join(updated_product_dim, on='product_id', how='left') \
                       .join(date_dim, date_dim.datee == offline_fact.transaction_date, 'left') \
                       .join(updated_agent_dim, updated_agent_dim.sales_person_id == offline_fact.sales_agent_id, 'left') \
                       .join(updated_branches_dim, updated_branches_dim.branch_id == offline_fact.branch_id, 'left')

# Load date of this run, as a yyyyMMdd string
offline_fact = offline_fact.withColumn("insertion_date", date_format(lit(current_date()), "yyyyMMdd"))

offline_fact = offline_fact.select(
    'transaction_id',
    'branch_sur_key',
    'product_sur_key',
    'customer_sur_key',
    'sales_agent_sur_key',
    'date_sur_key',
    'units',
    'unit_price',
    'discount_perc',
    'total_price',
    'payment_method',
    'insertion_date'
)

fact_off_dim_path="hdfs:///data/retail_gold/offline_fact"
name ="offline_fact"
extension = ".parquet"
full_file_path = f"{fact_off_dim_path}/{name}{extension}"
if check_if_exists(full_file_path):
    old_df = spark.read.parquet(full_file_path)
    # Rows of this batch that are not stored yet.
    # NOTE(review): insertion_date takes part in the comparison, so re-processing the same
    # batch on a later day counts every row as new - confirm whether that is intended.
    new_offline_fact = offline_fact.subtract(old_df)
    if new_offline_fact.take(1):
        # History plus the new rows. (The original unioned the new rows with the current
        # batch, which dropped all stored history and wrote the new rows twice.)
        merged_offline_fact = old_df.union(new_offline_fact).repartition(1)
        merged_offline_fact.write.mode('overwrite') \
                    .format('parquet') \
                    .save(f"{fact_off_dim_path}/tmp")
        rename_in_hdfs(f"{fact_off_dim_path}/tmp",extension,name)

        # Load Hive BEFORE swapping the parquet file: new_offline_fact is lazy and is
        # recomputed against the stored file, so after the swap it would be evaluated
        # against the merged file and come out empty.
        write_df_to_table('retail_DWH', 'branches_TRX_fact', new_offline_fact,['payment_method'])

        # Replace the old file with the merged one; stop on the first failing step.
        subprocess.run(["hadoop", "fs", "-rm", full_file_path], check=True)
        subprocess.run(["hadoop", "fs", "-mv", f"{fact_off_dim_path}/tmp/{name}{extension}", full_file_path], check=True)

        # Point the variable at the file that now exists on disk.
        offline_fact = spark.read.parquet(full_file_path)
        print("done")
    else:
        print("No new Data")

else:
    # Initial load: the batch becomes the fact table.
    offline_fact=offline_fact.repartition(1)
    offline_fact.write.mode('overwrite') \
                .format('parquet') \
                .save(fact_off_dim_path)

    rename_in_hdfs(fact_off_dim_path,extension,name)
    write_df_to_table('retail_DWH', 'branches_TRX_fact', offline_fact,['payment_method'])

File moved and renamed to: hdfs:///data/retail_gold/offline_fact/offline_fact.parquet
Data written successfully to retail_DWH.branches_TRX_fact


In [33]:
# online_fact
# Online sales fact: filter the batch, compute the discounted total, split the shipping
# address, resolve the dimensions' surrogate keys and append the result to the gold layer
# and to Hive.
online_fact = input_df.filter(col('is_online')=="yes")

# Descriptive columns that live in the dimensions ('customer_lname' was misspelled as
# 'cusomter_lname', which drop() silently ignored).
columns_to_drop = ['customer_fname','customer_lname','sales_agent_id','offer_1','offer_2',
                'offer_3','offer_4','offer_5','product_name','product_category','customer_email']

online_fact = online_fact.drop(*columns_to_drop)

# cast transaction_date to date type
online_fact=online_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))

# total = units * unit price, reduced by the discount percentage
final_price=(col('units') * col('unit_price') * (1 - (col('discount_perc') / 100)))
online_fact=online_fact.withColumn("total_price",final_price)

# shipping_address is split on '/' into street / city / state / postal_code
split_address_col=split(col("shipping_address"),'/')
online_fact=online_fact.withColumn('street',split_address_col.getItem(0))\
                        .withColumn('city',split_address_col.getItem(1))\
                        .withColumn('state',split_address_col.getItem(2))\
                        .withColumn('postal_code',split_address_col.getItem(3))

# Join with dimension tables using left join (keeps transactions without a dimension match)
online_fact = online_fact.join(updated_customers_dim, on='customer_id', how='left') \
                         .join(updated_product_dim, on='product_id', how='left') \
                         .join(date_dim, date_dim.datee == online_fact.transaction_date, 'left')

# Load date of this run, as a yyyyMMdd string
online_fact = online_fact.withColumn('insertion_date', date_format(lit(current_date()), "yyyyMMdd"))


online_fact = online_fact.select(
    'transaction_id',
    'units',
    'unit_price',
    'payment_method',
    'discount_perc',
    'total_price',
    'customer_sur_key',
    'product_sur_key',
    'date_sur_key',
    'street',
    'city',
    'state',
    'postal_code',
    'insertion_date'
)

online_fact_path="hdfs:///data/retail_gold/online_fact"
name="online_fact"
extension = ".parquet"
full_file_path = f"{online_fact_path}/{name}{extension}"

if check_if_exists(full_file_path):
    old_df = spark.read.parquet(full_file_path)
    # Rows of this batch that are not stored yet.
    # NOTE(review): insertion_date takes part in the comparison, so re-processing the same
    # batch on a later day counts every row as new - confirm whether that is intended.
    new_online_fact = online_fact.subtract(old_df)
    if new_online_fact.take(1):
        # History plus the new rows. (The original unioned the new rows with the current
        # batch, which dropped all stored history and wrote the new rows twice.)
        merged_online_fact = old_df.union(new_online_fact).repartition(1)
        merged_online_fact.write.mode('overwrite') \
                    .format('parquet') \
                    .save(f"{online_fact_path}/tmp")
        rename_in_hdfs(f"{online_fact_path}/tmp",extension,name)

        # Load Hive BEFORE swapping the parquet file: new_online_fact is lazy and is
        # recomputed against the stored file, so after the swap it would be evaluated
        # against the merged file and come out empty.
        write_df_to_table('retail_DWH', 'online_TRX_fact', new_online_fact,['payment_method'])

        # Replace the old file with the merged one; stop on the first failing step.
        subprocess.run(["hadoop", "fs", "-rm", full_file_path], check=True)
        subprocess.run(["hadoop", "fs", "-mv", f"{online_fact_path}/tmp/{name}{extension}", full_file_path], check=True)

        # Point the variable at the file that now exists on disk.
        online_fact = spark.read.parquet(full_file_path)
        print("done")
    else:
        print("No new Data")

else:
    # Initial load: the batch becomes the fact table.
    online_fact=online_fact.repartition(1)
    online_fact.write.mode('overwrite') \
             .format('parquet') \
             .save(online_fact_path)
    rename_in_hdfs(online_fact_path,extension,name)
    write_df_to_table('retail_DWH', 'online_TRX_fact', online_fact,['payment_method'])

File moved and renamed to: hdfs:///data/retail_gold/online_fact/online_fact.parquet
Data written successfully to retail_DWH.online_TRX_fact


In [56]:
# Daily report: units sold per sales agent and product for today's insertions, produced
# only when the batch hour is 23.
# NOTE(review): confirm that the batch-hour computation at the top of the notebook can
# actually yield 23, otherwise this branch never runs.
if hour_str == 23:
    offline_fact = spark.read.parquet("hdfs:///data/retail_gold/offline_fact/offline_fact.parquet")
    # insertion_date is stored as a yyyyMMdd string, the same format as date_str
    daily_fact = offline_fact.filter(col('insertion_date') == date_str)
    daily_fact = daily_fact.join(updated_agent_dim, on ='sales_agent_sur_key' , how='left') \
                            .join(updated_product_dim, on='product_sur_key', how='left')
    daily_fact = daily_fact.select('name', 'product_name', 'units')
    daily_report = daily_fact.groupBy("name","product_name").agg({"units": "sum"})
    daily_report.show()
    # Single partition so the report is written as one CSV part file
    daily_report = daily_report.coalesce(1)
    # Overwrite so re-running the cell on the same day replaces the report instead of
    # failing because the output path already exists.
    daily_report.write.mode("overwrite").option("header", "true").csv(f"file:///home/itversity/itversity-material/Retail_pipeline_project/Daily_report/report_{date_str}")

In [30]:
#updated_customers_dim.printSchema()

root
 |-- customer_sur_key: integer (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- cusomter_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)



In [31]:
#updated_product_dim.printSchema()

root
 |-- product_sur_key: integer (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)



In [39]:
#date_dim.printSchema()

root
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- quarter: long (nullable = true)
 |-- day_name: string (nullable = true)
 |-- month_name: string (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- date_sur_key: long (nullable = true)



In [27]:
#online_fact.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- discount_perc: integer (nullable = true)
 |-- total_price: double (nullable = true)
 |-- customer_sur_key: integer (nullable = true)
 |-- product_sur_key: integer (nullable = true)
 |-- date_sur_key: long (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)



In [34]:
#offline_fact.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- branch_sur_key: integer (nullable = true)
 |-- product_sur_key: integer (nullable = true)
 |-- customer_sur_key: integer (nullable = true)
 |-- sales_agent_sur_key: integer (nullable = true)
 |-- date_sur_key: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount_perc: integer (nullable = true)
 |-- total_price: double (nullable = true)
 |-- payment_method: string (nullable = true)



In [35]:
#updated_agent_dim.printSchema()

root
 |-- sales_agent_sur_key: integer (nullable = true)
 |-- sales_person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- hire_date: date (nullable = true)



In [36]:
#updated_branches_dim.printSchema()

root
 |-- branch_sur_key: integer (nullable = true)
 |-- branch_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- establish_date: date (nullable = true)
 |-- class: string (nullable = true)



In [50]:
#cust = spark.read.parquet("hdfs:///data/retail_gold/customer_dim/")
#cust.show()
#cust.printSchema()

+----------------+-----------+--------------+--------------+--------------------+
|customer_sur_key|customer_id|customer_fname|cusomter_lname|      customer_email|
+----------------+-----------+--------------+--------------+--------------------+
|               1|      85462|        Olivia|         Brown|olivia.brown@yaho...|
|               2|      85463|           Mia|      Williams|mia.williams@gmai...|
|               3|      85464|          Emma|      Williams|emma.williams@out...|
|               4|      85465|         James|        Taylor|james.taylor@gmai...|
|               5|      85466|       Michael|         Brown|michael.brown@yah...|
|               6|      85467|     Alexander|         Jones|alexander.jones@y...|
|               7|      85468|       William|         Davis|william.davis@yah...|
|               8|      85469|     Alexander|         Brown|alexander.brown@g...|
|               9|      85470|           Ava|        Wilson|ava.wilson@hotmai...|
|              1

In [32]:
#spark.sql("DESCRIBE FORMATTED retail_DWH.branches_TRX_fact").show(50,truncate=False)

#SET hive.exec.dynamic.partition = true;
#SET hive.exec.dynamic.partition.mode = nonstrict;
#SET hive.mapred.mode = nonstrict;


#date_dim

key,value
hive.enforce.buck...,True
