In [2]:
import re
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.sql.functions import col, regexp_replace, trim,when ,monotonically_increasing_id,lit,year, month, dayofmonth, weekofyear, dayofweek, date_format,floor,dense_rank,\
substring,concat,split, row_number
from pyspark.sql.window import Window
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
from datetime import date, datetime, timedelta
import subprocess
from py4j.java_gateway import java_import
import os
from pyspark.sql.types import DateType

In [3]:
spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("sales_transactions")\
    .config("spark.eventLog.logBlockUpdates.enabled", True)\
    .enableHiveSupport()\
    .getOrCreate()

sc = spark.sparkContext

In [4]:
now = datetime.now()
date_str = now.strftime("%Y%m%d")
hour_str = now.strftime("%H")
print(date_str, hour_str)

20240708 18


In [15]:
input_trans = spark.read.csv(f"hdfs:///data/retail_silver/20240703/14/sales_transactions_SS_cleaned_20240703_14.csv", header='true')


In [16]:
#function_to_rename_in_hdfs
def rename_in_hdfs(golden_layer_path,file_extension,name):
    # Run the Hadoop fs -ls command to list files
    list_files_process = subprocess.run(["hadoop", "fs", "-ls", golden_layer_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check for errors
    if list_files_process.returncode != 0:
        print(f"Error listing files in {golden_layer_path}: {list_files_process.stderr.decode()}")
        exit(1)

    # Decode stdout to string format and split lines
    stdout_str = list_files_process.stdout.decode()
    file_list = stdout_str.splitlines()

    # Find the file to rename based on criteria
    file_to_rename = None
    for line in file_list:
        if line.endswith(file_extension):
            file_to_rename = line.split()[-1].strip()
            break

    # Check if a file matching the criteria was found
    if file_to_rename:
        new_filename = f"{golden_layer_path}/{name}{file_extension}"

        # Move (rename) the file
        subprocess.run(["hadoop", "fs", "-mv", file_to_rename, new_filename])

        print(f"File moved and renamed to: {new_filename}")
    else:
        print("File matching the criteria not found.")

In [7]:
# Function to check if a file exists in HDFS
def check_if_exists(path):
    jvm = spark._jvm
    jsc = spark._jsc
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jsc.hadoopConfiguration())
    return fs.exists(jvm.org.apache.hadoop.fs.Path(path))

In [19]:
#write customer dim in HDFS
cust_dim = input_trans.select('customer_id', 'customer_fname', 'cusomter_lname', 'customer_email')
cust_dim = cust_dim.dropDuplicates(['customer_id'])

golden_layer_path="hdfs:///data/golden_layer/cust_dim"
file_extension = ".csv"
name='cust_dim'
path_to_check = f"hdfs:///data/golden_layer/cust_dim/{name}{file_extension}"

if check_if_exists(golden_layer_path):
    existing_cust_dim = spark.read.csv("/data/golden_layer/cust_dim/cust_dim.csv", header='true')
    existing_cust_dim_without_sk= existing_cust_dim.select('customer_id', 'customer_fname', 'cusomter_lname', 'customer_email')
    new_customers_data = cust_dim.subtract(existing_cust_dim_without_sk)
    print(new_customers_data)
    
    # Get the maximum surrogate key from existing data
    max_sur_key = existing_cust_dim.agg({"customer_sur_key": "max"}).collect()[0][0]
    print(max_sur_key)
    # Combine existing data with new data
    
    if new_customers_data.rdd.isEmpty() == False:
        window_spec = Window.orderBy("customer_id")
       # Add surrogate keys to new data starting from max_sur_key + 1
        customers_dim = new_customers_data.withColumn('customer_sur_key', (row_number().over(window_spec) + max_sur_key).cast("int"))
        customers_dim_sk = customers_dim.select('customer_sur_key','customer_id', 'customer_fname', 'cusomter_lname', 'customer_email')
        updated_customers_dim = existing_cust_dim.union(customers_dim_sk)
        updated_customers_dim = updated_customers_dim.repartition(1)
        updated_customers_dim.write.mode('overwrite') \
                    .option("header", "true") \
                    .format('csv') \
                    .save(f"{golden_layer_path}/tmp")
        
        rename_in_hdfs(f"{golden_layer_path}/tmp", file_extension, name) 
        subprocess.run(["hadoop", "fs", "-rm", path_to_check])
        subprocess.run(["hadoop", "fs", "-mv", f"{golden_layer_path}/tmp/{name}{file_extension}" ,golden_layer_path])
        print("done")
else:
    # Add a sequential surrogate key column
    window_spec = Window.orderBy("customer_id")
    cust_dim = cust_dim.withColumn('customer_sur_key', row_number().over(window_spec))

    #to write cust_dim in one file 
    cust_dim = cust_dim.repartition(1)

    #make customer dim 
    cust_dim = cust_dim.select('customer_sur_key','customer_id', 'customer_fname', 'cusomter_lname', 'customer_email') 
    cust_dim.write.mode('overwrite') \
            .option("header", "true") \
            .format('csv') \
            .save(golden_layer_path)
    cust_dim.show(5)

    rename_in_hdfs(golden_layer_path,file_extension,name)

+-----------+--------------+--------------+--------------+
|customer_id|customer_fname|cusomter_lname|customer_email|
+-----------+--------------+--------------+--------------+
+-----------+--------------+--------------+--------------+

99


In [9]:
# Ensure product_dim is distinct by product_id and add a sequential surrogate key
product_dim = input_trans.dropDuplicates(['product_id'])
window_spec = Window.orderBy("product_id")
product_dim = product_dim.withColumn('product_sur_key', row_number().over(window_spec))

# Repartition to one file for efficient writing
product_dim = product_dim.repartition(1)

# Define the golden layer path and file details
golden_layer_path = "hdfs:///data/golden_layer/product_dim"
file_extension = ".csv"
name = 'product_dim'

# Select relevant columns for product_dim
product_dim = product_dim.select('product_sur_key', 'product_id', 'product_name', 'product_category')

# Write product_dim to HDFS in CSV format
product_dim.write.mode('overwrite') \
            .option("header", "true") \
            .format('csv') \
            .save(golden_layer_path)

# Show the first 5 rows of product_dim (optional)
product_dim.show(5)

# Rename the file in HDFS if necessary
rename_in_hdfs(golden_layer_path, file_extension, name)

+---------------+----------+------------+----------------+
|product_sur_key|product_id|product_name|product_category|
+---------------+----------+------------+----------------+
|              1|         1|      Laptop|     Electronics|
|              2|        10|     Sandals|        Footwear|
|              3|        11|          TV|     Electronics|
|              4|        12|     Monitor|     Electronics|
|              5|        13|     Printer|     Electronics|
+---------------+----------+------------+----------------+
only showing top 5 rows

File moved and renamed to: hdfs:///data/golden_layer/product_dim/product_dim_20240707_22.csv


In [118]:
# Define the file path for the initial CSV data and the golden layer path on HDFS
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/branches_SS_raw_{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/branches_dim"
file_extension = ".csv"
name = "branches_dim"
path_to_check = f"hdfs:///data/golden_layer/branches_dim/{name}{file_extension}"

# Load the CSV data into a PySpark DataFrame
branches_dim = spark.read.option("header", "true").csv(file_path)

# Convert establish_date to date type if needed
branches_dim = branches_dim.withColumn("establish_date", col("establish_date").cast("date"))
print(branches_dim)

# Drop duplicates based on branch_id if necessary
branches_dim = branches_dim.dropDuplicates(['branch_id'])

if check_if_exists(path_to_check):
    existing_branch_dim = spark.read.csv(path_to_check, header=True)
    existing_branch_dim_without_sk = existing_branch_dim.select('branch_id', 'location', 'establish_date', 'class')
    
    new_branches_data = branches_dim.subtract(existing_branch_dim_without_sk)
    print(new_branches_data)
    
    # Get the maximum surrogate key from existing data
    max_sur_key = existing_branch_dim.agg({"branch_sur_key": "max"}).collect()[0][0]
    print(max_sur_key)
    # Add surrogate keys to new data starting from max_sur_key + 1

    
    # Combine existing data with new data
    
    if new_branches_data.rdd.isEmpty() == False:
        window_spec = Window.orderBy("branch_id")
        branches_dim = new_branches_data.withColumn('branch_sur_key', (row_number().over(window_spec) + max_sur_key).cast("int"))
        branches_dim_sk = branches_dim.select('branch_sur_key','branch_id', 'location', 'establish_date', 'class')
        updated_branches_dim = existing_branch_dim.union(branches_dim_sk)
        updated_branches_dim = updated_branches_dim.repartition(1)
        updated_branches_dim.write.mode('overwrite') \
                    .option("header", "true") \
                    .format('csv') \
                    .save(f"{golden_layer_path}/tmp")
        
        rename_in_hdfs(f"{golden_layer_path}/tmp", file_extension, name) 
        subprocess.run(["hadoop", "fs", "-rm", path_to_check])
        subprocess.run(["hadoop", "fs", "-mv", f"{golden_layer_path}/tmp/{name}{file_extension}" ,golden_layer_path])
else:
    # Add a sequential surrogate key column
    window_spec = Window.orderBy("branch_id")
    branches_dim = branches_dim.withColumn('branch_sur_key', row_number().over(window_spec))
    updated_branches_dim = branches_dim

    updated_branches_dim = updated_branches_dim.select('branch_sur_key', 'branch_id', 'location', 'establish_date', 'class')

    updated_branches_dim.show()

    # Write the updated data back to HDFS
    updated_branches_dim.write.mode('overwrite') \
        .option("header", "true") \
        .format('csv') \
        .save(golden_layer_path)

    rename_in_hdfs(golden_layer_path, file_extension, name)

+---------+-----------+--------------+-----+
|branch_id|   location|establish_date|class|
+---------+-----------+--------------+-----+
|        1|   New York|    2017-01-15|    A|
|        2|Los Angeles|    2016-07-28|    B|
|        3|    Chicago|    2015-03-10|    A|
|        4|    Houston|    2016-11-05|    D|
|        5|    Phoenix|    2017-09-20|    C|
+---------+-----------+--------------+-----+

+---------+--------+--------------+-----+
|branch_id|location|establish_date|class|
+---------+--------+--------------+-----+
+---------+--------+--------------+-----+

5


In [88]:
# Define the file path for the initial CSV data and the golden layer path on HDFS
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/sales_agents_SS_raw_{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/sales_agent_dim"
file_extension = ".csv"
name = "sales_agent"
path_to_check = f"hdfs:///data/golden_layer/sales_agent_dim/{name}{file_extension}"

# Load the CSV data into a PySpark DataFrame
agent_dim = spark.read.option("header", "true").csv(file_path)

# Convert hire_date to date type if needed
agent_dim = agent_dim.withColumn("hire_date", col("hire_date").cast("date"))

# Drop duplicates based on sales_person_id if necessary
agent_dim = agent_dim.dropDuplicates(['sales_person_id'])

if check_if_exists(path_to_check):
    existing_agent_dim = spark.read.csv(path_to_check, header=True)
    existing_agent_dim_with_sk = existing_agent_dim.select('sales_agent_sur_key', 'sales_person_id', 'name', 'hire_date')
    existing_agent_dim_without_sk = existing_agent_dim.select('sales_person_id', 'name', 'hire_date')
    
    new_sales_agent_data = agent_dim.subtract(existing_agent_dim_without_sk)
    print(new_sales_agent_data)
    
    # Get the maximum surrogate key from existing data
    max_sur_key = existing_agent_dim_with_sk.agg({"sales_agent_sur_key": "max"}).collect()[0][0]
    print(max_sur_key)
    # Add surrogate keys to new data starting from max_sur_key + 1
    window_spec = Window.orderBy("sales_person_id")
    agent_dim = agent_dim.withColumn('sales_agent_sur_key', row_number().over(window_spec) + max_sur_key)
    
    # Combine existing data with new data
    updated_agent_dim = existing_agent_dim_with_sk.union(agent_dim)
    updated_aegnt_dim.show()
else:
    # Add a sequential surrogate key column
    window_spec = Window.orderBy("sales_person_id")
    agent_dim = agent_dim.withColumn('sales_agent_sur_key', row_number().over(window_spec))
    updated_agent_dim = agent_dim
    updated_aegnt_dim.show()
    
updated_aegnt_dim = updated_agent_dim.select('sales_agent_sur_key', 'sales_person_id', 'name', 'hire_date')

updated_aegnt_dim.show()

# Write the updated data back to HDFS
updated_agent_dim.write.mode('overwrite') \
    .option("header", "true") \
    .format('csv') \
    .save(golden_layer_path)

rename_in_hdfs(golden_layer_path, file_extension, name)

Py4JJavaError: An error occurred while calling o2338.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 729.0 failed 1 times, most recent failure: Lost task 0.0 in stage 729.0 (TID 16204, localhost, executor driver): java.io.FileNotFoundException: File does not exist: /data/golden_layer/sales_agent_dim/sales_agent.csv
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
	at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2070)
	at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:770)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:458)
	at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:532)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1020)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:948)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2952)

It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:260)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2088)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2107)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:370)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3388)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3369)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3368)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor107.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.FileNotFoundException: File does not exist: /data/golden_layer/sales_agent_dim/sales_agent.csv
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
	at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2070)
	at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:770)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:458)
	at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:532)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1020)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:948)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2952)

It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:260)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [15]:


# Generate date range
start_date = date(2022, 1, 1)
end_date = date(2024, 12, 31)

date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
date_df = spark.createDataFrame([(d,) for d in date_range], ["date"]).withColumn("date", col("date").cast("date"))

# Add date attributes
date_dim = date_df.withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("day", dayofmonth(col("date"))) \
    .withColumn("week", weekofyear(col("date"))) \
    .withColumn("weekday", dayofweek(col("date"))) \
    .withColumn("quarter", floor((month(col("date")) - 1) / 3) + 1) \
    .withColumn("day_name", date_format(col("date"), "EEEE")) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .withColumn("is_weekend", when(col("weekday").isin([1, 7]), lit(1)).otherwise(lit(0)))

# Add surrogate key column
date_dim = date_dim.withColumn("date_sur_key", concat(col('day'),col('month'),col('year')))


# Define the output directory for the date dimension
date_dim_path = "hdfs:///data/golden_layer/date_dim"

try:
    # Write the date dimension to a single CSV file
    date_dim.repartition(1) \
        .write.mode('overwrite') \
        .option("header", "true") \
        .format('csv') \
        .save(date_dim_path)
    print(f"Date dimension table saved to {date_dim_path}")
except Exception as e:
    print(f"An error occurred: {e}")
    
# to rename csv file in date dim
file_extension = ".csv"
name="date_dim"

rename_in_hdfs(date_dim_path, file_extension, name)

Date dimension table saved to hdfs:///data/golden_layer/date_dim
File moved and renamed to: hdfs:///data/golden_layer/date_dim/date_dim_20240707_22.csv


In [74]:
# fact One (offline)

# Filter and transform input_trans for offline transactions
branch_transaction_fact = input_trans.filter(col('is_online') == "no")
columns_to_drop = ['shipping_address', 'customer_fname', 'customer_lname',
                   'offer_1', 'offer_2', 'offer_3', 'offer_4', 'offer_5',
                   'product_name', 'product_category', 'customer_email']
branch_transaction_fact = branch_transaction_fact.drop(*columns_to_drop)

# Convert 'transaction_date' to DateType
branch_transaction_fact = branch_transaction_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))

# Calculate total_price
final_price = (col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
branch_transaction_fact = branch_transaction_fact.withColumn("total_price", final_price)

# Join with dimension tables
branch_transaction_fact = branch_transaction_fact.join(cust_dim, on='customer_id', how='left') \
                           .join(product_dim, on='product_id', how='left') \
                           .join(date_dim, date_dim.date == branch_transaction_fact.transaction_date, 'left') \
                           .join(agent_dim, agent_dim.sales_person_id == branch_transaction_fact.sales_agent_id, 'left') \
                           .join(branches_dim, branches_dim.branch_id == branch_transaction_fact.branch_id, 'left')

branch_transaction_fact = branch_transaction_fact.select(
    'transaction_id',
    'branch_sur_key',
    'product_sur_key',
    'customer_sur_key',
    'sales_agent_sur_key',
    'date_sur_key',
    'units',
    'unit_price',
    'discount_perc',
    'total_price',
    'payment_method'
)

# Define output path for the fact table
fact_off_dim_path = "hdfs:///data/golden_layer/branch_transaction_fact"
file_extension = ".csv"

try:
    # Repartition and write the fact table to a single CSV file
    offline_fact.repartition(1) \
                .write.mode('overwrite') \
                .option("header", "true") \
                .format('csv') \
                .save(fact_off_dim_path)

    # Rename CSV file in HDFS
    name = "branch_transaction_fact"
    rename_in_hdfs(fact_off_dim_path, file_extension, name)
    print(f"Offline fact table saved to {fact_off_dim_path}/{name}{file_extension}")

except Exception as e:
    print(f"An error occurred: {e}")

NameError: name 'product_dim' is not defined

In [38]:
#online_fact

# Filter and transform input_trans for online transactions
online_transaction_fact = input_trans.filter(col('is_online') == "yes")
columns_to_drop = ['customer_fname', 'cusomter_lname', 'sales_agent_id', 'branch_id', 'offer_1', 'offer_2',
                   'offer_3', 'offer_4', 'offer_5', 'product_name', 'product_category', 'customer_email']
online_transaction_fact = online_transaction_fact.drop(*columns_to_drop)#print(online_fact.columns)

# Convert 'transaction_date' to DateType
online_transaction_fact = online_transaction_fact.withColumn("transaction_date", col("transaction_date").cast(DateType()))

# Calculate total_price
final_price = (col('units') * col('unit_price') * (1 - col('discount_perc') / 100))
online_transaction_fact = online_transaction_fact.withColumn("total_price", final_price)

# Process 'shipping_address' column to split into separate columns
split_address_col = split(col("shipping_address"), '/')
online_transaction_fact = online_transaction_fact.withColumn('street', split_address_col.getItem(0)) \
                                                 .withColumn('city', split_address_col.getItem(1)) \
                                                 .withColumn('state', split_address_col.getItem(2)) \
                                                 .withColumn('postal_code', split_address_col.getItem(3))

# Join with dimension tables using left join
online_transaction_fact = online_transaction_fact.join(cust_dim, on='customer_id', how='left') \
                                                 .join(product_dim, on='product_id', how='left') \
                                                 .join(date_dim, date_dim.date == online_transaction_fact.transaction_date, 'left')

# Select relevant columns for the fact table
online_transaction_fact = online_transaction_fact.select(
    'transaction_id',
    'units',
    'payment_method',
    'discount_perc',
    'total_price',
    'customer_sur_key',
    'product_sur_key',
    'date_sur_key',
    'street',
    'city',
    'state',
    'postal_code'
)

# Define output path for the fact table
online_fact_path = "hdfs:///data/golden_layer/online_transaction_fact"
file_extension = ".csv"

try:
    # Repartition and write the fact table to a single CSV file
    online_fact.repartition(1) \
                .write.mode('overwrite') \
                .option("header", "true") \
                .format('csv') \
                .save(online_fact_path)

    # Rename CSV file in HDFS
    name = "online_transaction_fact"
    rename_in_hdfs(online_fact_path, file_extension, name)
    print(f"Online fact table saved to {online_fact_path}/{name}{file_extension}")

except Exception as e:
    print(f"An error occurred: {e}")

File moved and renamed to: hdfs:///data/golden_layer/online_transaction_fact/online_transaction_fact_20240707_22.csv
Online fact table saved to hdfs:///data/golden_layer/online_transaction_fact/online_transaction_fact.csv


In [101]:
# Define paths and parameters
file_path = f"hdfs:///data/retail_bronze/{date_str}/{hour_str}/branches_SS_raw_{date_str}_{hour_str}.csv"
golden_layer_path = "hdfs:///data/golden_layer/branches_dim"
file_extension = ".csv"
name = "branches_dim"
path_to_check = f"hdfs:///data/golden_layer/branches_dim/{name}{file_extension}"
backup_path = f"{golden_layer_path}/{name}_backup{file_extension}"
tmp_path = f"{golden_layer_path}/tmp/{name}{file_extension}"

# Load the CSV data into a PySpark DataFrame
branches_dim = spark.read.option("header", "true").csv(file_path)

# Convert establish_date to date type if needed
branches_dim = branches_dim.withColumn("establish_date", col("establish_date").cast("date"))

# Drop duplicates based on branch_id if necessary
branches_dim = branches_dim.dropDuplicates(['branch_id'])

if check_if_exists(path_to_check):
    existing_branch_dim = spark.read.csv(path_to_check, header=True)
    existing_branch_dim_with_sk = existing_branch_dim.select('branch_sur_key', 'branch_id', 'location', 'establish_date', 'class')
    existing_branch_dim_without_sk = existing_branch_dim.select('branch_id', 'location', 'establish_date', 'class')
    
    new_branches_data = branches_dim.subtract(existing_branch_dim_without_sk)
    
    # Get the maximum surrogate key from existing data
    max_sur_key = existing_branch_dim_with_sk.agg({"branch_sur_key": "max"}).collect()[0][0]
    
    # Add surrogate keys to new data starting from max_sur_key + 1
    window_spec = Window.orderBy("branch_id")
    new_branches_data = new_branches_data.withColumn('branch_sur_key', row_number().over(window_spec) + max_sur_key)
    
    # Combine existing data with new data
    updated_branches_dim = existing_branch_dim_with_sk.union(new_branches_data)
else:
    # Add a sequential surrogate key column
    window_spec = Window.orderBy("branch_id")
    branches_dim = branches_dim.withColumn('branch_sur_key', row_number().over(window_spec))
    updated_branches_dim = branches_dim

updated_branches_dim = updated_branches_dim.select('branch_sur_key', 'branch_id', 'location', 'establish_date', 'class')

# Write the updated data to a temporary location
updated_branches_dim.repartition(1).write.mode('overwrite') \
    .option("header", "true") \
    .format('csv') \
    .save(tmp_path)

# Move the current file to a backup location
if check_if_exists(path_to_check):
    subprocess.run(["hadoop", "fs", "-mv", path_to_check, backup_path])

# Move the new file to the final location
subprocess.run(["hadoop", "fs", "-mv", tmp_path, path_to_check])

# Remove the backup file
if check_if_exists(path_to_check):
    subprocess.run(["hadoop", "fs", "-rm", backup_path])

# Show the updated DataFrame
updated_branches_dim.show()

Py4JJavaError: An error occurred while calling o4249.showString.
: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:226)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:146)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:387)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:149)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:145)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:145)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:117)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenAnti(BroadcastHashJoinExec.scala:382)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:104)
	at org.apache.spark.sql.execution.CodegenSupport$class.constructDoConsumeFunction(WholeStageCodegenExec.scala:216)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:187)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:37)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:67)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:403)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:47)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:37)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:96)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduceWithKeys(HashAggregateExec.scala:658)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduce(HashAggregateExec.scala:166)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.produce(HashAggregateExec.scala:40)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduceWithKeys(HashAggregateExec.scala:658)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduce(HashAggregateExec.scala:166)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.produce(HashAggregateExec.scala:40)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:544)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:598)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:129)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.window.WindowExec.doExecute(WindowExec.scala:302)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:43)
	at org.apache.spark.sql.execution.BaseLimitExec$class.inputRDDs(limit.scala:62)
	at org.apache.spark.sql.execution.LocalLimitExec.inputRDDs(limit.scala:98)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.UnionExec$$anonfun$doExecute$1.apply(basicPhysicalOperators.scala:590)
	at org.apache.spark.sql.execution.UnionExec$$anonfun$doExecute$1.apply(basicPhysicalOperators.scala:590)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.immutable.List.map(List.scala:296)
	at org.apache.spark.sql.execution.UnionExec.doExecute(basicPhysicalOperators.scala:590)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:252)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:344)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3388)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3369)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3368)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor107.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 819.0 failed 1 times, most recent failure: Lost task 0.0 in stage 819.0 (TID 19470, localhost, executor driver): java.io.FileNotFoundException: File does not exist: /data/golden_layer/branches_dim/branches_dim.csv/part-00000-9fa285cd-1a7d-4923-b21f-f380c13122f0-c000.csv
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
	at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2070)
	at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:770)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:458)
	at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:532)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1020)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:948)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2952)

It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:260)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2088)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2107)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2132)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:311)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1$$anonfun$apply$1.apply(BroadcastExchangeExec.scala:79)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1$$anonfun$apply$1.apply(BroadcastExchangeExec.scala:76)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withExecutionId$1.apply(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withExecutionId(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1.apply(BroadcastExchangeExec.scala:75)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1.apply(BroadcastExchangeExec.scala:75)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.FileNotFoundException: File does not exist: /data/golden_layer/branches_dim/branches_dim.csv/part-00000-9fa285cd-1a7d-4923-b21f-f380c13122f0-c000.csv
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
	at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
	at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
	at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2070)
	at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:770)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:458)
	at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:532)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1070)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1020)
	at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:948)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2952)

It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:260)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)
	... 3 more
