In [0]:
%run "./transform"

In [0]:
%run "./extract"

In [0]:
%run "./load"

In [0]:
# One-time setup: create the 'output' folder that stores the results (uncomment the line below to run it)
# dbutils.fs.mkdirs("dbfs:/FileStore/tables/output")

In [0]:
class AirpodsAfterIphoneWorkFlow:
    """
    Extract -> transform -> load pipeline that produces the details of
    customers who have bought Airpods just after an iPhone.
    """

    def __init__(self):
        """Nothing to configure; the workflow keeps no state on the instance."""

    def runner(self):
        """
        Execute the three pipeline stages in order.

        The output of AirpodsAfterIphoneExtractor().extract() is passed to
        AirpodsAfterIphone().transform(), and the transformed result is handed
        to AirpodsAfterIphoneLoader(...).sink(). Nothing is returned.
        """
        # Extract: collect the required input data from the sources
        extractor = AirpodsAfterIphoneExtractor()
        source_frames = extractor.extract()

        # Transform: customers who bought Airpods immediately after the iPhone
        transformer = AirpodsAfterIphone()
        airpods_after_iphone_df = transformer.transform(source_frames)

        # Load: write the transformed result to its sink
        loader = AirpodsAfterIphoneLoader(airpods_after_iphone_df)
        loader.sink()


In [0]:
class OnlyAirpodsAfterIphoneWorkFlow:
    """
    Extract -> transform -> load pipeline that produces the details of
    customers who have bought both Airpods and iPhone only.
    """

    def __init__(self):
        """Nothing to configure; the workflow keeps no state on the instance."""

    def runner(self):
        """
        Execute the three pipeline stages in order.

        The output of AirpodsAfterIphoneExtractor().extract() is passed to
        OnlyAirpodsAfterIphone().transform(), and the transformed result is
        handed to OnlyAirpodsAndIphoneLoader(...).sink(). Nothing is returned.
        """
        # Extract: collect the required input data from the sources
        extractor = AirpodsAfterIphoneExtractor()
        source_frames = extractor.extract()

        # Transform: customers who have bought both Airpods and iPhone
        transformer = OnlyAirpodsAfterIphone()
        only_airpods_iphone_df = transformer.transform(source_frames)

        # Load: write the transformed result to its sink
        loader = OnlyAirpodsAndIphoneLoader(only_airpods_iphone_df)
        loader.sink()


In [0]:
class ProductsAfterInitialPurchaseWorkFlow:
    """
    Extract -> transform -> load pipeline that produces the details of
    products bought by customers after their initial purchase.
    """

    def __init__(self):
        """Nothing to configure; the workflow keeps no state on the instance."""

    def runner(self):
        """
        Execute the three pipeline stages in order.

        The output of AirpodsAfterIphoneExtractor().extract() is passed to
        ProductsAfterInitialPurchase().transform(), and the transformed result
        is handed to ProductsAfterInitialPurchaseLoader(...).sink(). Nothing
        is returned.
        """
        # Extract: collect the required input data from the sources
        extractor = AirpodsAfterIphoneExtractor()
        source_frames = extractor.extract()

        # Transform: list all products bought after the initial purchase
        transformer = ProductsAfterInitialPurchase()
        later_products_df = transformer.transform(source_frames)

        # Load: write the transformed result to its sink
        loader = ProductsAfterInitialPurchaseLoader(later_products_df)
        loader.sink()


In [0]:
class AirpodsPurchaseDelayWorkFlow:
    """
    Extract -> transform -> load pipeline that produces the average time delay
    for customers who bought AirPods after purchasing an iPhone.
    """

    def __init__(self):
        """Nothing to configure; the workflow keeps no state on the instance."""

    def runner(self):
        """
        Execute the three pipeline stages in order.

        The output of AirpodsAfterIphoneExtractor().extract() is passed to
        AirpodsPurchaseDelayTransformer().transform(), and the transformed
        result is handed to AirpodsPurchaseDelayLoader(...).sink(). Nothing is
        returned.
        """
        # Extract: collect the required input data from the sources
        extractor = AirpodsAfterIphoneExtractor()
        source_frames = extractor.extract()

        # Transform: average delay between the iPhone and AirPods purchases
        transformer = AirpodsPurchaseDelayTransformer()
        purchase_delay_df = transformer.transform(source_frames)

        # Load: write the transformed result to its sink
        loader = AirpodsPurchaseDelayLoader(purchase_delay_df)
        loader.sink()

In [0]:
class WorkFlowRunner:
    """
    Dispatches a workflow key to the matching workflow class and runs it.

    Recognized keys are "firstWorkflow", "secondWorkflow", "thirdWorkflow"
    and "fourthWorkflow".
    """

    def __init__(self, name):
        """
        Args:
            name: Key identifying which workflow runner() should execute.
        """
        self.name = name

    def runner(self):
        """
        Run the workflow selected by ``self.name``.

        Returns:
            Whatever the selected workflow's ``runner()`` returns.

        Raises:
            ValueError: If ``self.name`` is not one of the recognized keys.
        """
        if self.name == "firstWorkflow":
            return AirpodsAfterIphoneWorkFlow().runner()
        elif self.name == "secondWorkflow":
            return OnlyAirpodsAfterIphoneWorkFlow().runner()
        elif self.name == "thirdWorkflow":
            return ProductsAfterInitialPurchaseWorkFlow().runner()
        elif self.name == "fourthWorkflow":
            return AirpodsPurchaseDelayWorkFlow().runner()
        else:
            # Raise rather than return the exception object: returning it made an
            # unknown key look like a successful run that silently did nothing.
            raise ValueError(f"Not Implemented for {self.name}")

# Key of the workflow to execute; WorkFlowRunner.runner() compares it against
# "firstWorkflow", "secondWorkflow", "thirdWorkflow" and "fourthWorkflow".
name = "fourthWorkflow"

# Runs the selected workflow as soon as this cell executes. Despite its name,
# `workflowRunner` holds the value returned by runner(), not a WorkFlowRunner
# instance.
workflowRunner = WorkFlowRunner(name).runner()


+--------------+-----------+------------+----------------+
|transaction_id|customer_id|product_name|transaction_date|
+--------------+-----------+------------+----------------+
|            11|        105|      iPhone|      2022-02-01|
|            14|        105|     AirPods|      2022-02-04|
|            18|        105|     MacBook|      2022-02-08|
|            12|        106|      iPhone|      2022-02-02|
|            16|        106|     MacBook|      2022-02-06|
|            20|        106|     AirPods|      2022-02-10|
|            13|        107|     AirPods|      2022-02-03|
|            17|        107|      iPhone|      2022-02-07|
|            15|        108|      iPhone|      2022-02-05|
|            19|        108|     AirPods|      2022-02-09|
+--------------+-----------+------------+----------------+

+-----------+--------------------+---------------------+
|customer_id|iphone_purchase_date|airpods_purchase_date|
+-----------+--------------------+---------------------+
| 

In [0]:
from pyspark.sql import SparkSession

# Get or create the Spark session used by this cell.
spark = (
    SparkSession.builder
    .appName("spark_trial")
    .getOrCreate()
)

# Read the transactions CSV with the header option enabled.
input_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dbfs:/FileStore/tables/Transaction_Updated.csv")
)

# Show the loaded rows in the cell output.
input_df.show()