In [0]:
%run "./extractor"

In [0]:
%run "./transformer"

In [0]:
%run "./loader"

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session for this notebook under the app name
# "AppleDataAnalysis" (getOrCreate hands back an already-running session if
# one exists).
spark = (
    SparkSession.builder
    .appName("AppleDataAnalysis")
    .getOrCreate()
)

# Load the transactions CSV from DBFS, treating its first row as the header.
input_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dbfs:/FileStore/tables/Transaction_Updated.csv")
)

# Preview the loaded rows.
input_df.show()

+--------------+-----------+------------+----------------+
|transaction_id|customer_id|product_name|transaction_date|
+--------------+-----------+------------+----------------+
|            11|        105|      iPhone|      2022-02-01|
|            12|        106|      iPhone|      2022-02-02|
|            13|        107|     AirPods|      2022-02-03|
|            14|        105|     AirPods|      2022-02-04|
|            15|        108|      iPhone|      2022-02-05|
|            16|        106|     MacBook|      2022-02-06|
|            17|        107|      iPhone|      2022-02-07|
|            18|        105|     MacBook|      2022-02-08|
|            19|        108|     AirPods|      2022-02-09|
|            20|        106|     AirPods|      2022-02-10|
+--------------+-----------+------------+----------------+



In [0]:
# The main code to start our program's run

class AirpodsAfteriPhoneWorkflow():
    """
    ETL pipeline that generates the customers who bought AirPods just after
    buying an iPhone.

    Chains AirpodsAfteriPhoneExtractor -> AirpodsAfteriPhoneTransformer ->
    AirpodsAfteriPhoneLoader and then reads the stored Delta output back so it
    can be inspected in the notebook.
    """
    def __init__(self):
        """Nothing to configure: the workflow keeps no instance state."""

    def runner(self):
        """
        Run extract, transform and load in that order, then print the stored data.

        Uses the notebook-level `spark` session and the extractor, transformer
        and loader classes, which are not defined in this cell
        (NOTE(review): presumably provided by the %run'd notebooks -- confirm).

        Returns:
            None. The result is written by the loader and shown on stdout.
        """
        # Step 1 - extract: collect every input dataframe the use case needs.
        extracted_dfs = AirpodsAfteriPhoneExtractor().extract()

        # Step 2 - transform: apply the "AirPods right after an iPhone" logic.
        transformer = AirpodsAfteriPhoneTransformer()
        airpods_after_iphone_df = transformer.transform(inputDFs=extracted_dfs)

        # Step 3 - load: hand the transformed dataframe to the loader's sinks.
        loader = AirpodsAfteriPhoneLoader(transformedDF=airpods_after_iphone_df)
        loader.sink()

        # Manual check: read the Delta output back and display it.
        output_path = "dbfs:/FileStore/tables/apple_analysis/output/airpodsAfteriPhone"
        stored_df = spark.read.format("delta").load(output_path)
        print("Stored data:-")
        stored_df.show()


In [0]:
# The main code to start our program's run

class OnlyAirpodsAndiPhoneWorkflow():
    """
    ETL pipeline for the "only AirPods and iPhone" use case.

    Chains AirpodsAfteriPhoneExtractor -> OnlyAirpodsAndiPhoneTransformer ->
    OnlyAirpodsAndiPhoneLoader, then reads the result back from both the Delta
    path and the `default.onlyAirpodsAndiPhone` table for inspection.

    NOTE(review): presumably the output is the customers who bought only
    AirPods and an iPhone -- confirm against the transformer notebook.
    """
    def __init__(self):
        """Nothing to configure: the workflow keeps no instance state."""

    def runner(self):
        """
        Run extract, transform and load in that order, then print the stored data.

        Uses the notebook-level `spark` session and the extractor, transformer
        and loader classes, which are not defined in this cell
        (NOTE(review): presumably provided by the %run'd notebooks -- confirm).

        Returns:
            None. The result is written by the loader and shown on stdout.
        """
        # Step 1 - extract: the extractor of the AirPods-after-iPhone use case
        # is reused because the required dataframes are the same.
        extracted_dfs = AirpodsAfteriPhoneExtractor().extract()

        # Step 2 - transform: apply this use case's own transformer.
        transformer = OnlyAirpodsAndiPhoneTransformer()
        only_airpods_and_iphone_df = transformer.transform(inputDFs=extracted_dfs)

        # Step 3 - load: hand the transformed dataframe to the loader's sinks.
        loader = OnlyAirpodsAndiPhoneLoader(transformedDF=only_airpods_and_iphone_df)
        loader.sink()

        # Manual check 1: read the Delta files back from the data lake path.
        output_path = "dbfs:/FileStore/tables/apple_analysis/output/airpodsOnlyiPhone"
        datalake_df = spark.read.format("delta").load(output_path)
        print("Stored data from Datalake:-")
        datalake_df.show()

        # Manual check 2: read the same use case's output through the table.
        delta_table_df = spark.read.format("delta").table("default.onlyAirpodsAndiPhone")
        print("Stored data from Delta table:-")
        delta_table_df.show()

In [0]:
# In place of a scheduler like Apache Airflow, we define a class that calls each use case's runner
class WorkflowRunner():
    """
    Dispatch one of the notebook's workflows by its class name.

    Attributes:
        name: Name of the workflow to run. Supported values are
            "AirpodsAfteriPhoneWorkflow" and "OnlyAirpodsAndiPhoneWorkflow".
    """
    def __init__(self, name):
        """Remember which workflow was requested; nothing is run yet."""
        self.name = name

    def runner(self):
        """
        Instantiate the workflow matching `self.name` and call its runner().

        Returns:
            Whatever the selected workflow's runner() returns.

        Raises:
            ValueError: If `self.name` matches none of the supported workflows.
        """
        requested = self.name

        # Guard clauses: return as soon as a known workflow name matches.
        if requested == "AirpodsAfteriPhoneWorkflow":
            return AirpodsAfteriPhoneWorkflow().runner()
        if requested == "OnlyAirpodsAndiPhoneWorkflow":
            return OnlyAirpodsAndiPhoneWorkflow().runner()

        # Nothing matched, so the request cannot be served.
        raise ValueError(f"Not implemented for \'{self.name}\'")
        

# Choose the use case by its workflow class name and run it. WorkflowRunner
# raises ValueError for any name other than the two workflows it knows.
name = "OnlyAirpodsAndiPhoneWorkflow"
WorkflowRunner(name).runner()

Customers dataframe: -
+-----------+-------------+----------+--------+
|customer_id|customer_name| join_date|location|
+-----------+-------------+----------+--------+
|        105|          Eva|2022-01-01|    Ohio|
|        106|        Frank|2022-02-01|  Nevada|
|        107|        Grace|2022-03-01|Colorado|
|        108|        Henry|2022-04-01|    Utah|
+-----------+-------------+----------+--------+

Transactions dataframe before the tranform: -
+--------------+-----------+------------+----------------+
|transaction_id|customer_id|product_name|transaction_date|
+--------------+-----------+------------+----------------+
|            11|        105|      iPhone|      2022-02-01|
|            12|        106|      iPhone|      2022-02-02|
|            13|        107|     AirPods|      2022-02-03|
|            14|        105|     AirPods|      2022-02-04|
|            15|        108|      iPhone|      2022-02-05|
|            16|        106|     MacBook|      2022-02-06|
|            17

In [0]:
# Recursively delete the Delta output directory that OnlyAirpodsAndiPhoneWorkflow
# reads back from. NOTE(review): presumably cleanup so the workflow can be
# re-run; the default.onlyAirpodsAndiPhone table is not dropped here -- confirm
# that is intended.
dbutils.fs.rm("dbfs:/FileStore/tables/apple_analysis/output/airpodsOnlyiPhone", recurse=True)

