Following the factory method pattern, we create a reader factory to read data from different data sources.

In [0]:
# Classes for different data sources
class DataSource:
    """Base class for the data sources; stores the location to read from.

    Subclasses are expected to override ``get_dataframe``.
    """

    def __init__(self, path):
        """Keep ``path`` on the instance so subclasses can load from it."""
        self.path = path

    def get_dataframe(self):
        """Placeholder that each subclass must override.

        Raises:
            ValueError: always, since the base class has no implementation.
        """
        message = 'Abstract Method "get_dataframe" has not been implemented'
        raise ValueError(message)

class CSVDataSource(DataSource):
    """Data source backed by a CSV file (per the original note, stored in DBFS)."""

    def get_dataframe(self):
        """Return the result of loading ``self.path`` with the Spark CSV reader.

        Relies on a ``spark`` session object being available in the
        surrounding scope.
        """
        csv_reader = spark.read.format("csv")
        # Treat the first row of the CSV file as the header.
        csv_reader = csv_reader.option("header", True)
        return csv_reader.load(self.path)

class ParquetDataSource(DataSource):
    """Data source backed by Parquet data located at ``self.path``."""

    def get_dataframe(self):
        """Return the result of loading ``self.path`` with the Spark Parquet reader.

        Relies on a ``spark`` session object being available in the
        surrounding scope.
        """
        parquet_reader = spark.read.format("parquet")
        return parquet_reader.load(self.path)

class DeltaDataSource(DataSource):
    """Data source that reads a table by name rather than a file path.

    For this class ``self.path`` holds the table name (per the original note,
    a table in the Databricks database created earlier from the CSV uploaded
    to DBFS).
    """

    def get_dataframe(self):
        """Return the result of reading the table named by ``self.path``.

        Relies on a ``spark`` session object being available in the
        surrounding scope.
        """
        return spark.read.table(self.path)

In [0]:
# Function to get the required class
def get_data_source(data_type, file_path):
    """Build the data source object that matches ``data_type``.

    Args:
        data_type: One of "csv", "parquet" or "delta".
        file_path: Location handed to the chosen class; the "delta" source
            uses it as a table name.

    Returns:
        A ``CSVDataSource``, ``ParquetDataSource`` or ``DeltaDataSource``
        constructed with ``file_path``.

    Raises:
        ValueError: if ``data_type`` is none of the supported values.
    """
    # Guard clauses: return as soon as a supported type is recognised.
    if data_type == "csv":
        return CSVDataSource(file_path)
    if data_type == "parquet":
        return ParquetDataSource(file_path)
    if data_type == "delta":
        return DeltaDataSource(file_path)

    raise ValueError(f"Data Source not implemented for data type : {data_type}")
    