#Paths and Modules to Include
##Purpose:
- This notebook contains the paths to the directories holding the original CSVs on the DBFS, as well as the directories that will hold the parquet files after ingestion and transformation.
- Also included:
  - A class csv_reader that loads each CSV file into a dataframe and is general enough to handle both plan and rate files.  This was necessary because the schemas change between years and one year's headers are not quoted, which made simple ingestion from a directory difficult.
  - A class SchemaDefiner that allows me to define a schema in Python as I would a dataclass, rather than using StructType[StructField].  This gives a simple API for defining a schema that resembles how a schema is defined in Scala for use with Datasets.

In [0]:
import os
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StructType, StructField

# raw_path = spark.sql(f'DESCRIBE EXTERNAL LOCATION landing').select('url').collect()[0].url
# raw_rates_path = os.path.join(raw_path, 'raw_rates')
# raw_plans_path = os.path.join(raw_path, 'raw_plans')

# processed_path = spark.sql(f'DESCRIBE EXTERNAL LOCATION bronze').select('url').collect()[0].url
# processed_rates_path = os.path.join(processed_path, "rates")
# processed_plans_path = os.path.join(processed_path, "plans")

In [0]:
class csv_reader:
    """Load a plan or rate CSV into a DataFrame with a fixed, validated schema.

    Every step method updates ``self.df`` and returns ``self`` so the steps
    can be chained; ``csv_to_df`` runs all of them in order.

    Attributes:
        cols_dict: Mapping passed to ``DataFrame.withColumnsRenamed`` (name in
            the CSV -> name wanted in the output).  Only the columns named by
            its values are kept.
        schema: The expected schema, taken from the ``schema`` attribute of
            the object given to the constructor.
        df: The DataFrame being built; ``None`` until ``load_csv`` has run.
    """

    def __init__(self, cols_dict, schema):
        """Store the column mapping and unwrap the expected schema.

        Args:
            cols_dict: Mapping of CSV column names to output column names.
            schema: An object exposing the expected schema as ``.schema``.
        """
        self.cols_dict = cols_dict
        self.schema = schema.schema
        self.df = None

    def load_csv(self, pth):
        """Read the CSV at ``pth`` with a header row and inferred types.

        Relies on a ``spark`` session being available as a global; it is not
        defined in this file.
        """
        self.df = (
            spark.read.option("header", True)
            .option("inferSchema", True)
            .csv(pth)
        )
        return self

    def check_year_type(self):
        """Make sure the ``BusinessYear`` column is an integer column.

        When the column was not read as ``int`` it is cast to ``IntegerType``
        and the rows whose cast value is null are dropped.
        """
        if dict(self.df.dtypes)["BusinessYear"] != "int":
            year_as_int = F.col("BusinessYear").cast(IntegerType())
            self.df = self.df.withColumn("BusinessYear", year_as_int).filter(
                F.col("BusinessYear").isNotNull()
            )
        return self

    def rename_cols(self):
        """Rename the columns according to ``cols_dict``."""
        self.df = self.df.withColumnsRenamed(self.cols_dict)
        return self

    def select_cols(self):
        """Keep only the columns named by the values of ``cols_dict``."""
        self.df = self.df.select(*self.cols_dict.values())
        return self

    def _schema_mismatch_details(self):
        """Summarise how ``self.df.schema`` differs from ``self.schema``."""
        expected_fields = getattr(self.schema, "fields", None)
        actual_fields = getattr(self.df.schema, "fields", None)
        if expected_fields is None or actual_fields is None:
            # Not field-based schemas; fall back to showing both objects.
            return f"expected {self.schema!r}, got {self.df.schema!r}"

        expected = {field.name: field for field in expected_fields}
        actual = {field.name: field for field in actual_fields}

        details = []
        missing = [name for name in expected if name not in actual]
        if missing:
            details.append(f"missing columns: {missing}")
        unexpected = [name for name in actual if name not in expected]
        if unexpected:
            details.append(f"unexpected columns: {unexpected}")
        changed = [
            f"{name}: expected {expected[name]!r}, got {actual[name]!r}"
            for name in expected
            if name in actual and expected[name] != actual[name]
        ]
        if changed:
            details.append("mismatched fields: " + "; ".join(changed))
        if not details:
            # Same fields by name and definition, yet the schemas are unequal.
            details.append("fields match by name; check the column order")
        return "; ".join(details)

    def validate_schema(self):
        """Check that the DataFrame's schema equals the expected schema.

        Raises:
            ValueError: If the schemas differ.  The message starts with the
                historical text and then lists the differences found.
        """
        if self.schema != self.df.schema:
            raise ValueError(
                "Dataframe Schema different from what was expected. "
                + self._schema_mismatch_details()
            )
        return self

    def csv_to_df(self, pth):
        """Run the full pipeline on the CSV at ``pth`` and return the result.

        Steps, in order: load, fix the ``BusinessYear`` type, rename, select,
        validate.  The year check runs before the rename, so it looks for the
        column under the name ``BusinessYear`` as read from the file.

        Raises:
            ValueError: If the resulting schema is not the expected one.
        """
        (
            self.load_csv(pth)
            .check_year_type()
            .rename_cols()
            .select_cols()
            .validate_schema()
        )
        return self.df

class SchemaDefiner:
    """Hold a PySpark schema built from a class's type annotations.

    Lets a schema be written like a dataclass instead of a hand-assembled
    ``StructType([StructField(...), ...])``::

        class Rates:
            BusinessYear: IntegerType
            Rate: DecimalType(10, 2)

        rates_schema = SchemaDefiner.get_pyspark_schema(Rates)

    Attributes:
        schema: The wrapped schema object.
    """

    def __init__(self, schema):
        """Wrap an already-built schema."""
        self.schema = schema

    @classmethod
    def get_pyspark_schema(cls, data_class_obj):
        """Build a ``SchemaDefiner`` from ``data_class_obj.__annotations__``.

        Each annotation becomes one ``StructField`` named after the attribute,
        in annotation order.  An annotation that is a class is instantiated
        with no arguments; anything else (for example an already-constructed
        ``DecimalType(10, 2)``) is used as given, so parameterised types can
        be declared too.

        Args:
            data_class_obj: Object whose ``__annotations__`` map column names
                to PySpark data types.

        Returns:
            A new instance wrapping the resulting ``StructType``.
        """
        struct_fields = [
            StructField(col, col_type() if isinstance(col_type, type) else col_type)
            for col, col_type in data_class_obj.__annotations__.items()
        ]
        return cls(StructType(struct_fields))

    def __call__(self):
        """Return this instance unchanged."""
        return self
        

In [0]:
# Marker printed when the notebook has run through to its last cell.
print("End")

End
