In [0]:
# Declare the "load_date" text widget with an empty default value; a later cell
# reads it back with dbutils.widgets.get("load_date").
dbutils.widgets.text("load_date", "")

In [0]:
# Read the "load_date" widget value. It is interpolated into both the bronze
# input path and the silver output path further down, so stray whitespace is
# trimmed and an empty value is rejected up front: the widget default is "",
# which would otherwise produce paths such as ".../bronze//fact_credit_check.csv".
load_date = dbutils.widgets.get("load_date").strip()
if not load_date:
    raise ValueError(
        "The 'load_date' widget is empty; set it to the load date folder to process "
        "before running this notebook."
    )

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Explicit schema for the credit-check CSV. Field order matters because the file
# is read without a header row, so columns are matched by position. Every field
# is declared nullable.
schema = (
    StructType()
    # Request metadata
    .add("request_datetime", TimestampType(), True)
    .add("request_channel", StringType(), True)
    .add("creditcheckid", IntegerType(), True)
    # Customer attributes
    .add("customer_id", StringType(), True)
    .add("customer_name", StringType(), True)
    .add("phone_number", StringType(), True)
    .add("email", StringType(), True)
    .add("dob", DateType(), True)
    .add("government_id", StringType(), True)
    .add("customer_address", StringType(), True)
    .add("customer_region_code", StringType(), True)
    # Device attributes
    .add("device_code", StringType(), True)
    .add("device_name", StringType(), True)
    .add("brand", StringType(), True)
    .add("device_price", DoubleType(), True)
    # Plan attributes
    .add("plan_code", StringType(), True)
    .add("plan_name", StringType(), True)
    .add("monthly_fee", DoubleType(), True)
    .add("contract_months", IntegerType(), True)
    # Decision outcome
    .add("approval_amount", DoubleType(), True)
    .add("decision_status", StringType(), True)
    .add("rejection_reason", StringType(), True)
    .add("last_updated_ts", TimestampType(), True)
)

In [0]:
# Bronze-layer CSV for the requested load date. Header parsing is disabled, so
# the first line is read as data and column names/types come from `schema`.
bronze_csv_path = f'abfss://creditcheck@azurestorageaccnt.dfs.core.windows.net/bronze/{load_date}/fact_credit_check.csv'

credcheck_df = (
    spark.read
    .schema(schema)
    .format("csv")
    .option("header", False)
    .load(bronze_csv_path)
)

In [0]:
# Render the raw bronze DataFrame as the cell output.
# NOTE(review): the schema includes customer_name, phone_number, email, dob and
# government_id, so this output presumably contains personal data; confirm
# outputs are cleared before the notebook is shared or committed.
display(credcheck_df)

In [0]:
# Replace NULL rejection_reason values with the literal "NA"; no other column is touched.
nullfilled_df = credcheck_df.na.fill({"rejection_reason": "NA"})

In [0]:
# Keep a single row per creditcheckid.
# NOTE(review): which of several duplicate rows survives is not controlled here.
# If the most recent record (by last_updated_ts) is the one wanted, this needs
# an explicit ordering; confirm the intended rule.
dedup_df = nullfilled_df.drop_duplicates(subset=['creditcheckid'])

In [0]:
# Row count after de-duplication on creditcheckid; as the cell's last expression
# the number is shown as the cell output.
dedup_df.count()

In [0]:
# Price boundaries for the device segments (same thresholds as before, now named).
BUDGET_MAX_PRICE = 20000     # prices strictly below this are 'Budget'
MIDRANGE_MAX_PRICE = 50000   # prices from BUDGET_MAX_PRICE up to (not including) this are 'Midrange'

price_col = dedup_df["device_price"]

# Add a device_segment column derived from device_price.
# Branches are evaluated in order, so the 'Midrange' branch only sees prices that
# already failed the 'Budget' test. 'Premium' has its own explicit condition and
# there is deliberately no .otherwise(): a NULL device_price fails every
# comparison and gets a NULL segment instead of being swept into 'Premium' by a
# catch-all branch.
deviceseg_df = dedup_df.withColumn(
    'device_segment',
    when(price_col < BUDGET_MAX_PRICE, 'Budget')
    .when(price_col < MIDRANGE_MAX_PRICE, 'Midrange')
    .when(price_col >= MIDRANGE_MAX_PRICE, 'Premium'),
)

In [0]:
# Number of credit-check rows per device segment.
# The aggregation is bound to agg_df first and displayed in a separate step:
# .display() only renders and returns None, so chaining it onto the assignment
# left agg_df set to None instead of the aggregated DataFrame.
agg_df = deviceseg_df.groupBy('device_segment').agg(count('*').alias('total'))
agg_df.display()

Databricks visualization. Run in Databricks to view.

In [0]:
# Persist the segmented data as Parquet under the silver container for this load
# date and register it as credcheck_catalog.silver.fact_credit_check.
# Overwrite mode replaces whatever the table previously held.
silver_table_path = f'abfss://silver@azurestorageaccnt.dfs.core.windows.net/{load_date}/creditcheck'

(
    deviceseg_df.write
    .format('parquet')
    .mode("overwrite")
    .option('path', silver_table_path)
    .saveAsTable('credcheck_catalog.silver.fact_credit_check')
)

