## Data Cleaning and Standardization

In [0]:
from pyspark.sql.functions import coalesce, lit, col
from pyspark.sql.types import *

In [0]:
# Explicit schema for the bronze complaints files: every listed source column
# is declared as a nullable string, in the order given below.
complaints_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "Date received",
        "Product",
        "Sub-product",
        "Issue",
        "Sub-issue",
        "Consumer complaint narrative",
        "Company public response",
        "Company",
        "State",
        "ZIP code",
        "Tags",
        "Consumer consent provided?",
        "Submitted via",
        "Date sent to company",
        "Company response to consumer",
        "Timely response?",
        "Consumer disputed?",
        "Complaint ID",
    )
])

In [0]:
# Open a streaming read over the bronze complaints folder using the
# "cloudFiles" source, parsing parquet files with the explicit
# complaints_schema instead of inferring one.
bronze_streaming_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .option(
        "cloudFiles.schemaLocation",
        "abfss://silver@rcmadls10dev.dfs.core.windows.net/checkpoint_processed_complaints",
    )
    .schema(complaints_schema)
    .load("abfss://bronze@rcmadls10dev.dfs.core.windows.net/complaints")
)

### Remove the _rescued_data, Tags, and Consumer disputed? Columns

In [0]:
# Discard the columns that are not carried forward into the processed data.
processed_df = bronze_streaming_df.drop(
    '_rescued_data',
    'Tags',
    'Consumer disputed?',
)

### Rename Columns

In [0]:
# Rename every remaining source column to a snake_case identifier
# (no spaces, hyphens or question marks) in a single pass.
processed_df = processed_df.withColumnsRenamed(
    {
        'Date received': 'date_received',
        'Product': 'product',
        'Sub-product': 'sub_product',
        'Issue': 'issue',
        'Sub-issue': 'sub_issue',
        'Consumer complaint narrative': 'consumer_complaint_narrative',
        'Company public response': 'company_public_response',
        'Company': 'company',
        'State': 'state',
        'ZIP code': 'zip_code',
        'Consumer consent provided?': 'consumer_consent_provided',
        'Submitted via': 'submitted_via',
        'Date sent to company': 'date_sent_to_company',
        'Company response to consumer': 'company_response_to_consumer',
        'Timely response?': 'timely_response',
        'Complaint ID': 'complaint_id',
    }
)

### Adding load_date Column

In [0]:
# Record, for each row, the modification time of the file it was read from
# (taken from the source's _metadata column) as load_date.
processed_df = processed_df.withColumn(
    'load_date',
    col('_metadata.file_modification_time'),
)

### Standardize Column Types

In [0]:
# Register the renamed stream as a temporary view so it can be queried
# with Spark SQL under the name 'processed_table'.
processed_df.createOrReplaceTempView(
    'processed_table'
)

In [0]:
# Cast the string columns of processed_table to their target types:
# dates to DATE, timely_response to BOOLEAN, complaint_id to INT and
# load_date to TIMESTAMP; all other columns pass through unchanged.
# Each CAST carries an explicit alias so the output column names are stated
# in the query itself rather than left to Spark's automatic naming of
# unaliased expressions.
standardized_df = spark.sql("""
                        SELECT
                            CAST(date_received AS DATE) AS date_received,
                            product,
                            sub_product,
                            issue,
                            sub_issue,
                            consumer_complaint_narrative,
                            company_public_response,
                            company,
                            state,
                            zip_code,
                            consumer_consent_provided,
                            submitted_via,
                            CAST(date_sent_to_company AS DATE) AS date_sent_to_company,
                            company_response_to_consumer,
                            CAST(timely_response AS BOOLEAN) AS timely_response,
                            CAST(complaint_id AS INT) AS complaint_id,
                            CAST(load_date AS TIMESTAMP) AS load_date
                        FROM
                            processed_table
                    """)

### Filter out NULL values from State column

In [0]:
# Keep only the rows that have a state value.
standardized_df = standardized_df.where(
    col('state').isNotNull()
)

### Replace NULL values in Company Public Response with "Company has not provided public response"

In [0]:
# Substitute a fixed placeholder text wherever company_public_response is NULL.
standardized_df = standardized_df.withColumn(
    'company_public_response',
    coalesce(
        col('company_public_response'),
        lit('Company has not provided public response'),
    ),
)

### Replace NULL values in Sub-Issue with value from Issue Column

In [0]:
# Where sub_issue is NULL, fall back to the value of issue from the same row.
standardized_df = standardized_df.withColumn(
    'sub_issue',
    coalesce(col('sub_issue'), col('issue')),
)

### Replace NULL values in Sub-Product with value from Product Column

In [0]:
# Where sub_product is NULL, fall back to the value of product from the same row.
standardized_df = standardized_df.withColumn(
    'sub_product',
    coalesce(col('sub_product'), col('product')),
)

In [0]:
# Append the cleaned stream to the silver Delta table at the given path,
# tracking progress in the checkpoint location.
# availableNow=True processes all data available when the query starts and
# then stops; it replaces the deprecated once=True trigger.
(
    standardized_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "abfss://silver@rcmadls10dev.dfs.core.windows.net/checkpoint_processed_complaints")
    .option("path", "abfss://silver@rcmadls10dev.dfs.core.windows.net/processed_complaints")
    .trigger(availableNow=True)
    .start()
)