In [0]:
# Batch-read the raw customers CSV files: the first row is treated as the
# header and column types are inferred from the data.
df = (
    spark.read.format('csv')
    .options(header=True, inferSchema=True)
    .load('/Volumes/pyspark_dbt/source/raw_data/customers/')
)

In [0]:
# Preview the first 10 customer rows as a rendered table.
display(df.limit(10))

In [0]:
# Show the schema Spark inferred for the customers DataFrame.
df.schema

In [0]:
# Keep the inferred schema in its own variable so it can be handed to the
# streaming reader's .schema(...) call, then echo it as the cell output.
schema_customer = df.schema
schema_customer

### SPARK STREAMING

##### Static ingestion: one hard-coded stream per table

In [0]:
# Static approach: the source path, schema, checkpoint and target table are all
# hard-coded, so this cell would have to be copied for every other table/folder.

# Streaming read of the customers CSV folder. A dedicated name is used so the
# batch DataFrame `df` from the earlier cells is not overwritten and those
# cells stay re-runnable.
customers_stream = (
    spark.readStream.format('csv')
    .option('header', True)
    # File-based streaming sources need an explicit schema by default.
    .schema(schema_customer)
    .load('/Volumes/pyspark_dbt/source/raw_data/customers/')
)

# Append the new files to the bronze Delta table. The checkpoint records which
# input files were already processed, so re-runs only pick up new files.
customers_query = (
    customers_stream.writeStream.format('delta')
    .outputMode('append')
    .option('checkpointLocation', '/Volumes/pyspark_dbt/bronze/checkpoints/customers')
    # availableNow=True replaces the deprecated once=True trigger: it processes
    # everything currently available and then stops.
    .trigger(availableNow=True)
    .toTable('pyspark_dbt.bronze.customers')
)

# toTable() only starts the query. Block until it finishes so that failures
# surface in this cell and no other query (e.g. the dynamic-ingestion loop,
# which also writes 'customers') reuses this checkpoint while it is running.
customers_query.awaitTermination()

##### Dynamic ingestion

In [0]:
# Incremental load: each entity folder is ingested with Structured Streaming.
# The per-entity checkpoint tracks which CSV files were already processed, so
# every run only appends files that arrived since the previous run.

data_source = ['customers', 'trips', 'payments', 'locations', 'vehicles', 'drivers']

# (entity name, StreamingQuery) pairs for every query started below.
started_queries = []

for data_name in data_source:
    source_path = f'/Volumes/pyspark_dbt/source/raw_data/{data_name}/'

    # Batch read used only to infer the schema, because the streaming reader
    # is given an explicit schema.
    # NOTE(review): the schema is re-inferred from the files present on every
    # run; if inference changes as new files arrive it may no longer match the
    # existing bronze table - consider persisting explicit schemas.
    df_batch = (
        spark.read.format('csv')
        .option('header', True)
        .option('inferSchema', True)
        .load(source_path)
    )
    schema_entity = df_batch.schema

    df_stream = (
        spark.readStream.format('csv')
        .option('header', True)
        .schema(schema_entity)
        .load(source_path)
    )

    query = (
        df_stream.writeStream.format('delta')
        .outputMode('append')
        .option('checkpointLocation', f'/Volumes/pyspark_dbt/bronze/checkpoints/{data_name}')
        # availableNow=True replaces the deprecated once=True trigger: process
        # all data available now, then stop.
        .trigger(availableNow=True)
        .toTable(f'pyspark_dbt.bronze.{data_name}')
    )
    started_queries.append((data_name, query))

# toTable() returns as soon as the query has started. Wait for every query so
# the cell only completes once ingestion is done and any streaming failure is
# raised here instead of being lost in the background.
for data_name, query in started_queries:
    query.awaitTermination()