# Cria um autoloader para tempo de voltas

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Destination table that the streaming write appends to.
table_name = "f1_bronze.lap_times"

# Directory that the streaming reader loads source files from.
source_path = "dbfs:/Volumes/workspace/default/arquivos/"

# Checkpoint directory for the stream; the reader also uses it as its
# cloudFiles schema location.
# NOTE(review): an earlier revision pointed at ".../checkpoints/lap_times";
# the "2" suffix presumably forces a fresh checkpoint — confirm before reverting.
checkpoint_path = "/Volumes/workspace/default/checkpoints/lap_times2"

In [0]:
# Explicit schema handed to the streaming reader; every column is nullable.
# The trailing bronze_ingestion column is overwritten with current_timestamp()
# in the streaming-read cell, so its value does not come from the files.
schema = (
    StructType()
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), True)
    .add("bronze_ingestion", TimestampType(), True)
)

In [0]:
# Ensure the volume that stores the streaming checkpoints exists.
# IF NOT EXISTS makes this statement a no-op when the volume is already there,
# so the cell can be re-run safely.
spark.sql("""
CREATE VOLUME IF NOT EXISTS workspace.default.checkpoints
COMMENT 'Volume for checkpoint storage'
""")

In [0]:
# Read stream: Auto Loader ("cloudFiles") loads CSV files from source_path
# using the explicit schema defined earlier in the notebook.
# NOTE(review): no "header" option is set, so the CSV reader default applies —
# confirm the source files do not start with a header row.
df_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", checkpoint_path)
    .schema(schema)
    .load(source_path)
    # Stamp every row with the ingestion time; this replaces the column of the
    # same name declared in the read schema.
    .withColumn("bronze_ingestion", current_timestamp())
)

# Write stream: append the rows to the Delta table, tracking progress in
# checkpoint_path.
# "cloudFiles.*" settings are Auto Loader source options and do nothing on a
# Delta sink, so the "cloudFiles.useNotifications" option that used to be set on
# this writer was removed.
# NOTE(review): if file-notification mode is really wanted, set that option on
# the reader above instead (it needs the matching cloud permissions).
query = (
    df_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .option("mergeSchema", "true")
    # Process everything that is available right now, then stop.
    .trigger(availableNow=True)
    # toTable is the documented DataStreamWriter method for starting the query
    # against a named table.
    .toTable(table_name)
)
# Block the cell until the availableNow run has finished.
query.awaitTermination()

In [0]:
%sql
-- Sanity check: number of rows currently in the bronze lap-times table.
-- A global COUNT(*) always returns exactly one row, so no LIMIT is needed.
SELECT COUNT(*) FROM f1_bronze.lap_times