In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, LongType
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructField, StructType, LongType
from pyspark.sql.functions import from_json

In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Streaming from Kafka")
    .master("local[*]")  # run locally, using every available core
    # NOTE(review): this setting is documented for the legacy DStream API;
    # confirm it has any effect on Structured Streaming queries.
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    # Kafka source connector; its Scala/Spark versions must match the
    # installed Spark build -- TODO confirm against the cluster image.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    # Small shuffle partition count for the groupBy done later on.
    .config("spark.sql.shuffle.partitions", 4)
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [3]:
# Build the streaming DataFrame that reads from the Kafka topic "devices".
streaming_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "devices")
    # Begin at the earliest offsets available when the query first starts.
    .option("startingOffsets", "earliest")
    # Do not fail the query when offsets are reported as lost.
    .option("failOnDataLoss", "false")
    .load()
)

In [4]:
# Schema of the JSON payload carried in each Kafka message: four nullable
# string fields. "data" stays a string here and is cast to a timestamp in a
# later cell.
json_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("placa", "tipo", "cor", "data")
    ]
)

In [5]:
# The Kafka "value" column arrives as binary; keep only it, cast to a string.
json_df = streaming_df.selectExpr("cast(value as string) as value")

# Parse the JSON string with the declared schema, then promote the fields of
# the resulting struct to top-level columns.
json_expanded_df = (
    json_df
    .withColumn("value", from_json(col("value"), json_schema))
    .select("value.*")
)


In [7]:
# Project the parsed fields in a fixed order, converting "data" from its
# string form to a timestamp.
flattened_df = json_expanded_df.select(
    col("placa"),
    col("tipo"),
    col("data").cast("timestamp").alias("data"),
    col("cor"),
)

In [8]:
# Count the cars for each (tipo, cor) pair. The count column is exposed as
# "quantidade" and explicitly cast to a long.
agg_df = (
    flattened_df
    .groupBy("tipo", "cor")
    .count()
    .select(
        "tipo",
        "cor",
        col("count").cast(LongType()).alias("quantidade"),
    )
)

In [None]:
# Stream the aggregated counts to the console sink. The query uses "complete"
# output mode and keeps its checkpoint in the relative directory
# "checkpoint_dir".
writing_df = (
    agg_df.writeStream
    .format("console")
    .option("checkpointLocation", "checkpoint_dir")
    .outputMode("complete")
    .start()
)

try:
    # Block this cell until the query terminates, fails, or the kernel is
    # interrupted.
    writing_df.awaitTermination()
finally:
    # Always stop the query: without this, interrupting the cell leaves the
    # streaming query running in the background of the Spark session.
    writing_df.stop()