In [31]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import from_json, col, window, count
import tempfile

In [2]:
# Create the Spark Session
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Streaming from Kafka") \
    .config("spark.streaming.stopGracefullyOnShutdown", True) \
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1') \
    .config("spark.sql.shuffle.partitions", 4) \
    .master("local[*]") \
    .getOrCreate()

spark

:: loading settings :: url = jar:file:/Users/rihab.ghrab/opt/anaconda3/bin/spark/spark-3.5.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/rihab.ghrab/.ivy2/cache
The jars for the packages stored in: /Users/rihab.ghrab/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-18c3d6aa-df56-4723-b882-59b1f75c5a23;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 430ms :: artifacts

In [39]:
# Create the streaming_df to read from kafka
streaming_df = spark.readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "bikes") \
    .option("startingOffsets", "earliest") \
    .load()

In [40]:
streaming_df

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [41]:
schema = StructType([StructField("$id", IntegerType()),
                    StructField("$type", StringType()),
                    StructField("forecastType", StringType()),
                    StructField("forecastID", StringType()),
                    StructField("forecastBand", StringType()),
                    StructField("forecastSummary", StringType()),
                    StructField("nO2Band", StringType()),
                    StructField("o3Band", StringType()),
                    StructField("pM10Band", StringType()),
                    StructField("pM25Band", StringType()),
                    StructField("sO2Band", StringType()),
                    StructField("forecastText", StringType())])

In [42]:
df_parsed = streaming_df.select(from_json(col("value").cast("string"), schema).alias("air_quality_data"), col("timestamp"))

# Extract the fields from the parsed JSON data
df_processed = df_parsed.select("air_quality_data.*", "timestamp") 


In [47]:
window_duration = '1 minutes'
# Group the tweets by language and timestamp window and count the tweets
df_counts = df_processed \
    .withWatermark("timestamp", window_duration) \
    .groupBy(window("timestamp", window_duration), "forecastID") \
    .agg(count("*").alias("record_count"))

#query_df = df_counts.writeStream \
#    .outputMode("complete") \
#    .format("console") \
#    .option("truncate", False) \
#    .start()

query = df_counts.writeStream\
  .trigger(processingTime="1 minutes")\
  .option("checkpointLocation", "checkpoint/")\
  .option("path", ".")\
  .outputMode("append")\
  .start()\
  .awaitTermination()
#df_counts.writeStream.toTable("my_table", checkpointLocation='/Users/rihab.ghrab/Infor-Ideas/Spark')


24/07/16 10:26:27 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/07/16 10:26:27 WARN StreamingQueryManager: Stopping existing streaming query [id=8edaf047-2103-4d46-8b6f-f3786557fe12, runId=6af5f20e-1a01-424b-9995-5e50d2427597], as a new run is being started.
24/07/16 10:26:27 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/07/16 10:26:28 WARN HDFSBackedStateStoreProvider: The state for version 4 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/07/16 10:26:28 WARN HDFSBackedStateStoreProvider: The state for version 4 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/07/16 10:26:28 WAR

KeyboardInterrupt: 

In [50]:
ds = spark.read.parquet('part-00000-580b3221-ce6c-42b1-8469-4e96b0de5f7a-c000.snappy.parquet')

In [51]:
ds.show()

+--------------------+----------+------------+
|              window|forecastID|record_count|
+--------------------+----------+------------+
|{2024-07-16 10:26...|      NULL|          25|
+--------------------+----------+------------+

