In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

conf = SparkConf() \
    .setAppName('SparkApp') \
    .setMaster('spark://spark:7077') \
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .set("spark.sql.shuffle.partitions", "10")
 

sc = SparkContext.getOrCreate(conf=conf)

from pyspark.sql import SQLContext
# Créer un SQLContext pour les opérations SQL
sql_context = SQLContext(sc)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-72402a8a-d68f-4f5d-a485-c8b0a6535eee;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.3 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.3 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, FloatType

# Kafka configuration
kafka_broker = "kafka1:9092"
kafka_topic = "opensky"

# Define schema for the Kafka message
schema = StructType([
    StructField("time_position", StringType(), True),
    StructField("icao24", StringType(), True),
    StructField("velocity", FloatType(), True)
])

# Read raw data from Kafka
raw_stream = sql_context.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "latest") \
    .load()

# Parse Kafka messages
parsed_stream = raw_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema).alias("data")) \
    .select(
        col("data.time_position").cast(TimestampType()).alias("time_position"),  # Convert timestamp to Spark TimestampType
        col("data.icao24").alias("icao24"),
        col("data.velocity").cast(FloatType()).alias("velocity")  # Convert voltage to FloatType for aggregation
    ).dropDuplicates(["icao24"]).withWatermark("time_position", "5 minutes")

parsed_stream = parsed_stream.filter(col("data.velocity").isNotNull())

parsed_stream.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()



# Compute rolling average over a 5-minute window
rolling_average = parsed_stream \
    .groupBy(window(col("time_position"), "20 minutes")) \
    .agg(avg("velocity").alias("avg_velocity")) \
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("avg_velocity")
    )  # Explicit ordering by window start

# Output rolling average to the console
query = rolling_average.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, avg, max, min, count, stddev, window, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, BooleanType, TimestampType

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("FlightDataStreamingAnalysis") \
    .master("local[*]") \
    .getOrCreate()

# Define schema for streaming data
schema = StructType([
    StructField("icao24", StringType(), True),
    StructField("callsign", StringType(), True),
    StructField("origin_country", StringType(), True),
    StructField("time_position", IntegerType(), True),  # Will be converted to TimestampType
    StructField("last_contact", IntegerType(), True),
    StructField("longitude", FloatType(), True),
    StructField("latitude", FloatType(), True),
    StructField("baro_altitude", FloatType(), True),
    StructField("on_ground", BooleanType(), True),
    StructField("velocity", FloatType(), True),
    StructField("true_track", FloatType(), True),
    StructField("vertical_rate", FloatType(), True),
    StructField("geo_altitude", FloatType(), True),
    StructField("position_source", IntegerType(), True),
    StructField("category", IntegerType(), True)
])

# Read data from Kafka topic
kafka_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("subscribe", "opensky") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON data from Kafka
parsed_stream = kafka_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema).alias("data")) \
    .select("data.*") \
    .withColumn("time_position", to_timestamp(col("time_position").cast("long")))  # 🔥 Fixed conversion

# Filter for aircraft in flight
in_flight_stream = parsed_stream.filter(col("on_ground") == False)

# Compute velocity insights over a 10-minute window
velocity_stats = in_flight_stream \
    .withWatermark("time_position", "10 minutes") \
    .groupBy(window(col("time_position"), "10 minutes")) \
    .agg(
        avg("velocity").alias("avg_velocity"),
        max("velocity").alias("max_velocity"),
        min("velocity").alias("min_velocity"),
        stddev("velocity").alias("std_dev_velocity")
    )

# Output Stream
velocity_stats.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()

spark.streams.awaitAnyTermination()


25/03/18 14:34:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/03/18 14:34:52 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-061d60df-6825-4735-ae4d-4d7be4aceec9. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/18 14:34:52 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/18 14:34:53 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+------------+------------+------------+----------------+
|window|avg_velocity|max_velocity|min_velocity|std_dev_velocity|
+------+------------+------------+------------+----------------+
+------+------------+------------+------------+----------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+------------------+------------+------------+------------------+
|window                                    |avg_velocity      |max_velocity|min_velocity|std_dev_velocity  |
+------------------------------------------+------------------+------------+------------+------------------+
|{2025-03-18 12:30:00, 2025-03-18 12:40:00}|128.25143767525546|324.18      |0.0         |89.4542676661795  |
|{2025-03-18 12:40:00, 2025-03-18 12:50:00}|167.30259182744436|359.85      |0.0         |76.02537937617127 |
|{2025-03-18 12:10:00, 2025-03-18 12:20:00}|108.89285564422607|239.65      |4.12        |91.86587168308918 |
|{2025-03-18 11:20:00, 2025-03-18 11:30:00}|247.67499542236328|263.93      |231.42      |18.769654079874986|
|{2025-03-18 12:20:00, 2025-03-18 12:30:00}|227.75866425832112|295.22      |12.87       |65.86121602031571 |
|{2025-03-18 13:00:00, 2025-03-

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 