In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

conf = SparkConf() \
    .setAppName('SparkApp') \
    .setMaster('spark://spark:7077') \
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .set("spark.sql.shuffle.partitions", "10")
 

sc = SparkContext.getOrCreate(conf=conf)

from pyspark.sql import SQLContext
# Créer un SQLContext pour les opérations SQL
sql_context = SQLContext(sc)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-44fa7850-f157-4082-b101-687baf3a52ab;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.3 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.3 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, avg, min, max, count, lag
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, BooleanType, TimestampType

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("FlightCrashDetection") \
    .master("local[*]") \
    .getOrCreate()

# Define schema for streaming data
schema = StructType([
    StructField("icao24", StringType(), True),
    StructField("callsign", StringType(), True),
    StructField("origin_country", StringType(), True),
    StructField("time_position", IntegerType(), True),  # Converted to Timestamp
    StructField("last_contact", IntegerType(), True),
    StructField("longitude", FloatType(), True),
    StructField("latitude", FloatType(), True),
    StructField("baro_altitude", FloatType(), True),
    StructField("on_ground", BooleanType(), True),
    StructField("velocity", FloatType(), True),
    StructField("true_track", FloatType(), True),
    StructField("vertical_rate", FloatType(), True),
    StructField("geo_altitude", FloatType(), True),
    StructField("position_source", IntegerType(), True),
    StructField("category", IntegerType(), True)
])

# Read data from Kafka topic
kafka_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092") \
    .option("subscribe", "opensky") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON data from Kafka
parsed_stream = kafka_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema).alias("data")) \
    .select("data.*") \
    .withColumn("time_position", col("time_position").cast("timestamp"))

# Add a window for detecting altitude drops
windowed_stream = parsed_stream.withWatermark("time_position", "5 minutes") \
    .groupBy(window(col("time_position"), "1 minute"), col("icao24")) \
    .agg(
        max("geo_altitude").alias("max_altitude"),
        min("geo_altitude").alias("min_altitude")
    )

# Identify potential crashes based on sharp altitude drops
potential_crashes = windowed_stream.filter(
    (col("max_altitude") - col("min_altitude")) >= 500  # Detecting sudden drops of 1000+ meters
)

# Output stream
potential_crashes.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()

spark.streams.awaitAnyTermination()


25/03/24 15:12:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/03/24 15:12:19 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3eb5d79a-4a81-46a5-be8a-8fa3c424c7ca. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/24 15:12:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/24 15:12:20 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+------+------------+------------+
|window|icao24|max_altitude|min_altitude|
+------+------+------------+------------+
+------+------+------------+------------+



25/03/24 15:12:34 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 14126 milliseconds
                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+------+------------+------------+
|window                                    |icao24|max_altitude|min_altitude|
+------------------------------------------+------+------------+------------+
|{2025-03-24 14:56:00, 2025-03-24 14:57:00}|899097|11163.3     |9174.48     |
|{2025-03-24 14:55:00, 2025-03-24 14:56:00}|a9a465|9281.16     |7711.44     |
|{2025-03-24 14:56:00, 2025-03-24 14:57:00}|a4592a|2209.8      |1592.58     |
+------------------------------------------+------+------------+------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+------+------------+------------+
|window|icao24|max_altitude|min_altitude|
+------+------+------------+------------+
+------+------+------------+------------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------+------+------------+------------+
|window|icao24|max_altitude|min_altitude|
+------+------+------------+------------+
+------+------+------------+------------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------+------+------------+------------+
|window|icao24|max_altitude|min_altitude|
+------+------+------------+------------+
+------+------+------------+------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+------+------------+------------+
|window                                    |icao24|max_altitude|min_altitude|
+------------------------------------------+------+------------+------------+
|{2025-03-24 15:12:00, 2025-03-24 15:13:00}|49d508|2225.04     |1668.78     |
|{2025-03-24 15:11:00, 2025-03-24 15:12:00}|a7f804|13997.94    |10302.24    |
|{2025-03-24 15:12:00, 2025-03-24 15:13:00}|a0b821|2926.08     |1234.44     |
+------------------------------------------+------+------------+------------+



25/03/25 00:55:41 ERROR TaskSchedulerImpl: Lost executor 0 on 172.18.0.2: worker lost: Not receiving heartbeat for 60 seconds
