# Batch request 1


## Imports

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, avg, count, countDistinct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, FloatType


In [2]:
from pyspark.sql import SQLContext

# Build the Spark configuration step by step; each setter mutates `conf` in place.
conf = SparkConf()
conf.setAppName('SparkApp')
conf.setMaster('spark://spark:7077')
# Maven coordinates resolved at startup: the hadoop-aws package and the
# Spark SQL Kafka connector.
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3")
conf.set("spark.sql.shuffle.partitions", "10")

# Reuse the running SparkContext if one already exists, otherwise start one
# with the configuration above.
sc = SparkContext.getOrCreate(conf=conf)

# Create a SQLContext for SQL / DataFrame operations.
sql_context = SQLContext(sc)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5373b490-bd1e-4764-add3-18dba6f9ffa4;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.3 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.3 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	

In [None]:
# Kafka configuration
kafka_broker = "kafka1:9092"
kafka_topic = "opensky"

# Layout used to parse the JSON payload of each Kafka message.
# Every field is nullable, so a field missing from a message parses as null.
schema = StructType([
    StructField("icao24", StringType(), True),
    StructField("callsign", StringType(), True),
    StructField("origin_country", StringType(), True),
    StructField("time_position", IntegerType(), True),
    StructField("on_ground", StringType(), True),
    StructField("velocity", FloatType(), True),  # Speed in m/s
])

# Batch (non-streaming) read of the topic, starting from the earliest
# available offsets.
raw_stream = (
    sql_context.read
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_broker)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
)

# Parse the Kafka message values into typed columns.
parsed_stream = (
    raw_stream
    .selectExpr("CAST(value AS STRING) AS message")
    .select(from_json(col("message"), schema).alias("data"))
    .select(
        col("data.icao24").alias("icao24"),
        col("data.callsign").alias("callsign"),
        col("data.origin_country").alias("origin_country"),
        # Casting an integer to a timestamp interprets it as seconds since the epoch.
        col("data.time_position").cast(TimestampType()).alias("time_position"),
        # Already FloatType per the schema, so no extra cast is needed.
        col("data.velocity").alias("velocity"),
        col("data.on_ground").alias("on_ground"),
    )
    # Keep airborne records that carry a velocity. Filter on the projected
    # columns rather than on the intermediate `data` struct.
    # NOTE(review): assumes the producer serializes on_ground as lowercase
    # "false" -- confirm against the Kafka producer.
    .filter(col("velocity").isNotNull() & (col("on_ground") == "false"))
    # Remove fully identical records so repeated messages do not bias the average.
    .dropDuplicates()
)

# Average velocity per 5-minute window. With no slide duration the windows are
# tumbling (non-overlapping); the `rolling_avg_velocity` column name is kept
# for compatibility with downstream code.
rolling_avg_df = (
    parsed_stream
    .groupBy(window(col("time_position"), "5 minutes"))
    .agg(
        avg("velocity").alias("rolling_avg_velocity"),
        # Distinct aircraft in the window: an aircraft that reports several
        # positions within the same window is counted once.
        countDistinct("icao24").alias("num_flights"),
    )
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("rolling_avg_velocity"),
        col("num_flights"),
    )
    # groupBy output has no guaranteed order; sort so the series reads chronologically.
    .orderBy("window_start")
)

# Collect the aggregated result (one row per window) as a Pandas DataFrame
# for further analysis or reporting.
pandas_df = rolling_avg_df.toPandas()

# Print the per-window average result
print(pandas_df)

#import seaborn as sns
#import matplotlib.dates as md

25/03/19 07:34:12 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 0:>                                                          (0 + 1) / 2]