# Init OpenSky into Kafka

In [None]:
import json
import os
from datetime import datetime, timedelta, timezone

import requests
from kafka import KafkaProducer
from requests.auth import HTTPBasicAuth

# Field names of an OpenSky state vector, in positional order: the name at
# index i labels element i of each entry in the API's "states" array.
correspond = [
    "icao24",
    "callsign",
    "origin_country",
    "time_position",
    "last_contact",
    "longitude",
    "latitude",
    "baro_altitude",
    "on_ground",
    "velocity",
    "true_track",
    "vertical_rate",
    "sensors",
    "geo_altitude",
    "squawk",
    "spi",
    "position_source",
    "category",
]

# OpenSky credentials. Read from the environment so real secrets never have to
# be written into the notebook; the original placeholder values remain as
# fallbacks when the variables are not set.
USERNAME = os.environ.get("OPENSKY_USERNAME", "username")
PASSWORD = os.environ.get("OPENSKY_PASSWORD", "password")

def send_opensky_to_kafka(topic, url, fields=None):
    """Fetch state vectors from an OpenSky URL and publish them to Kafka.

    Every entry of the response's "states" array (a positional list) is turned
    into a dict keyed by the names in ``correspond``, stamped with a UTC
    ``created_at`` ISO-8601 string, and sent as one JSON-encoded message.

    Args:
        topic: Name of the Kafka topic to publish to.
        url: Fully formatted OpenSky REST URL to query.
        fields: Currently unused; kept so callers that pass a third argument
            keep working. Defaults to ``None`` instead of a shared mutable
            ``{}``.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    kafka_config = {
        'bootstrap_servers': 'kafka1:9092',
    }

    producer = KafkaProducer(
        bootstrap_servers=kafka_config['bootstrap_servers'],
        value_serializer=lambda v: json.dumps(v).encode('utf-8')
    )

    # try/finally guarantees the producer is closed even when the request,
    # the JSON decoding, or a send raises.
    try:
        # To authenticate, pass auth=HTTPBasicAuth(USERNAME, PASSWORD) here.
        # The timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            print(f"Failed to fetch data from OpenSky API. Status code: {response.status_code}")
            print(response.text)  # Print the response for debugging
            return

        data = response.json()
        # "states" may be absent or explicitly null; both mean there is
        # nothing to send.
        states = data.get("states") if isinstance(data, dict) else None
        if states is None:
            print("No valid data to send.")
            return

        for state in states:
            # zip() stops at the shorter sequence, so a state vector with
            # fewer elements than `correspond` no longer raises IndexError.
            flight_info = {key: value for key, value in zip(correspond, state) if key}
            flight_info["created_at"] = datetime.now(timezone.utc).isoformat()
            producer.send(topic, value=flight_info)
        producer.flush()
        print(f"Sent {len(states)} records to Kafka.")
    finally:
        producer.close()

# OpenSky "all state vectors" endpoint templates; the {} placeholders are
# filled with str.format (extended flag, and optionally a Unix timestamp).
api_url_time = "https://opensky-network.org/api/states/all?extended={}&time={}"
api_url = "https://opensky-network.org/api/states/all?extended={}"

# Unix timestamp five minutes in the past, suitable for `api_url_time`.
# Named `request_time` rather than `time` so it does not shadow the stdlib
# `time` module that a later cell imports and uses for `time.sleep`.
request_time = int((datetime.now(timezone.utc) - timedelta(minutes=5)).timestamp())

# Initial load: fetch the current state vectors (extended=1) into the topic.
send_opensky_to_kafka("opensky", api_url.format(1), {})


Sent 9954 records to Kafka.


## Initiate beginning

## Example with 10 calls.

In [80]:
import time

# Window state for the loop below. These names were previously expected to
# exist in the kernel already, so the cell raised NameError on a fresh kernel.
# NOTE(review): the values are reconstructed from the recorded first output
# line ("2025-01-26 14:00:00-2025-01-26 15:00:00") - confirm they are the
# intended starting point.
date_format = "%Y-%m-%d %H:%M:%S"
step = 60  # window length in minutes
initial_date = datetime(2025, 1, 26, 14, 0, 0)
end_date = initial_date + timedelta(minutes=step)
start = initial_date.strftime(date_format)
end = end_date.strftime(date_format)

for i in range(10):
    # The window is only printed; the request itself uses `api_url`, which
    # carries no time parameter.
    print(f"{start}-{end}")
    send_opensky_to_kafka("opensky", api_url.format(1), {})

    # Pause between calls, then slide the window forward by `step` minutes.
    time.sleep(1)
    initial_date = end_date
    end_date = initial_date + timedelta(minutes=step)
    start = initial_date.strftime(date_format)
    end = end_date.strftime(date_format)

2025-01-26 14:00:00-2025-01-26 15:00:00


KeyboardInterrupt: 

-------------------------------------------
Batch: 22
-------------------------------------------
+------------+----------+-------------+
|window_start|window_end|count_records|
+------------+----------+-------------+
+------------+----------+-------------+

-------------------------------------------
Batch: 23
-------------------------------------------
+------------+----------+-------------+
|window_start|window_end|count_records|
+------------+----------+-------------+
+------------+----------+-------------+



                                                                                

-------------------------------------------
Batch: 24
-------------------------------------------
+------------+----------+-------------+
|window_start|window_end|count_records|
+------------+----------+-------------+
+------------+----------+-------------+



## Imports

In [74]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, count, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType


## Spark

In [68]:
# SparkConf, SparkContext and SQLContext are not imported by any other visible
# cell, so import them here to keep this cell runnable on a fresh kernel.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Cluster configuration: standalone master, the hadoop-aws and Kafka source
# packages, and 10 shuffle partitions.
conf = SparkConf() \
    .setAppName('SparkApp') \
    .setMaster('spark://spark:7077') \
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .set("spark.sql.shuffle.partitions", "10")

# Reuse the running SparkContext if there is one, otherwise create it from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

# Create a SQLContext for SQL operations
sql_context = SQLContext(sc)
kafka_broker = "kafka1:9092"
kafka_topic = "opensky"



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Kafka configuration
kafka_broker = "kafka1:9092"
kafka_topic = "opensky"  # Update if necessary

# Schema of the JSON messages; only the fields used below are declared.
# `origin_country` is a country name, so it must be a string: declaring it as
# an integer makes from_json fail to parse the value and yield nulls.
schema = StructType([
    StructField("icao24", StringType(), True),
    StructField("origin_country", StringType(), True),
    StructField("created_at", StringType(), True)  # ISO-8601 string, converted to a timestamp below
])

# Initialize Spark session (getOrCreate reuses an already running context).
# NOTE(review): the "kafka" source presumably relies on the spark-sql-kafka
# package configured on the SparkConf in the previous cell - confirm that cell
# has run first.
spark = SparkSession.builder.appName("KafkaStreamExample").getOrCreate()

# Read raw data from Kafka, starting from the earliest available offsets
raw_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .load()

# Parse Kafka messages and convert `created_at` to `TimestampType`.
# The producer writes `created_at` with datetime.isoformat() (for example
# "2025-03-18T14:21:38.123456+00:00"), which the former pattern
# "yyyy-MM-dd HH:mm:ss" never matched. Without an explicit pattern,
# to_timestamp follows the string-to-timestamp cast rules, which accept
# ISO-8601 input.
parsed_stream = raw_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema).alias("data")) \
    .select(
        col("data.icao24").alias("icao24"),
        col("data.origin_country").alias("origin_country"),
        to_timestamp(col("data.created_at")).alias("created_at")
    )

# Filter out records where `icao24` or `origin_country` or `created_at` is null
valid_stream = parsed_stream.filter(
    col("icao24").isNotNull()
    & col("origin_country").isNotNull()
    & col("created_at").isNotNull()
)

# Output the `icao24` values to the console (in each batch)
query = valid_stream.select("icao24").writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

# Blocks the cell until the streaming query stops or the kernel is interrupted
query.awaitTermination()


25/03/18 14:21:38 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-e65ef5d3-ecad-4f83-bdde-002a2ca197ed. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/18 14:21:38 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/18 14:21:38 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+
|icao24|
+------+
+------+

