### Import libraries

In [1]:
import os
import sys

from pyspark.sql import functions as F

# The shared helpers live in a ``BaseUtils`` directory that is a sibling of the
# current working directory; put it on sys.path so ``hdfs_io`` can be imported.
path = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'BaseUtils'))
if path not in sys.path:  # re-running this cell must not append duplicates
    sys.path.append(path)

# Import the single name this notebook uses instead of a wildcard, so it is
# clear where ``HDFS_IO`` comes from and nothing else leaks into the namespace.
from hdfs_io import HDFS_IO

# Helper object whose ``base_url`` / ``user_path`` attributes are read further
# down to build the HDFS destination path.
hdfs_obj = HDFS_IO()

In [2]:
# Instantiate the FlightRadar24 API client used by the cells below

from FlightRadar24 import FlightRadar24API
fr_api = FlightRadar24API()

### Get Airports

In [3]:
# Fetch the airports from the API client; the result is indexed and iterated
# over in the following cells.
airports = fr_api.get_airports()

In [4]:
# Inspect the first airport object to see which attributes can be extracted.
print(dir(airports[0]))

['_Airport__get_info', '_Airport__initialize_with_basic_info', '_Airport__initialize_with_info', '__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_default_text', 'altitude', 'country', 'get_distance_from', 'iata', 'icao', 'latitude', 'longitude', 'name', 'set_airport_details']


In [5]:
def _float_or_none(value):
    """Return ``float(value)``, passing ``None`` through unchanged."""
    return None if value is None else float(value)


def _airport_record(airport):
    """Flatten one airport object into a plain dict of its basic fields."""
    return {
        'iata': airport.iata,
        'icao': airport.icao,
        'name': airport.name,
        'altitude': _float_or_none(airport.altitude),
        'latitude': _float_or_none(airport.latitude),
        'longitude': _float_or_none(airport.longitude),
        'country': airport.country,
    }


# One dict per airport (despite the variable name, the elements are dicts,
# not tuples); numeric fields are coerced to float, missing ones stay None.
airports_tuples = [_airport_record(airport) for airport in airports]

### Start Spark

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T

# Build (or reuse, via getOrCreate) a session against the Spark master at
# spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("LocalSpark")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Report which Spark version the session is running.
print("Spark version:", spark.version)

Spark version: 3.5.0


#### Create airports dataframe

In [7]:
# Explicit schema for the airports DataFrame: column name -> Spark type, in
# output column order. The three numeric columns are doubles.
_AIRPORT_COLUMNS = [
    ("iata", T.StringType()),
    ("icao", T.StringType()),
    ("name", T.StringType()),
    ("altitude", T.DoubleType()),
    ("latitude", T.DoubleType()),
    ("longitude", T.DoubleType()),
    ("country", T.StringType()),
]

# Every column is nullable (third StructField argument is True).
airports_schema = T.StructType(
    [T.StructField(column, dtype, True) for column, dtype in _AIRPORT_COLUMNS]
)

In [8]:
# Materialize the per-airport dicts as a Spark DataFrame using the explicit schema.
airports_df = spark.createDataFrame(
    data=airports_tuples,
    schema=airports_schema,
)

In [9]:
# Preview the first rows of the airports DataFrame as a text table.
airports_df.show()

+----+----+--------------------+--------+---------+-----------+--------------------+
|iata|icao|                name|altitude| latitude|  longitude|             country|
+----+----+--------------------+--------+---------+-----------+--------------------+
| LCG|LECO|    A Coruna Airport|   326.0|43.302059|   -8.37725|               Spain|
| AAH|EDKA|Aachen Merzbruck ...|   626.0|50.821899|   6.184759|             Germany|
| AAL|EKYT|     Aalborg Airport|     3.0|57.095112|   9.855172|             Denmark|
| AAR|EKAH|      Aarhus Airport|    71.0|56.303295|  10.619129|             Denmark|
| QEA|EKAC|  Aarhus Sea Airport|     1.0|56.151993|  10.247725|             Denmark|
| JEG|BGAA|     Aasiaat Airport|    74.0| 68.72184| -52.784698|           Greenland|
| ABD|OIAA|      Abadan Airport|    19.0|30.371111|  48.228329|                Iran|
| ABA|UNAA|Abakan Internatio...|   831.0|53.740002|  91.385002|              Russia|
| YXX|CYXX|Abbotsford Intern...|   195.0|49.025269|-122.360001|  

#### Store to HDFS

In [10]:
# HDFS target directory: <base_url><user_path>FlightRadarApi/airports
destination_path = ''.join(
    (hdfs_obj.base_url, hdfs_obj.user_path, 'FlightRadarApi/airports')
)
print(destination_path)

hdfs://namenode:9000/user/jovyan/FlightRadarApi/airports


In [11]:
# Count the rows of the airports DataFrame.
# NOTE(review): the recorded output of this cell is a Py4JError ("Answer from
# Java side is empty") raised while calling count; the cause is not visible
# here — TODO confirm the session is healthy and re-run before relying on it.
airports_df.count()

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 55068)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py

Py4JError: An error occurred while calling o39.count

In [None]:
# Persist the airports as Parquet at the HDFS destination, replacing any
# data that already exists there.
airports_df.write.mode('overwrite').parquet(destination_path)

In [None]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()