# Introduction

This project has been created using the "Flight Radar API" from "Rapid API" (https://rapidapi.com/apidojo/api/flight-radar1), who tracks the informations form FlightRadar24 (https://www.flightradar24.com/).

The aim of this project is to showcase how PySpark can be used to manage data.

# Importing libraries

In [47]:
!pip install pyspark



In [48]:
from pyspark.sql import SparkSession, Row, Window
from pyspark.sql.types import *
from pyspark.sql.functions import col, row_number, when
import requests
import json

In [100]:
from google.colab import drive
drive.mount('/content/drive')

folder = "/content/drive/MyDrive/portfolio/spark_flight_data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
with open(folder + "headers.json", "r") as f:
  headers = json.load(f)

# Creating Saprk Session

In [51]:
spark = SparkSession\
        .builder\
        .appName("flight_data")\
        .getOrCreate()

# Airline information

## Importing JSON

In [52]:
def getData(file_path, url, headers):
  try:
    f = open(file_path)
    data = json.load(f)

  except:

    response = requests.get(url, headers=headers)

    print(response.json())

    data = response.json()['rows']

    with open(file_path, 'w') as f:
      json.dump(data, f)

  return data

In [53]:
airline_file = 'airline_data.json'
airline_url = "https://flight-radar1.p.rapidapi.com/airlines/list"

airline_data = getData(folder + airline_file, airline_url, headers=headers)

airline_data[0]

{'Name': '21 Air', 'Code': '2I', 'ICAO': 'CSB'}

## Creating airline_dim

In [54]:
airline_rdd = spark.sparkContext.parallelize(airline_data)

In [55]:
airline_dim = spark.read.json(airline_rdd)

airline_dim.show()

+----+----+------------------+
|Code|ICAO|              Name|
+----+----+------------------+
|  2I| CSB|            21 Air|
|    | EMC|      247 Aviation|
|    | BRO|   2Excel Aviation|
|    | DAK|         4 Airways|
|  Q5| MLA|       40-Mile Air|
|  FE| IHO|  748 Air Services|
|  AQ| JYH|             9 Air|
|  S5| NKP|        Abakan Air|
|    | ABP|          ABS Jets|
|    | BAR|Abu Dhabi Aviation|
|  GB| ABX|           ABX Air|
|    | SON|     Acass Ireland|
|  AN| WSN|      Advanced Air|
|  A3| AEE|   Aegean Airlines|
|  EI| EIN|        Aer Lingus|
|  EG| EUK|     Aer Lingus UK|
|  JK| ACL|         AerCaribe|
|  5E| BLK|              Aero|
|  N2| NIG|              Aero|
|    | AKF|        Aero Asahi|
+----+----+------------------+
only showing top 20 rows



# Airports information

## Importing JSON

In [56]:
airports_file = 'airports_data.json'
airports_url = "https://flight-radar1.p.rapidapi.com/airports/list"

airports_data = getData(folder + airports_file, airports_url, headers=headers)

airports_data[0]

{'id': 1900,
 'name': 'A Coruna Airport',
 'iata': 'LCG',
 'icao': 'LECO',
 'city': 'A Coruna',
 'lat': 43.302059,
 'lon': -8.37725,
 'country': 'Spain',
 'alt': 326,
 'size': 4686,
 'timezone': {'name': 'Europe/Madrid',
  'offset': 7200,
  'offsetHours': '2:00',
  'abbr': 'CEST',
  'abbrName': 'Central European Summer Time',
  'isDst': True},
 'countryId': 209}

In [57]:
# Saving timezones info in a Dictionary

timezones = {}

for airport in airports_data:

  timezone = airport["timezone"]

  timezone_name = timezone["name"]

  if timezone_name not in timezones.keys():
    timezones[timezone_name] = timezone

timezones[ list(timezones.keys())[0] ]

{'name': 'Europe/Madrid',
 'offset': 7200,
 'offsetHours': '2:00',
 'abbr': 'CEST',
 'abbrName': 'Central European Summer Time',
 'isDst': True}

## Timezone information

### Saving timezone info in a DataFrame

In [58]:
# Converting input in a list of Rows

timezones_rows = [Row(**timezones[timezone_name]) for timezone_name in timezones.keys()]

timezones_rows[:5]

[Row(name='Europe/Madrid', offset=7200, offsetHours='2:00', abbr='CEST', abbrName='Central European Summer Time', isDst=True),
 Row(name='Europe/Berlin', offset=7200, offsetHours='2:00', abbr='CEST', abbrName='Central European Summer Time', isDst=True),
 Row(name='Europe/Copenhagen', offset=7200, offsetHours='2:00', abbr='CEST', abbrName='Central European Summer Time', isDst=True),
 Row(name='America/Godthab', offset=-3600, offsetHours='-1:00', abbr='-01', abbrName=None, isDst=True),
 Row(name='Asia/Tehran', offset=12600, offsetHours='3:30', abbr='+0330', abbrName=None, isDst=False)]

In [59]:
# Defining DataFrame schema
timezones_schema = StructType([
    StructField('name', StringType()),
    StructField('offset', IntegerType()),
    StructField('offsetHours', StringType()),
    StructField('abbr', StringType()),
    StructField('abbrName', StringType()),
    StructField('isDst', BooleanType())
])

# Creating DataFrame
timezones_df = spark.createDataFrame(timezones_rows, schema = timezones_schema)

timezones_df.show()

+-------------------+------+-----------+-----+--------------------+-----+
|               name|offset|offsetHours| abbr|            abbrName|isDst|
+-------------------+------+-----------+-----+--------------------+-----+
|      Europe/Madrid|  7200|       2:00| CEST|Central European ...| true|
|      Europe/Berlin|  7200|       2:00| CEST|Central European ...| true|
|  Europe/Copenhagen|  7200|       2:00| CEST|Central European ...| true|
|    America/Godthab| -3600|      -1:00|  -01|                NULL| true|
|        Asia/Tehran| 12600|       3:30|+0330|                NULL|false|
|   Asia/Krasnoyarsk| 25200|       7:00|  +07|                NULL|false|
|  America/Vancouver|-25200|      -7:00|  PDT|Pacific Daylight ...| true|
|     Pacific/Tarawa| 43200|      12:00|  +12|                NULL|false|
|      Europe/London|  3600|       1:00|  BST| British Summer Time| true|
|    America/Chicago|-18000|      -5:00|  CDT|Central Daylight ...| true|
|        Asia/Riyadh| 10800|       3:0

### Checking if the same 'abbr' can correspond to different 'abbrName'

In [60]:
# Creting a DataFrame contarining unique abbr-abbrName pairs
abbr_check = timezones_df \
                              .select(["abbr", "abbrName"]) \
                              .distinct()

# Select abbr that appears more than one time
abbr_check \
          .groupBy('abbr') \
          .count() \
          .sort("count", ascending=False) \
          .filter( col('count') > 1) \
          .show()

+----+-----+
|abbr|count|
+----+-----+
|WEST|    2|
| IST|    2|
|CEST|    2|
| CST|    2|
+----+-----+



In [61]:
# Check information about abbr that appears more than one time
abbr_check \
          .filter( abbr_check.abbr.isin('IST', 'CST', 'WEST', 'CEST') ) \
          .show(truncate = False)

+----+----------------------------+
|abbr|abbrName                    |
+----+----------------------------+
|CST |Central Standard Time       |
|IST |India Standard Time         |
|WEST|Western European Summer Time|
|CEST|Central European Summer Time|
|CST |China Standard Time         |
|IST |Irish Standard Time         |
|CEST|NULL                        |
|WEST|NULL                        |
+----+----------------------------+



### Updating DataFrame

In [62]:
timezones_df = timezones_df \
                            .withColumn( # Replace conflitting 'abbr'
                                      'abbr',
                                        when( col('abbrName') == 'China Standard Time', 'ChiST') \
                                        .when( col('abbrName') == 'Irish Standard Time', 'IriST') \
                                        .otherwise(timezones_df.abbr)
                                        ). \
                            withColumn( # Fill missing 'abbrName'
                                      'abbrName',
                                      when( col('abbr') == 'CEST', 'Central European Summer Time' ) \
                                      .when( col('abbr') == 'WEST', 'Western European Summer Time') \
                                      .otherwise(timezones_df.abbrName)
                                        )



timezones_df.show()

+-------------------+------+-----------+-----+--------------------+-----+
|               name|offset|offsetHours| abbr|            abbrName|isDst|
+-------------------+------+-----------+-----+--------------------+-----+
|      Europe/Madrid|  7200|       2:00| CEST|Central European ...| true|
|      Europe/Berlin|  7200|       2:00| CEST|Central European ...| true|
|  Europe/Copenhagen|  7200|       2:00| CEST|Central European ...| true|
|    America/Godthab| -3600|      -1:00|  -01|                NULL| true|
|        Asia/Tehran| 12600|       3:30|+0330|                NULL|false|
|   Asia/Krasnoyarsk| 25200|       7:00|  +07|                NULL|false|
|  America/Vancouver|-25200|      -7:00|  PDT|Pacific Daylight ...| true|
|     Pacific/Tarawa| 43200|      12:00|  +12|                NULL|false|
|      Europe/London|  3600|       1:00|  BST| British Summer Time| true|
|    America/Chicago|-18000|      -5:00|  CDT|Central Daylight ...| true|
|        Asia/Riyadh| 10800|       3:0

In [63]:
# Checking if update was successfull

abbr_check = timezones_df \
                              .select(["abbr", "abbrName"]) \
                              .distinct()

### Checking if the same 'abbr' can have multiple 'isDst'

In [64]:
abbr_check = timezones_df \
                              .select(["abbr", "isDst"]) \
                              .distinct()

abbr_check \
          .groupBy('abbr') \
          .count() \
          .sort("count", ascending=False) \
          .filter( col('count') > 1) \
          .show()

abbr_check.filter( col('abbr').isin('-01', '-02') ).show()

+----+-----+
|abbr|count|
+----+-----+
| -01|    2|
| -02|    2|
+----+-----+

+----+-----+
|abbr|isDst|
+----+-----+
| -01| true|
| -01|false|
| -02|false|
| -02| true|
+----+-----+



### Splitting DataFrame in different tables

In [65]:
timezones_df.columns

['name', 'offset', 'offsetHours', 'abbr', 'abbrName', 'isDst']

#### timezones_dim

In [66]:
timezones_dim = timezones_df \
                            .withColumn('timezoneId', row_number().over( Window.orderBy('offset')) ) \
                            .select(['timezoneId', 'name', 'offset', 'abbr', 'isDst']) \
                            .withColumnRenamed('name', 'timezoneName')


timezones_dim.show()

+----------+------------------+------+----+-----+
|timezoneId|      timezoneName|offset|abbr|isDst|
+----------+------------------+------+----+-----+
|         1|      Africa/Dakar|  NULL| GMT|false|
|         2|    Africa/Abidjan|  NULL| GMT|false|
|         3|      Africa/Accra|  NULL| GMT|false|
|         4|   Atlantic/Azores|  NULL| +00| true|
|         5|Atlantic/Reykjavik|  NULL| GMT|false|
|         6|    Africa/Conakry|  NULL| GMT|false|
|         7|     Africa/Bamako|  NULL| GMT|false|
|         8|               UTC|  NULL| UTC|false|
|         9| Africa/Nouakchott|  NULL| GMT|false|
|        10|Africa/Ouagadougou|  NULL| GMT|false|
|        11|   Africa/Freetown|  NULL| GMT|false|
|        12|       Africa/Lome|  NULL| GMT|false|
|        13|Atlantic/St_Helena|  NULL| GMT|false|
|        14|     Africa/Banjul|  NULL| GMT|false|
|        15|     Africa/Bissau|  NULL| GMT|false|
|        16|   Africa/Monrovia|  NULL| GMT|false|
|        17|   Africa/Sao_Tome|  NULL| GMT|false|


#### timezones_abbr_dim

In [67]:
timezones_abbr_dim = timezones_df \
                                    .select(['abbr', 'abbrName']) \
                                    .distinct() \
                                    .withColumn('abbrId', row_number().over( Window.orderBy('abbr'))) \
                                    .select(['abbrId', 'abbr', 'abbrName'])

timezones_abbr_dim.show()

+------+-----+--------+
|abbrId| abbr|abbrName|
+------+-----+--------+
|     1|  +00|    NULL|
|     2|  +01|    NULL|
|     3|  +03|    NULL|
|     4|+0330|    NULL|
|     5|  +04|    NULL|
|     6|+0430|    NULL|
|     7|  +05|    NULL|
|     8|+0530|    NULL|
|     9|+0545|    NULL|
|    10|  +06|    NULL|
|    11|+0630|    NULL|
|    12|  +07|    NULL|
|    13|  +08|    NULL|
|    14|+0845|    NULL|
|    15|  +09|    NULL|
|    16|  +10|    NULL|
|    17|+1030|    NULL|
|    18|  +11|    NULL|
|    19|  +12|    NULL|
|    20|+1245|    NULL|
+------+-----+--------+
only showing top 20 rows



#### Refactoring timezone_dim

In [68]:
# Replacing 'abbr' with 'abbrId' in timezone_dim

timezones_dim = timezones_dim \
                                .join(
                                    timezones_abbr_dim,
                                    timezones_dim.abbr == timezones_abbr_dim.abbr,
                                    'left'
                                    ) \
                                .select(['timezoneId', 'timezoneName', 'offset', 'isDst', 'abbrId'])

timezones_dim.show()

+----------+------------------+------+-----+------+
|timezoneId|      timezoneName|offset|isDst|abbrId|
+----------+------------------+------+-----+------+
|         1|      Africa/Dakar|  NULL|false|    52|
|         2|    Africa/Abidjan|  NULL|false|    52|
|         3|      Africa/Accra|  NULL|false|    52|
|         4|   Atlantic/Azores|  NULL| true|     1|
|         5|Atlantic/Reykjavik|  NULL|false|    52|
|         6|    Africa/Conakry|  NULL|false|    52|
|         7|     Africa/Bamako|  NULL|false|    52|
|         8|               UTC|  NULL|false|    71|
|         9| Africa/Nouakchott|  NULL|false|    52|
|        10|Africa/Ouagadougou|  NULL|false|    52|
|        11|   Africa/Freetown|  NULL|false|    52|
|        12|       Africa/Lome|  NULL|false|    52|
|        13|Atlantic/St_Helena|  NULL|false|    52|
|        14|     Africa/Banjul|  NULL|false|    52|
|        15|     Africa/Bissau|  NULL|false|    52|
|        16|   Africa/Monrovia|  NULL|false|    52|
|        17|

## Geographic information

### Creating countries_dim table

In [69]:
# Getting set of unique countries name
countries_set = { airport['country'] for airport in airports_data}

In [70]:
# Convert set in DataFrame
# Rename 'value' col in 'countryName'
# Add 'abbrId'
# Change cols order
countries_dim = spark \
                    .createDataFrame(countries_set, StringType()) \
                    .withColumnRenamed('value', 'country') \
                    .withColumn('countryId', row_number().over( Window.orderBy('country'))) \
                    .select(["countryId", "country"])

countries_dim.show()

+---------+-------------------+
|countryId|            country|
+---------+-------------------+
|        1|        Afghanistan|
|        2|            Albania|
|        3|            Algeria|
|        4|     American Samoa|
|        5|             Angola|
|        6|           Anguilla|
|        7|         Antarctica|
|        8|Antigua And Barbuda|
|        9|          Argentina|
|       10|            Armenia|
|       11|              Aruba|
|       12|          Australia|
|       13|            Austria|
|       14|         Azerbaijan|
|       15|            Bahamas|
|       16|            Bahrain|
|       17|         Bangladesh|
|       18|           Barbados|
|       19|            Belarus|
|       20|            Belgium|
+---------+-------------------+
only showing top 20 rows



### Creating cities_dim table

In [71]:
# Create set of unique city-country pairs

cities_set = { (airport['city'], airport['country']) for airport in airports_data}

In [72]:
# Converting set in DataFrame
cities_dim = spark \
              .createDataFrame(cities_set, schema = ["city", "country"]) \
              .withColumn('cityId', row_number().over( Window.orderBy('city')))

cities_dim.show()

+---------------+--------------------+------+
|           city|             country|cityId|
+---------------+--------------------+------+
|       A Coruna|               Spain|     1|
|         Aachen|             Germany|     2|
|        Aalborg|             Denmark|     3|
|         Aarhus|             Denmark|     4|
|        Aasiaat|           Greenland|     5|
|         Abadan|                Iran|     6|
|         Abakan|              Russia|     7|
|     Abbotsford|              Canada|     8|
|  Abemama Atoll|            Kiribati|     9|
|       Aberdeen|      United Kingdom|    10|
|       Aberdeen|       United States|    11|
|           Abha|        Saudi Arabia|    12|
|        Abidjan|         Ivory Coast|    13|
|        Abilene|       United States|    14|
|       Abingdon|       United States|    15|
|      Abu Dhabi|United Arab Emirates|    16|
|Abu Musa Island|                Iran|    17|
|     Abu Simbel|               Egypt|    18|
|          Abuja|             Nige

In [73]:
# Replacing 'country' with 'countryId'

cities_dim = cities_dim \
                      .join(
                          countries_dim,
                          'country',
                          'left'
                      ) \
                      .select( ['cityId', 'city', 'countryId'])

cities_dim.show()

+------+---------------+---------+
|cityId|           city|countryId|
+------+---------------+---------+
|     7|         Abakan|      170|
|     9|  Abemama Atoll|      109|
|     2|         Aachen|       76|
|    13|        Abidjan|      102|
|    21|          Accra|       77|
|    11|       Aberdeen|      216|
|    14|        Abilene|      216|
|    15|       Abingdon|      216|
|    19|          Abuja|      151|
|     1|       A Coruna|      192|
|     3|        Aalborg|       54|
|     4|         Aarhus|       54|
|     6|         Abadan|       96|
|    17|Abu Musa Island|       96|
|    20|       Acapulco|      133|
|    12|           Abha|      179|
|    16|      Abu Dhabi|      214|
|     8|     Abbotsford|       35|
|     5|        Aasiaat|       80|
|    18|     Abu Simbel|       59|
+------+---------------+---------+
only showing top 20 rows



### Creating airports_dim



In [74]:
def airport_typecaster(key, value):

  # Keep only the name of the timezone
  if key == 'timezone':
    return value['name']

  # Some 'lat' e 'lon' values are encoded as Float and some as Int,
  # so we have to convert all of them to Float
  if key in ('lat', 'lon'):
    return float(value)

  # Some values are encoded as String, so they have to be encoded as Int
  if key == 'alt':
    return int(value)

  return value

In [75]:
airports_col_to_remove = ["country"]

airports_data_typecasted = [
                            {key: airport_typecaster(key, value) for key, value in airport.items() if key not in airports_col_to_remove }
                             for airport in airports_data
                            ]

In [76]:
airport_rows = [Row(**airport) for airport in airports_data_typecasted]

airport_rows[:4]

[Row(id=1900, name='A Coruna Airport', iata='LCG', icao='LECO', city='A Coruna', lat=43.302059, lon=-8.37725, alt=326, size=4686, timezone='Europe/Madrid', countryId=209),
 Row(id=3, name='Aachen Merzbruck Airport', iata='AAH', icao='EDKA', city='Aachen', lat=50.821899, lon=6.184759, alt=626, size=1491, timezone='Europe/Berlin', countryId=83),
 Row(id=4, name='Aalborg Airport', iata='AAL', icao='EKYT', city='Aalborg', lat=57.095112, lon=9.855172, alt=3, size=8856, timezone='Europe/Copenhagen', countryId=61),
 Row(id=9, name='Aarhus Airport', iata='AAR', icao='EKAH', city='Aarhus', lat=56.303295, lon=10.619129, alt=71, size=3919, timezone='Europe/Copenhagen', countryId=61)]

In [77]:
airport_schema = StructType([
    StructField('id', IntegerType()),
    StructField('airportName', StringType()),
    StructField('iata', StringType()),
    StructField('icao', StringType()),
    StructField('city', StringType()),
    StructField('lat', DoubleType()),
    StructField('lon', DoubleType()),
    StructField('alt', IntegerType()),
    StructField('size', IntegerType()),
    StructField('timezoneName', StringType()),
    StructField('countryId', IntegerType()),
])

In [78]:
airport_df = spark.createDataFrame(airport_rows, airport_schema)

airport_df.show()

+----+--------------------+----+----+---------------+---------+-----------+----+-----+-----------------+---------+
|  id|         airportName|iata|icao|           city|      lat|        lon| alt| size|     timezoneName|countryId|
+----+--------------------+----+----+---------------+---------+-----------+----+-----+-----------------+---------+
|1900|    A Coruna Airport| LCG|LECO|       A Coruna|43.302059|   -8.37725| 326| 4686|    Europe/Madrid|      209|
|   3|Aachen Merzbruck ...| AAH|EDKA|         Aachen|50.821899|   6.184759| 626| 1491|    Europe/Berlin|       83|
|   4|     Aalborg Airport| AAL|EKYT|        Aalborg|57.095112|   9.855172|   3| 8856|Europe/Copenhagen|       61|
|   9|      Aarhus Airport| AAR|EKAH|         Aarhus|56.303295|  10.619129|  71| 3919|Europe/Copenhagen|       61|
|7569|  Aarhus Sea Airport| QEA|EKAC|         Aarhus|56.151993|  10.247725|   1|  139|Europe/Copenhagen|       61|
|1596|     Aasiaat Airport| JEG|BGAA|        Aasiaat| 68.72184| -52.784698|  74|

In [79]:
cities_dim.show()

+------+---------------+---------+
|cityId|           city|countryId|
+------+---------------+---------+
|     7|         Abakan|      170|
|     9|  Abemama Atoll|      109|
|     2|         Aachen|       76|
|    13|        Abidjan|      102|
|    21|          Accra|       77|
|    11|       Aberdeen|      216|
|    14|        Abilene|      216|
|    15|       Abingdon|      216|
|    19|          Abuja|      151|
|     1|       A Coruna|      192|
|     3|        Aalborg|       54|
|     4|         Aarhus|       54|
|     6|         Abadan|       96|
|    17|Abu Musa Island|       96|
|    20|       Acapulco|      133|
|    12|           Abha|      179|
|    16|      Abu Dhabi|      214|
|     8|     Abbotsford|       35|
|     5|        Aasiaat|       80|
|    18|     Abu Simbel|       59|
+------+---------------+---------+
only showing top 20 rows



### Creating airport_dim

In [80]:
airport_df.show()

+----+--------------------+----+----+---------------+---------+-----------+----+-----+-----------------+---------+
|  id|         airportName|iata|icao|           city|      lat|        lon| alt| size|     timezoneName|countryId|
+----+--------------------+----+----+---------------+---------+-----------+----+-----+-----------------+---------+
|1900|    A Coruna Airport| LCG|LECO|       A Coruna|43.302059|   -8.37725| 326| 4686|    Europe/Madrid|      209|
|   3|Aachen Merzbruck ...| AAH|EDKA|         Aachen|50.821899|   6.184759| 626| 1491|    Europe/Berlin|       83|
|   4|     Aalborg Airport| AAL|EKYT|        Aalborg|57.095112|   9.855172|   3| 8856|Europe/Copenhagen|       61|
|   9|      Aarhus Airport| AAR|EKAH|         Aarhus|56.303295|  10.619129|  71| 3919|Europe/Copenhagen|       61|
|7569|  Aarhus Sea Airport| QEA|EKAC|         Aarhus|56.151993|  10.247725|   1|  139|Europe/Copenhagen|       61|
|1596|     Aasiaat Airport| JEG|BGAA|        Aasiaat| 68.72184| -52.784698|  74|

In [81]:
airport_dim = airport_df \
                            .join( # Replace 'timezone' with 'timezoneId'
                                timezones_dim,
                                'timezoneName',
                                'left'
                              ) \
                            .join( # Replace 'city' with 'cityId'
                                cities_dim,
                                airport_df.city == cities_dim.city,
                                'left'
                              ) \
                            .select(
                                [ 'id', 'airportName', 'iata', 'icao', 'lat', 'lon', 'size', 'cityId', 'timezoneId' ]
                            )

airport_dim.show()

+----+--------------------+----+----+---------+-----------+-----+------+----------+
|  id|         airportName|iata|icao|      lat|        lon| size|cityId|timezoneId|
+----+--------------------+----+----+---------+-----------+-----+------+----------+
|4206|      Masset Airport| ZMT|CZMT|    54.02|-132.119995| 1334|  2851|        32|
|7633|      Maxton Airport| MXE|KMEB|34.792423| -79.369392|  435|  2870|       104|
|  21|  Abu Simbel Airport| ABS|HEBL| 22.37595|  31.611719|   70|    18|       264|
|2094|Matamoros Interna...| MAM|MMMA| 25.76989| -97.525299| 1248|  2857|        67|
|6909|Matak Tarempa Air...| MWK|WIOM|  3.34812| 106.258003|    3|  2856|       306|
|7403|Abingdon Virginia...| VJI|KVJI|36.686111| -82.033333| 1778|    15|       104|
|2273|Maumere Frans Sed...| MOF|WATC| -8.64064| 122.236801|  883|  2866|       319|
|  11|Abakan Internatio...| ABA|UNAA|53.740002|  91.385002| 2146|     7|       308|
|  16|Abidjan Port Boue...| ABJ|DIAP| 5.261386|   -3.92629|11907|    13|    

# Aircraft information

## Importing JSON

In [82]:
aircraft_file = 'aircraft_data.json'
aircraft_url = "https://flight-radar1.p.rapidapi.com/aircrafts/list"

aircraft_data = getData(folder + aircraft_file, aircraft_url, headers=headers)

## Saving aircarft info in a DataFrame

In [83]:
aircraft_rows = []

for family in aircraft_data:
  for model in family['models']:
      model['family'] = family['description']
      aircraft_rows.append(Row(**model))

In [84]:
aircraft_df = spark.createDataFrame(aircraft_rows, schema = ['aircraftName', 'code', 'family'])

aircraft_df.show()

+----------------+----+------------------+
|    aircraftName|code|            family|
+----------------+----+------------------+
| Airbus A220-100|BCS1|Airbus A220 family|
| Airbus A220-300|BCS3|Airbus A220 family|
|     Airbus A300|A30B|Airbus A300 family|
| Airbus A300-600|A306|Airbus A300 family|
|     Airbus A310|A310|Airbus A310 family|
|     Airbus A318|A318|Airbus A320 family|
|     Airbus A319|A319|Airbus A320 family|
|  Airbus A319neo|A19N|Airbus A320 family|
|     Airbus A320|A320|Airbus A320 family|
|  Airbus A320neo|A20N|Airbus A320 family|
|     Airbus A321|A321|Airbus A320 family|
|  Airbus A321neo|A21N|Airbus A320 family|
| Airbus A330-200|A332|Airbus A330 family|
| Airbus A330-300|A333|Airbus A330 family|
| Airbus A330-900|A339|Airbus A330 family|
| Airbus A340-200|A342|Airbus A340 family|
| Airbus A340-300|A343|Airbus A340 family|
| Airbus A340-500|A345|Airbus A340 family|
| Airbus A340-600|A346|Airbus A340 family|
|Airbus A350-1000|A35K|Airbus A350 family|
+----------

In [85]:
aircraft_df.groupBy("family").count().sort("count", ascending=False).show()

+--------------------+-----+
|              family|count|
+--------------------+-----+
|   Boeing 737 family|   11|
|  Airbus A320 family|    7|
|   Boeing 747 family|    5|
|Bombardier CRJ fa...|    5|
|McDonnell Douglas...|    5|
|Embraer E-Jet family|    5|
|  Airbus A340 family|    4|
|   Boeing 777 family|    4|
|Bombardier Dash 8...|    4|
|  Airbus A330 family|    3|
|      Avro RJ family|    3|
|       ATR 42 family|    3|
|       ATR 72 family|    3|
|       Fokker family|    3|
|   Boeing 787 family|    3|
|   Boeing 767 family|    3|
|  Airbus A220 family|    2|
|  Airbus A350 family|    2|
|  Airbus A300 family|    2|
|   Boeing 757 family|    2|
+--------------------+-----+
only showing top 20 rows



## Creating aircraft_family_dim

In [86]:
aircraft_family_dim = aircraft_df \
                                  .select('family') \
                                  .distinct() \
                                  .withColumn('familyId', row_number().over( Window.orderBy('family') ))

aircraft_family_dim.show()

+--------------------+--------+
|              family|familyId|
+--------------------+--------+
|       ATR 42 family|       1|
|       ATR 72 family|       2|
|  Airbus A220 family|       3|
|  Airbus A300 family|       4|
|  Airbus A310 family|       5|
|  Airbus A320 family|       6|
|  Airbus A330 family|       7|
|  Airbus A340 family|       8|
|  Airbus A350 family|       9|
|     Airbus A380-800|      10|
|      Avro RJ family|      11|
|   Boeing 737 family|      12|
|   Boeing 747 family|      13|
|   Boeing 757 family|      14|
|   Boeing 767 family|      15|
|   Boeing 777 family|      16|
|   Boeing 787 family|      17|
|Bombardier CRJ fa...|      18|
|Bombardier Dash 8...|      19|
|Embraer E-Jet family|      20|
+--------------------+--------+
only showing top 20 rows



## Creating aircraft_dim

In [87]:
aircraft_dim = aircraft_df \
                          .join(
                              aircraft_family_dim,
                              'family',
                              'left'
                          ) \
                          .select( [ 'aircraftName', 'code', 'familyId']) \

aircraft_dim.show()

+----------------+----+--------+
|    aircraftName|code|familyId|
+----------------+----+--------+
| Airbus A220-100|BCS1|       3|
| Airbus A220-300|BCS3|       3|
| Airbus A330-200|A332|       7|
| Airbus A330-300|A333|       7|
| Airbus A330-900|A339|       7|
| Airbus A340-200|A342|       8|
| Airbus A340-300|A343|       8|
| Airbus A340-500|A345|       8|
| Airbus A340-600|A346|       8|
|     Airbus A310|A310|       5|
|Airbus A350-1000|A35K|       9|
| Airbus A350-900|A359|       9|
|     Airbus A300|A30B|       4|
| Airbus A300-600|A306|       4|
|     Airbus A318|A318|       6|
|     Airbus A319|A319|       6|
|  Airbus A319neo|A19N|       6|
|     Airbus A320|A320|       6|
|  Airbus A320neo|A20N|       6|
|     Airbus A321|A321|       6|
+----------------+----+--------+
only showing top 20 rows



# Flights

##  Most tracked

### Importing JSON

In [115]:
def get_flight_data(folder, url, headers):

    response = requests.get(url, headers=headers)

    print(response.json())

    data = response.json()["data"]

    file_name = folder + "flight" + str(response.json()["update_time"]) + ".json"

    with open(file_name, 'w') as f:
      json.dump(data, f)

    return(data)

In [146]:
flight_url = "https://flight-radar1.p.rapidapi.com/flights/list-most-tracked"
flight_folder = "/content/drive/MyDrive/portfolio/spark_flight_data/flight/"

flight_data = get_flight_data(flight_folder, flight_url, headers)

{'version': '0.3.9', 'update_time': 1721863313.602942, 'data': [{'flight_id': '364fbb07', 'flight': None, 'callsign': 'SNBD01', 'squawk': None, 'clicks': 1276, 'from_iata': 'DLH', 'from_city': 'Duluth', 'to_iata': None, 'to_city': None, 'model': 'CL41', 'type': 'Canadair CT-114 Tutor'}, {'flight_id': '364f9cdd', 'flight': None, 'callsign': 'N529B', 'squawk': None, 'clicks': 720, 'from_iata': 'OSH', 'from_city': 'Oshkosh', 'to_iata': None, 'to_city': None, 'model': 'B29', 'type': 'Boeing B-29A Superfortress'}, {'flight_id': '364fd7d6', 'flight': 'JX701', 'callsign': 'SJX701', 'squawk': None, 'clicks': 456, 'from_iata': 'TPE', 'from_city': 'Taipei', 'to_iata': 'DAD', 'to_city': 'Da Nang', 'model': 'A339', 'type': 'Airbus A330-941'}, {'flight_id': '364fddf1', 'flight': 'IT246', 'callsign': 'TTW246', 'squawk': None, 'clicks': 404, 'from_iata': 'TPE', 'from_city': 'Taipei', 'to_iata': 'HSG', 'to_city': 'Saga', 'model': 'A20N', 'type': 'Airbus A320-271N'}, {'flight_id': '364fdf6d', 'flight':

In [145]:
flight_data

[{'flight_id': '364fbb07',
  'flight': None,
  'callsign': 'SNBD01',
  'squawk': None,
  'clicks': 1264,
  'from_iata': 'DLH',
  'from_city': 'Duluth',
  'to_iata': None,
  'to_city': None,
  'model': 'CL41',
  'type': 'Canadair CT-114 Tutor'},
 {'flight_id': '364f9cdd',
  'flight': None,
  'callsign': 'N529B',
  'squawk': None,
  'clicks': 812,
  'from_iata': 'OSH',
  'from_city': 'Oshkosh',
  'to_iata': None,
  'to_city': None,
  'model': 'B29',
  'type': 'Boeing B-29A Superfortress'},
 {'flight_id': '364fd7d6',
  'flight': 'JX701',
  'callsign': 'SJX701',
  'squawk': None,
  'clicks': 508,
  'from_iata': 'TPE',
  'from_city': 'Taipei',
  'to_iata': 'DAD',
  'to_city': 'Da Nang',
  'model': 'A339',
  'type': 'Airbus A330-941'},
 {'flight_id': '364fddf1',
  'flight': 'IT246',
  'callsign': 'TTW246',
  'squawk': None,
  'clicks': 388,
  'from_iata': 'TPE',
  'from_city': 'Taipei',
  'to_iata': 'HSG',
  'to_city': 'Saga',
  'model': 'A20N',
  'type': 'Airbus A320-271N'},
 {'flight_id': 

### Saving info in a DataFrame

In [147]:
# Converting list of Dictionaries in a list of Rows
fligth_rows = [Row(**flight) for flight in flight_data]

In [148]:
# Defining schema
flight_schema = StructType([
    StructField('flightId', StringType()),
    StructField('flight', StringType()),
    StructField('callsign', StringType()),
    StructField('squawk', StringType()),
    StructField('clicks', IntegerType()),
    StructField('from_iata', StringType()),
    StructField('from_city', StringType()),
    StructField('to_iata', StringType()),
    StructField('to_city', StringType()),
    StructField('model', StringType()),
    StructField('type', StringType())
])

In [149]:
flight_data = spark.read.json(flight_folder + "flight*.json")

flight_data.show()

+--------+------+------+---------+-----------+---------+-----+------+--------+-------+--------------------+
|callsign|clicks|flight|flight_id|  from_city|from_iata|model|squawk| to_city|to_iata|                type|
+--------+------+------+---------+-----------+---------+-----+------+--------+-------+--------------------+
|  CFR230|   440|CFR230| 364fb83b|       NULL|     NULL|  V10|  NULL|    NULL|   NULL|North American OV...|
|   CFR88|   352| CFR88| 364f9c6b|       NULL|     NULL|  S2T|  NULL|    NULL|   NULL|Marsh S-2F3AT Tur...|
| ABD7743|   268|CC7743| 364e1732|  Frankfurt|      HHN| B744|  NULL| Chicago|    RFD| Boeing 747-481(BCF)|
|   CFR91|   264| CFR91| 364fd229|      Chico|      CIC|  S2T|  NULL|    NULL|   NULL|Marsh S-2F3AT Tur...|
|   CFR89|   260| CFR89| 364fcf58|      Chico|      CIC|  S2T|  NULL|    NULL|   NULL|Marsh S-2F3AT Tur...|
|  ASL135|   200| JU135| 364f809b|     Moscow|      SVO| A320|  NULL|Belgrade|    BEG|     Airbus A320-232|
|   CFR93|   156| CFR93| 364

In [151]:
flight_data.count()

40

In [150]:
fligth_df = flight_data \
                .select( # Removing 'from_city' and 'to_city' since can be retreved using 'from_iata' e 'to_iata'.
                         # Removing aircraft 'type' since can be retreaved using 'model'
                    [ 'flight_id', 'flight', 'callsign', 'squawk', 'clicks', 'from_iata', 'to_iata', 'model']
                ) \
                .withColumn( # COunt how many times the same id appears
                    "isDuplicate", row_number().over( Window.orderBy("clicks").partitionBy("flight_id") )
                ) \
                .filter( # Drop all the outdated rows
                    col("isDuplicate") == 1
                ) \
                .drop( # Drop 'isDuplicate', since no longer needed
                    "isDuplicate"
                )

fligth_df.show()

+---------+------+--------+------+------+---------+-------+-----+
|flight_id|flight|callsign|squawk|clicks|from_iata|to_iata|model|
+---------+------+--------+------+------+---------+-------+-----+
| 364e1732|CC7743| ABD7743|  NULL|   268|      HHN|    RFD| B744|
| 364f468b| AI984|  AIC984|  NULL|   152|      DXB|    BOM| A359|
| 364f809b| JU135|  ASL135|  NULL|   200|      SVO|    BEG| A320|
| 364f9c6b| CFR88|   CFR88|  NULL|   160|     NULL|   NULL|  S2T|
| 364f9cdd|  NULL|   N529B|  NULL|   720|      OSH|   NULL|  B29|
| 364fb83b|CFR230|  CFR230|  NULL|   436|     NULL|   NULL|  V10|
| 364fbb07|  NULL|  SNBD01|  NULL|  1264|      DLH|   NULL| CL41|
| 364fc75f|  NULL|  N10601|  NULL|   112|      FLD|   NULL|  P51|
| 364fc8f6|  NULL|   N144P|  NULL|   116|      OSH|   NULL| RV14|
| 364fca33| IT654|  TTW654|  NULL|   152|      TPE|    CJU| A320|
| 364fcaef| IT230|  TTW230|  NULL|   172|      TPE|    OKA| A320|
| 364fcdf7|  NULL|   N10BA|  NULL|   124|      OSH|   NULL| CC11|
| 364fcf0e

In [152]:
fligth_df.count()

24