In [0]:
%sh
# Extract flights.zip inside /Volumes/workspace/aerospace/flights/ and list the directory.
# Stop at the first failing command so a failed `cd` can never unzip into another directory.
set -e
cd /Volumes/workspace/aerospace/flights/
# -o overwrites already-extracted files without prompting, which keeps the cell re-runnable.
unzip -o flights.zip
ls -al

In [0]:
# Where the data lives and which reader format to use
file_location = "/Volumes/workspace/aerospace/flights/flights.csv"
file_type = "csv"

# Reader settings for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The options below target CSV files; for other file types they will be ignored.
df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

display(df)

In [0]:
# Persist the loaded flights DataFrame as the table `aerospace.flights`.
# mode("overwrite") replaces an existing table; the default save mode raises an
# error when the table already exists, which made this cell fail on every re-run.
df.write.mode("overwrite").saveAsTable('aerospace.flights')


In [0]:
%sql
-- Preview the persisted flights table. The name is schema-qualified because no
-- `flights` temp view is created in this notebook, so the bare name would only
-- resolve when the session's current schema happens to be `aerospace`.
select * from aerospace.flights

In [0]:
%sql
-- Show every row and column of aerospace.airlines.
SELECT *
FROM aerospace.airlines

TODO:

1. Show the top 20 shortest flights by air time.
2. Show them with the full names of the airports the flights depart from (Origin) and arrive at (Dest).
3. Also add latitude and longitude, then visualize them on a map.
4. Count the number of all planes.
5. Create a list of carriers (full airline names) with their average arrival delay and number of operated flights. Visualize these delays.

In [0]:
%sql
-- List the tables that exist in the aerospace schema.
SHOW TABLES IN aerospace

In [0]:

%sql
-- Inspect the AirTime column. The table is schema-qualified, like the other queries
-- in this notebook, so the cell does not depend on the session's current schema or
-- on a `flights` temp view existing.
select AirTime from aerospace.flights

In [0]:
%sql
-- Attach the airport name of both endpoints to every flight by joining
-- aerospace.airports twice: once on Origin and once on Dest.
SELECT
  Origin,
  Dest,
  origin_airports.AIRPORT AS `Origin Airport`,
  dest_airports.AIRPORT   AS `Dest Airport`
FROM aerospace.flights
INNER JOIN aerospace.airports AS origin_airports
  ON flights.Origin = origin_airports.IATA_CODE
INNER JOIN aerospace.airports AS dest_airports
  ON flights.Dest = dest_airports.IATA_CODE

In [0]:
%sql
-- Temp view with the 20 distinct rows that have the smallest non-null AirTime, each
-- carrying the name, coordinates and IATA code of its origin and destination airport.
-- Origin and Dest are ORDER BY tie-breakers: a temp view is re-evaluated by every
-- query that reads it, and with ties on AirTime alone LIMIT 20 is free to keep a
-- different set of rows on each evaluation.
create or replace temporary view `shortest_flights` as select DISTINCT
  Origin, 
  Dest, 
  AirTime,
  origin_airports.AIRPORT as `Origin Airport`, 
  origin_airports.LATITUDE as `Origin Latitude`, 
  origin_airports.LONGITUDE as `Origin Longitude`, 
  origin_airports.IATA_CODE as `Origin IATA Code`,
  dest_airports.AIRPORT as `Dest Airport`,
  dest_airports.LATITUDE as `Dest Latitude`, 
  dest_airports.LONGITUDE as `Dest Longitude`,  
  dest_airports.IATA_CODE as `Dest IATA Code`
from aerospace.flights 
join 
aerospace.airports as `origin_airports` on flights.Origin = origin_airports.IATA_CODE join 
aerospace.airports as `dest_airports` on flights.Dest = dest_airports.IATA_CODE 
where AirTime is not null
order by AirTime asc, Origin asc, Dest asc
limit 20

In [0]:
%sql
-- Distinct origin airports (code plus coordinates) found in the shortest_flights view.
SELECT DISTINCT
  `Origin IATA Code` AS IATA_CODE,
  `Origin Latitude`  AS Latitude,
  `Origin Longitude` AS Longitude
FROM shortest_flights

In [0]:
%sql
-- Every airport that appears as origin or destination in shortest_flights, as one
-- list of (code, latitude, longitude); UNION removes duplicate rows.
SELECT
  `Origin IATA Code` AS IATA_CODE,
  `Origin Latitude`  AS Latitude,
  `Origin Longitude` AS Longitude
FROM shortest_flights
UNION
SELECT
  `Dest IATA Code` AS IATA_CODE,
  `Dest Latitude`  AS Latitude,
  `Dest Longitude` AS Longitude
FROM shortest_flights

In [0]:
%sql
-- Flights whose Carrier value differs from UniqueCarrier.
SELECT *
FROM aerospace.flights
WHERE Carrier <> UniqueCarrier

In [0]:
%sql
-- One row per distinct non-null tail number; the number of result rows is the
-- number of distinct tail numbers in the flights table.
SELECT DISTINCT tailnum
FROM aerospace.flights
WHERE tailnum IS NOT NULL

In [0]:
%sql
-- Full contents of the aerospace.airlines table.
SELECT *
FROM aerospace.airlines

In [0]:
%sql
-- Per carrier: code, full airline name, average ArrDelay and number of flight rows,
-- sorted by ascending average delay. The result is meant to be visualized.
SELECT
  Carrier,
  ar.AIRLINE,
  avg(ArrDelay),
  count(Carrier)
FROM aerospace.flights
INNER JOIN aerospace.airlines AS ar
  ON flights.Carrier = ar.IATA_CODE
GROUP BY Carrier, ar.AIRLINE
ORDER BY avg(ArrDelay)

In [0]:
%sql
-- Select every row and column of aerospace.flights.
SELECT *
FROM aerospace.flights

In [0]:
# Build a small table of ArrDelay quantiles at 5% steps (0.05, 0.10, ..., 1.00).
# Read the table explicitly instead of using `_sqldf`: that implicit variable holds
# the result of whichever %sql cell ran last, so the cell silently depended on
# execution order.
df = spark.table("aerospace.flights")
# Probabilities 0.05 .. 1.0; round() strips float noise such as 0.15000000000000002.
frames = [round(0.05*x, 2) for x in range(1, 21)]
# A relative error of 0.0 asks for exact quantiles; the result holds one list per
# requested column, hence the [0] below.
quant_arrdelay = df.approxQuantile(["ArrDelay"], frames, 0.0)
print(quant_arrdelay)
quant_delays = [round(x, 2) for x in quant_arrdelay[0]]
# Pair each probability with its quantile value.
quant_matrix = list(zip(frames, quant_delays))
print(quant_matrix)
df_quant_matrix = spark.createDataFrame(quant_matrix, ["Percentile", "ArrDelay"])
display(df_quant_matrix)