In [1]:
# Inspect the SparkContext.
# NOTE(review): no cell above defines `sc`; presumably the kernel pre-creates it — confirm.
sc

In [2]:
# Inspect the SparkSession.
# NOTE(review): no cell above defines `spark`; presumably the kernel pre-creates it — confirm.
spark

### 1. Create a new Spark session instance

In [3]:
# Shut down the existing context and session so that new ones can be created
# with an explicit configuration in the following cells.
sc.stop()
spark.stop()

In [4]:
from pyspark import SparkConf, SparkContext

# setMaster chooses the cluster manager: 'local[4]' runs Spark locally with 4 worker threads.
config = SparkConf()
config.setMaster('local[4]')
config.setAppName("PySparkSession")

sc = SparkContext(conf=config)

In [5]:
# Display the SparkContext created in the previous cell
sc

In [6]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame/SQL API. getOrCreate() hands back an
# already-running session when there is one and builds a new one otherwise.
spark = (
    SparkSession.builder
    .appName("SQLSession")
    .getOrCreate()
)

In [7]:
# Display the SparkSession created in the previous cell
spark

In [8]:
!hdfs dfs -mkdir /flights

In [9]:
!hdfs dfs -copyFromLocal /home/hadoop/Downloads/raw_flight_data.csv /flights

In [44]:
# Load the flights CSV from HDFS: the first line supplies the column names
# and the column types are inferred from the data.
flights_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://localhost:9000/flights/raw_flight_data.csv")
)

In [14]:
# head(3) returns the first three rows as a list of Row objects
flights_df.head(3)

[Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=11433, DestAirportID=13303, DepDelay=-3, ArrDelay=1),
 Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=14869, DestAirportID=12478, DepDelay=0, ArrDelay=-8),
 Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=14057, DestAirportID=14869, DepDelay=-4, ArrDelay=-15)]

In [15]:
flights_df.show()     # print the leading rows as a formatted text table

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
|        19|        5|     DL|          10397|        15016|      -1|     -19|
|        19|        5|     DL|          15016|        10397|       0|      -1|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|      

In [16]:
# select() projects the DataFrame down to the columns named here

selected_columns = ['DayofMonth', 'Carrier', 'OriginAirportID', 'DepDelay']
flights_df.select(*selected_columns).show()

+----------+-------+---------------+--------+
|DayofMonth|Carrier|OriginAirportID|DepDelay|
+----------+-------+---------------+--------+
|        19|     DL|          11433|      -3|
|        19|     DL|          14869|       0|
|        19|     DL|          14057|      -4|
|        19|     DL|          15016|      28|
|        19|     DL|          11193|      -6|
|        19|     DL|          10397|      -1|
|        19|     DL|          15016|       0|
|        19|     DL|          10397|      15|
|        19|     DL|          10397|      33|
|        19|     DL|          11278|     323|
|        19|     DL|          14107|      -7|
|        19|     DL|          11433|      22|
|        19|     DL|          11298|      40|
|        19|     DL|          11433|      -2|
|        19|     DL|          10397|      71|
|        19|     DL|          12451|      75|
|        19|     DL|          12953|      -1|
|        19|     DL|          11433|      -3|
|        19|     DL|          1039

In [17]:
# distinct() removes repeated rows, so each carrier code is listed once

flights_df.select('Carrier').distinct().show()

+-------+
|Carrier|
+-------+
|     UA|
|     AA|
|     EV|
|     B6|
|     DL|
|     OO|
|     F9|
|     YV|
|     US|
|     MQ|
|     HA|
|     AS|
|     FL|
|     VX|
|     WN|
|     9E|
+-------+



In [19]:
# list the names of all columns in the DataFrame (a plain Python list)
flights_df.columns

['DayofMonth',
 'DayOfWeek',
 'Carrier',
 'OriginAirportID',
 'DestAirportID',
 'DepDelay',
 'ArrDelay']

In [21]:
# count() returns the total number of rows in the DataFrame

flights_df.count()

2719418

In [22]:
# number of distinct carriers
unique_carriers = flights_df.select('Carrier').distinct()
unique_carriers.count()

16

In [25]:
# 3. printSchema - prints each column's name, data type and nullability

flights_df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



In [26]:
# 4. where() - keeps the rows for which the boolean Column condition holds
#    (the DataFrame counterpart of RDD.filter); first() returns the first such Row

departed_late = flights_df['DepDelay'] > 0
flights_df.where(departed_late).first()

Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=15016, DestAirportID=11433, DepDelay=28, ArrDelay=24)

In [28]:
# Same condition as above, displayed as a table instead of a single Row
flights_df.where(flights_df['DepDelay'] > 0).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|        10397|     323|     322|
|        19|        5|     DL|          11433|        11298|      22|      41|
|        19|        5|     DL|          11298|        11433|      40|      20|
|        19|        5|     DL|          10397|        12451|      71|      75|
|        19|        5|     DL|          12451|        10397|      75|      57|
|        19|        5|     DL|          10397|        14771|      31|      38|
|        19|        5|     DL|          13204|      

In [29]:
# Conditions are combined with & : rows must have a positive DepDelay AND a positive ArrDelay
late_departure = flights_df['DepDelay'] > 0
late_arrival = flights_df['ArrDelay'] > 0
flights_df.where(late_departure & late_arrival).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|        10397|     323|     322|
|        19|        5|     DL|          11433|        11298|      22|      41|
|        19|        5|     DL|          11298|        11433|      40|      20|
|        19|        5|     DL|          10397|        12451|      71|      75|
|        19|        5|     DL|          12451|        10397|      75|      57|
|        19|        5|     DL|          10397|        14771|      31|      38|
|        19|        5|     DL|          13204|      

In [30]:
# 5. filter() - keeps only the rows that satisfy the condition
flights_df.filter(
    (flights_df['DepDelay'] > 0) & (flights_df['ArrDelay'] > 0)
).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|        10397|     323|     322|
|        19|        5|     DL|          11433|        11298|      22|      41|
|        19|        5|     DL|          11298|        11433|      40|      20|
|        19|        5|     DL|          10397|        12451|      71|      75|
|        19|        5|     DL|          12451|        10397|      75|      57|
|        19|        5|     DL|          10397|        14771|      31|      38|
|        19|        5|     DL|          13204|      

In [31]:
# 6. isin - builds a boolean Column that is true when Carrier equals one of the listed
#    values (exact membership, not pattern matching); nothing is filtered until it is
#    passed to where()/filter()
flights_df.Carrier.isin('DL', 'F9', 'UA', '9E')

Column<b'(Carrier IN (DL, F9, UA, 9E))'>

In [32]:
# Keep only the flights operated by one of the listed carriers
wanted_carriers = ['DL', 'F9', 'UA', '9E']
flights_df.where(flights_df['Carrier'].isin(wanted_carriers)).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
|        19|        5|     DL|          10397|        15016|      -1|     -19|
|        19|        5|     DL|          15016|        10397|       0|      -1|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|      

In [35]:
# 7. Read airports.csv from the local filesystem as a Spark DataFrame
airports_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('file:///home/hadoop/Downloads/airports.csv')
)

In [36]:
# Preview the airports lookup table
airports_df.show()

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
|     10926|    Cordova|   AK|Merle K Mudhole S...|
|     14709|  Deadhorse|   AK|   Deadhorse Airport|
|     11336| Dillingham|   AK|  Dillingham Airport|
|     11630|  Fairbanks|   AK|Fairbanks Interna...|
|     11997|   Gustavus|   AK|    Gustavus Airport|
|     12523|     Juneau|   AK|Juneau International|
|     12819|  Ketchikan|   AK|Ketchikan Interna...|
|     10245|King Salmon|   AK| King Salmon Airport|
|     10170|     Kodiak|   AK|      Kodiak Airport|
|     13970|   Kotzebue|   AK| Ralph Wien Memorial|
|     13873|       Nome|   AK|        Nome Airport|
|     14256|

In [38]:
# 8. join() - combine two DataFrames on a condition: every flight is paired with
#    the airport whose airport_id equals the flight's OriginAirportID.
origin_matches = flights_df.OriginAirportID == airports_df.airport_id
flights_airportDF = flights_df.join(airports_df, on=origin_matches, how='inner')

In [43]:
# NOTE(review): the saved output below has no city/state columns, so it appears to have been
# produced after the In [41] cell rebound flights_airportDF — confirm with Restart & Run All.
flights_airportDF.show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -11|     11193|Cincinnati/Northe...|
|        19|        5|     DL|          10397|        15016|      -1|     -19|     10397|Hartsfield-Jackso...|
|

In [41]:
# Join against only the airport_id and name columns so city/state are not carried along
airport_names = airports_df.select('airport_id', 'name')
flights_airportDF = flights_df.join(
    airport_names,
    flights_df.OriginAirportID == airports_df.airport_id,
)
flights_airportDF.show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -11|     11193|Cincinnati/Northe...|
|        19|        5|     DL|          10397|        15016|      -1|     -19|     10397|Hartsfield-Jackso...|
|

### 9. Drop duplicates
###### drop duplicate records from the existing DataFrame

In [45]:

# Called without arguments, dropDuplicates() compares all columns,
# so only rows that are identical in every column are collapsed into one.
flights_df1 = flights_df.dropDuplicates()

In [49]:
# TASK: Calculate the percentage of data lost after dropDuplicates
rows_after = flights_df1.count()
rows_before = flights_df.count()
retained = rows_after*100/rows_before   # percentage of rows that survived de-duplication
print(f"Percent loss in data after dropping duplicates: {100 - retained}")

Percent loss in data after dropping duplicates: 0.8249927006440316


### 10. describe 
###### gives summary statistics for the numeric and string columns of the DataFrame
###### includes count, mean, stddev, min, max

In [51]:
# One output row per statistic (count, mean, stddev, min, max), one column per DataFrame column
flights_df1.describe().show()

+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|summary|        DayofMonth|         DayOfWeek|Carrier|   OriginAirportID|     DestAirportID|          DepDelay|          ArrDelay|
+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|  count|           2696983|           2696983|2696983|           2696983|           2696983|           2674774|           2673185|
|   mean|15.798996508320593| 3.900369412784582|   null|12742.459424846207| 12742.85937657004|10.618575625454712|6.7272897311633875|
| stddev| 8.801267199135454|1.9864582421701973|   null|1502.0359941370625|1501.9939589817989|36.198308432512704| 38.75007476808384|
|    min|                 1|                 1|     9E|             10140|             10140|               -63|               -94|
|    max|                31|                 7|     YV|             15376|  

### 12. summary
###### use summary method for detailed summary of columns
###### shows count, mean, stddev, min, max, q1, q2, q3

In [52]:
# Like describe(), but the output also contains percentile rows (25%, ...)
flights_df1.summary().show()

+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|summary|        DayofMonth|         DayOfWeek|Carrier|   OriginAirportID|     DestAirportID|          DepDelay|          ArrDelay|
+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|  count|           2696983|           2696983|2696983|           2696983|           2696983|           2674774|           2673185|
|   mean|15.798996508320593| 3.900369412784582|   null|12742.459424846207| 12742.85937657004|10.618575625454712|6.7272897311633875|
| stddev| 8.801267199135454|1.9864582421701973|   null|1502.0359941370625|1501.9939589817989|36.198308432512704| 38.75007476808384|
|    min|                 1|                 1|     9E|             10140|             10140|               -63|               -94|
|    25%|                 8|                 2|   null|             11292|  

### 13. select categorical columns

In [54]:
from pyspark.sql.types import IntegerType, StringType
# NOTE: this wildcard import replaces Python built-ins such as sum and round with Spark's
# column functions of the same name. Later cells rely on that (sum(...) and round(...) are
# applied to Columns), so it cannot be narrowed without updating those cells as well.
from pyspark.sql.functions import *

In [56]:
# Names of all string-typed columns; these are treated as the categorical columns
categorical_columns = [
    schema_field.name
    for schema_field in flights_df1.schema
    if isinstance(schema_field.dataType, StringType)
]

In [57]:
# Show the detected categorical column names
categorical_columns

['Carrier']

### 14. col, groupBy()
###### col function is used to refer to a column of a DataFrame
###### groupBy() - method to group rows of a DataFrame based on values of one or more columns

In [58]:
# col() refers to a DataFrame column by name
# where to use it? in column expressions such as filter conditions and aggregations

from pyspark.sql.functions import col

In [60]:
# Count the rows whose ArrDelay is greater than 10
arrived_late = col('ArrDelay') > 10
flights_df1.filter(arrived_late).count()

664460

In [61]:
# Frequency table (value -> number of rows) for every categorical column
for column in categorical_columns:
    per_value_counts = flights_df1.groupBy(column).count()
    per_value_counts.show()

+-------+------+
|Carrier| count|
+-------+------+
|     UA|286010|
|     AA|288910|
|     EV|157218|
|     B6|121875|
|     DL|381601|
|     OO|159639|
|     F9| 35736|
|     YV| 52740|
|     US|232955|
|     MQ|112113|
|     HA| 17424|
|     AS| 68544|
|     FL| 92674|
|     VX| 34726|
|     WN|575090|
|     9E| 79728|
+-------+------+



### 15. isNull()
###### returns boolean outcome for missing values

In [63]:
# Show the rows in which ArrDelay is missing
arr_delay_missing = col('ArrDelay').isNull()
flights_df1.filter(arr_delay_missing).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        17|        3|     DL|          14869|        14771|    null|    null|
|        11|        4|     EV|          12266|        13871|    null|    null|
|        19|        5|     EV|          11618|        11433|    null|    null|
|        10|        3|     EV|          13930|        13851|    null|    null|
|         9|        2|     EV|          11292|        14107|    null|    null|
|        18|        4|     AA|          13303|        12892|    null|    null|
|        11|        4|     AA|          13930|        11292|    null|    null|
|        18|        4|     AA|          11298|        14107|    null|    null|
|        16|        2|     AA|          11278|        13930|    null|    null|
|        17|        3|     AA|          13930|      

In [80]:
# Number of missing values in one column:
# isNull() yields a boolean Column, filter() keeps the rows where it is true,
# and count() returns how many such rows there are (an integer)
flights_df1.filter(col('ArrDelay').isNull()).count()

23798

In [95]:
# Null count for every column in one query.
# when(cond, value) yields `value` only where the cell is null and null elsewhere; the plain
# str passed as `value` acts as a literal marker, and count() tallies the non-null markers.
null_markers = [
    count(when(isnull(name), name)).alias(name)
    for name in flights_df1.columns
]
flights_df1.select(null_markers).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         0|        0|      0|              0|            0|   22209|   23798|
+----------+---------+-------+---------------+-------------+--------+--------+



In [None]:
# alternative approach- use col() and isNull()

In [98]:
from pyspark.sql.functions import col

# One filter + count per column; each entry is a {column name: null count} dict
null_counts = [
    {name: flights_df1.filter(col(name).isNull()).count()}
    for name in flights_df1.columns
]
null_counts

[{'DayofMonth': 0},
 {'DayOfWeek': 0},
 {'Carrier': 0},
 {'OriginAirportID': 0},
 {'DestAirportID': 0},
 {'DepDelay': 22209},
 {'ArrDelay': 23798}]

In [100]:
# Single-query alternative to the per-column loop above:
# cast each isNull() flag to 0/1 and add the flags up per column
null_totals = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in flights_df1.columns
]
flights_df1.select(null_totals).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         0|        0|      0|              0|            0|   22209|   23798|
+----------+---------+-------+---------------+-------------+--------+--------+



# 16. fillna()
###### replace or fill the missing values of a column, e.g. with a measure of central tendency (mean, median, mode)
###### here, will replace missing values by 0's

In [105]:
# Replace missing delays with 0, then confirm that no nulls remain in any column
flights_df2 = flights_df1.fillna(0, subset=['DepDelay', 'ArrDelay'])
remaining_nulls = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in flights_df2.columns
]
flights_df2.select(remaining_nulls).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         0|        0|      0|              0|            0|       0|       0|
+----------+---------+-------+---------------+-------------+--------+--------+



# 17. dropna()
###### used to remove rows with null values

In [None]:
# Drop every row that has a null in any column (how='any' is the default, spelled out here)
flights_df3 = flights_df1.dropna(how='any')

# 18. statistical methods
##### mean(), median(), stddev(), quartiles()

In [109]:
# Mean of DepDelay, rounded to 3 decimal places
mean_dep_delay = round(mean(col('DepDelay')), 3)
flights_df1.select(mean_dep_delay.alias('meanDepDelay')).show()

+------------+
|meanDepDelay|
+------------+
|      10.619|
+------------+



In [113]:
# Standard deviation of DepDelay, rounded to 3 decimal places
stddev_dep_delay = round(stddev(col('DepDelay')), 3)
flights_df1.select(stddev_dep_delay.alias('stdDevDepDelay')).show()

+--------------+
|stdDevDepDelay|
+--------------+
|        36.198|
+--------------+



In [112]:
# Variance of DepDelay, rounded to 3 decimal places
variance_dep_delay = round(variance(col('DepDelay')), 3)
flights_df1.select(variance_dep_delay.alias('varianceDepDelay')).show()

+----------------+
|varianceDepDelay|
+----------------+
|        1310.318|
+----------------+



In [116]:
# The median is the 0.5 quantile; it can be computed with approxQuantile or with expr.
# approxQuantile returns one value per requested probability, hence the [0].
arr_delay_quantiles = flights_df1.approxQuantile('ArrDelay', [0.5], relativeError=0.0001)
arr_delay_quantiles[0]

-3.0

In [117]:
# collect() brings the result to the driver as a list of Row objects;
# [0][0] picks the first Row and then its first field, giving a plain Python number
mean_rows = flights_df1.select(round(mean(col('DepDelay')),3).alias('meanDepDelay')).collect()
mean_rows[0][0]

10.619

# 19. groupBy and aggregate

In [120]:
# groupBy() on its own only returns a GroupedData object;
# an aggregation such as agg() or count() must follow to get a DataFrame back
flights_df1.groupBy('Carrier')

<pyspark.sql.group.GroupedData at 0x7fb459652be0>

In [121]:
# Average DepDelay per carrier
flights_by_carrier = flights_df1.groupBy('Carrier')
flights_by_carrier.agg(mean('DepDelay')).show()

+-------+------------------+
|Carrier|     avg(DepDelay)|
+-------+------------------+
|     UA|12.644186783024843|
|     AA|12.154096505870111|
|     EV| 14.52813602113455|
|     B6|12.675216069471794|
|     DL| 7.451940716867138|
|     OO| 7.954327121364983|
|     F9|12.142480802645592|
|     YV| 9.595018289496604|
|     US| 5.011879623272143|
|     MQ|15.612577198673192|
|     HA|1.5358414704192993|
|     AS|0.6606730403765751|
|     FL|10.227206113831024|
|     VX|14.416962353959326|
|     WN|12.930658050935614|
|     9E| 9.767838809034908|
+-------+------------------+



In [132]:
# Average ArrDelay and DepDelay per carrier, side by side
flights_df1.groupBy('Carrier').agg(
    mean('ArrDelay'),
    mean('DepDelay'),
).show()

+-------+--------------------+------------------+
|Carrier|       avg(ArrDelay)|     avg(DepDelay)|
+-------+--------------------+------------------+
|     UA|   5.207155029152466|12.644186783024843|
|     AA|    7.22786703097812|12.154096505870111|
|     EV|  10.501436641191532| 14.52813602113455|
|     B6|   9.679335778153199|12.675216069471794|
|     DL|  2.8085929091567747| 7.451940716867138|
|     OO|   6.447766785619039| 7.954327121364983|
|     F9|  12.870312237233028|12.142480802645592|
|     YV|   8.749505833107245| 9.595018289496604|
|     US|   3.957719324788726| 5.011879623272143|
|     MQ|   14.27679662028746|15.612577198673192|
|     HA|   1.534325271442523|1.5358414704192993|
|     AS|-0.27272328542814955|0.6606730403765751|
|     FL|   7.277437501357486|10.227206113831024|
|     VX|   9.678802215555043|14.416962353959326|
|     WN|   8.368672739670938|12.930658050935614|
|     9E|   4.931550031523011| 9.767838809034908|
+-------+--------------------+------------------+


### 20. WithColumn
###### any column wise transformation will go through withColumn
###### for all column operations - create new or modify existing column in DataFrame
###### also used for applying transformations or calculations
###### df.withColumn(colName, col)

In [131]:
# Add a derived column: TotalDelay = DepDelay + ArrDelay
total_delay = col("DepDelay") + col("ArrDelay")
flights_df1.withColumn("TotalDelay", total_delay).show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|TotalDelay|
+----------+---------+-------+---------------+-------------+--------+--------+----------+
|         6|        1|     WN|          10821|        10140|       1|     -22|       -21|
|         8|        1|     AA|          11298|        10140|       0|       6|         6|
|        15|        1|     WN|          14747|        10140|      -6|       3|        -3|
|        27|        1|     AA|          11298|        10140|     113|     117|       230|
|         7|        2|     OO|          12266|        10140|      -3|     -11|       -14|
|        28|        2|     WN|          14107|        10140|      -3|       0|        -3|
|        30|        2|     OO|          12266|        10140|      -4|     -11|       -15|
|         1|        3|     EV|          12266|        10140|     -11|     -26|       -37|
|         

In [133]:
# Label each flight from its departure delay.
# flights_df1 still contains rows with a null DepDelay. A comparison with null is neither
# true nor false, so a bare otherwise('No Delay') would label those unknown rows 'No Delay'.
# Both outcomes are therefore tested explicitly; with no otherwise(), rows matching neither
# condition (null DepDelay) get a null label instead of a misleading one.
flights_df1.withColumn(
    "IsDelay?",
    when(col("DepDelay") > 0, 'Delay')
    .when(col("DepDelay") <= 0, 'No Delay'),
).show()

+----------+---------+-------+---------------+-------------+--------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|IsDelay?|
+----------+---------+-------+---------------+-------------+--------+--------+--------+
|         6|        1|     WN|          10821|        10140|       1|     -22|   Delay|
|         8|        1|     AA|          11298|        10140|       0|       6|No Delay|
|        15|        1|     WN|          14747|        10140|      -6|       3|No Delay|
|        27|        1|     AA|          11298|        10140|     113|     117|   Delay|
|         7|        2|     OO|          12266|        10140|      -3|     -11|No Delay|
|        28|        2|     WN|          14107|        10140|      -3|       0|No Delay|
|        30|        2|     OO|          12266|        10140|      -4|     -11|No Delay|
|         1|        3|     EV|          12266|        10140|     -11|     -26|No Delay|
|         3|        3|     OO|  

In [136]:
# expr(): compute the median with a SQL expression instead of approxQuantile()

from pyspark.sql.functions import expr

# Name the column once rather than interpolating a string literal into the f-string.
median_column = 'ArrDelay'
median_expr = expr(f"percentile_approx({median_column}, 0.5)")

# Aggregate over the de-duplicated frame (flights_df1), like every other statistic in this
# notebook, so the result is comparable with the approxQuantile() median.
# agg() yields a one-row DataFrame; collect()[0][0] extracts the scalar.
flights_df1.agg(median_expr.alias('Median')).collect()[0][0]

-3