In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the session used by every cell below; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = SparkSession.builder.appName("Dataframe Operations").getOrCreate()


23/05/21 00:10:41 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/05/21 00:10:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/21 00:10:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Input files, given relative to the directory the notebook is run from.
delaysPath = "./data/7-departuredelays.csv"
airportsPath = "./data/7-airport-codes-na.txt"

In [4]:
# Read the airport lookup file: tab-separated, first line holds column names.
airports = (
    spark.read
    .option("Delimiter", "\t")
    .option("header", True)
    .csv(airportsPath)
)

# Expose the frame to SQL cells as `airports_na`, then preview a few rows.
airports.createOrReplaceTempView("airports_na")
airports.show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import *

In [6]:
# Load the departure-delay records. No schema is supplied or inferred, so the
# CSV columns are read as strings and the numeric ones are cast explicitly.
delays = (
    spark.read
    .option("header", True)
    .csv(delaysPath)
    # withColumn() already names the resulting column, so the SQL expressions
    # carry no trailing `AS <name>` alias (it would be redundant).
    .withColumn("delay", expr("CAST(delay AS INT)"))
    .withColumn("distance", expr("CAST(distance AS INT)"))
    # `date` arrives as an 'MMddHHmm' string; re-render it as 'MM-dd HH:mm'
    # so later cells can match on prefixes such as '01-01 0%'.
    .withColumn(
        "date",
        expr("date_format(from_unixtime(unix_timestamp(date, 'MMddHHmm')), 'MM-dd HH:mm')"),
    )
)
delays.show(5)

# Make the cleaned frame queryable from SQL cells as `departureDelays`.
delays.createOrReplaceTempView("departureDelays")

+-----------+-----+--------+------+-----------+
|       date|delay|distance|origin|destination|
+-----------+-----+--------+------+-----------+
|01-01 12:45|    6|     602|   ABE|        ATL|
|01-02 06:00|   -8|     369|   ABE|        DTW|
|01-02 12:45|   -2|     602|   ABE|        ATL|
|01-02 06:05|   -4|     602|   ABE|        ATL|
|01-03 12:45|   -4|     602|   ABE|        ATL|
+-----------+-----+--------+------+-----------+
only showing top 5 rows



In [7]:
# Check the column types after loading: `delay` and `distance` were cast to
# integers, the remaining columns stay strings.
delays.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [8]:
# Predicate: SEA -> SFO flights on 01-01 whose hour starts with '0'
# (i.e. before 10:00) and whose delay is positive.
sea_sfo_early_delayed = expr("""origin == 'SEA' AND destination == 'SFO' AND
    date like '01-01 0%' AND delay > 0""")

# Small working subset of `delays`, also exposed to SQL as the view `foo`.
foo = delays.filter(sea_sfo_early_delayed)
foo.createOrReplaceTempView("foo")

In [9]:
# Display the filtered SEA -> SFO subset held in `foo`.
foo.show()



+-----------+-----+--------+------+-----------+
|       date|delay|distance|origin|destination|
+-----------+-----+--------+------+-----------+
|01-01 07:10|   31|     590|   SEA|        SFO|
|01-01 09:55|  104|     590|   SEA|        SFO|
|01-01 07:30|    5|     590|   SEA|        SFO|
+-----------+-----+--------+------+-----------+



                                                                                

#### Unions

In [10]:
# Append the rows of `foo` to `delays`. union() keeps duplicates, and `foo` is
# itself a filtered subset of `delays`, so those rows now occur twice in `bar`.
bar = delays.union(foo)
# Expose the combined frame to SQL cells as the view `bar`.
bar.createOrReplaceTempView("bar")

In [11]:
# Re-apply the filter that produced `foo` to the unioned view `bar`; each
# matching flight is expected twice (once from `delays`, once from `foo`).
sea_sfo_in_bar = spark.sql("""
    SELECT *
        FROM bar
    WHERE origin = 'SEA'
        AND destination = 'SFO'
        AND date LIKE '01-01 0%'
        AND delay > 0
""")
sea_sfo_in_bar.show()

+-----------+-----+--------+------+-----------+
|       date|delay|distance|origin|destination|
+-----------+-----+--------+------+-----------+
|01-01 07:10|   31|     590|   SEA|        SFO|
|01-01 09:55|  104|     590|   SEA|        SFO|
|01-01 07:30|    5|     590|   SEA|        SFO|
|01-01 07:10|   31|     590|   SEA|        SFO|
|01-01 09:55|  104|     590|   SEA|        SFO|
|01-01 07:30|    5|     590|   SEA|        SFO|
+-----------+-----+--------+------+-----------+



#### Joins

In [12]:
# DataFrame API: look up City/State for each flight in `foo` by matching the
# flight's origin code against the airport's IATA code (default inner join).
(
    foo
    .join(airports, on=foo.origin == airports.IATA)
    .select("City", "State", "date", "delay", "distance", "destination")
    .show()
)

# The same join written in SQL against the `foo` and `airports_na` views.
foo_with_origin_airport = spark.sql("""
    SELECT a.City, a.State, f.date, f.delay, f.distance, f.destination
        FROM foo f
        JOIN airports_na a
            ON a.IATA = f.origin
""")
foo_with_origin_airport.show()

+-------+-----+-----------+-----+--------+-----------+
|   City|State|       date|delay|distance|destination|
+-------+-----+-----------+-----+--------+-----------+
|Seattle|   WA|01-01 07:10|   31|     590|        SFO|
|Seattle|   WA|01-01 09:55|  104|     590|        SFO|
|Seattle|   WA|01-01 07:30|    5|     590|        SFO|
+-------+-----+-----------+-----+--------+-----------+

+-------+-----+-----------+-----+--------+-----------+
|   City|State|       date|delay|distance|destination|
+-------+-----+-----------+-----+--------+-----------+
|Seattle|   WA|01-01 07:10|   31|     590|        SFO|
|Seattle|   WA|01-01 09:55|  104|     590|        SFO|
|Seattle|   WA|01-01 07:30|    5|     590|        SFO|
+-------+-----+-----------+-----+--------+-----------+



#### Windowing

In [13]:
# Sum of `delay` per (origin, destination) pair, restricted to three origins
# and seven destinations; this is the input for the ranking query that follows.
total_delays_sql = """
    SELECT origin, destination, sum(delay) as TotalDelays
        FROM departureDelays
    WHERE origin IN ('SEA', 'SFO', 'JFK')
        AND destination IN ('SEA', 'SFO', 'JFK', 'DEN', 'ORD', 'LAX', 'ATL')
    GROUP BY origin, destination
"""
dDelayWindow = spark.sql(total_delays_sql)

# Publish the aggregate as the view `departureDelaysWindow`, then display it.
dDelayWindow.createOrReplaceTempView("departureDelaysWindow")
dDelayWindow.show()



+------+-----------+-----------+
|origin|destination|TotalDelays|
+------+-----------+-----------+
|   JFK|        ORD|       5608|
|   JFK|        SFO|      35619|
|   JFK|        DEN|       4315|
|   JFK|        ATL|      12141|
|   JFK|        SEA|       7856|
|   JFK|        LAX|      35755|
|   SEA|        LAX|       9359|
|   SFO|        ORD|      27412|
|   SFO|        DEN|      18688|
|   SFO|        SEA|      17080|
|   SEA|        SFO|      22293|
|   SFO|        ATL|       5091|
|   SEA|        DEN|      13645|
|   SEA|        ATL|       4535|
|   SEA|        ORD|      10041|
|   SFO|        JFK|      24100|
|   SFO|        LAX|      40798|
|   SEA|        JFK|       4667|
+------+-----------+-----------+



                                                                                

In [14]:
# For every origin, rank its destinations by TotalDelays (highest first) and
# keep ranks 1-3. dense_rank() gives tied totals the same rank, so an origin
# can contribute more than three rows when ties occur.
top_delays_by_origin = spark.sql("""
    SELECT origin, destination, TotalDelays, rank
        FROM (
            SELECT origin, destination, TotalDelays, dense_rank()
                OVER (PARTITION BY origin ORDER BY TotalDelays DESC) as rank
                FROM departureDelaysWindow
        ) t
    WHERE rank <= 3
""")
top_delays_by_origin.show()

+------+-----------+-----------+----+
|origin|destination|TotalDelays|rank|
+------+-----------+-----------+----+
|   JFK|        LAX|      35755|   1|
|   JFK|        SFO|      35619|   2|
|   JFK|        ATL|      12141|   3|
|   SEA|        SFO|      22293|   1|
|   SEA|        DEN|      13645|   2|
|   SEA|        ORD|      10041|   3|
|   SFO|        LAX|      40798|   1|
|   SFO|        ORD|      27412|   2|
|   SFO|        JFK|      24100|   3|
+------+-----------+-----------+----+



#### Modifications

In [15]:
# Show `foo` again as the starting point for the column changes that follow.
foo.show()

+-----------+-----+--------+------+-----------+
|       date|delay|distance|origin|destination|
+-----------+-----+--------+------+-----------+
|01-01 07:10|   31|     590|   SEA|        SFO|
|01-01 09:55|  104|     590|   SEA|        SFO|
|01-01 07:30|    5|     590|   SEA|        SFO|
+-----------+-----+--------+------+-----------+



In [16]:
# Derive a label from `delay`: values of 10 or less are 'On-time',
# anything larger is 'Delayed'.
status_expr = expr("CASE WHEN delay <= 10 THEN 'On-time' ELSE 'Delayed' END")

# Add the label as a new `status` column; `foo` itself is left untouched.
foo2 = foo.withColumn("status", status_expr)
foo2.show()

+-----------+-----+--------+------+-----------+-------+
|       date|delay|distance|origin|destination| status|
+-----------+-----+--------+------+-----------+-------+
|01-01 07:10|   31|     590|   SEA|        SFO|Delayed|
|01-01 09:55|  104|     590|   SEA|        SFO|Delayed|
|01-01 07:30|    5|     590|   SEA|        SFO|On-time|
+-----------+-----+--------+------+-----------+-------+



In [17]:
# Remove the numeric `delay` column; the derived `status` column is kept.
foo3 = foo2.drop("delay")
foo3.show()


+-----------+--------+------+-----------+-------+
|       date|distance|origin|destination| status|
+-----------+--------+------+-----------+-------+
|01-01 07:10|     590|   SEA|        SFO|Delayed|
|01-01 09:55|     590|   SEA|        SFO|Delayed|
|01-01 07:30|     590|   SEA|        SFO|On-time|
+-----------+--------+------+-----------+-------+



In [18]:
# Rename `status` to `flight_status`; the other columns are unchanged.
foo4 = foo3.withColumnRenamed("status", "flight_status")
foo4.show()

+-----------+--------+------+-----------+-------------+
|       date|distance|origin|destination|flight_status|
+-----------+--------+------+-----------+-------------+
|01-01 07:10|     590|   SEA|        SFO|      Delayed|
|01-01 09:55|     590|   SEA|        SFO|      Delayed|
|01-01 07:30|     590|   SEA|        SFO|      On-time|
+-----------+--------+------+-----------+-------------+



#### Pivoting

In [19]:
# Pivot flights departing SEA so each destination becomes one row with the
# average and maximum delay for January and February side by side.
#
# * The month is the first two characters of the 'MM-dd HH:mm' date string.
#   SQL SUBSTRING positions are 1-based, so the slice starts at position 1.
# * The average is cast to DECIMAL(6, 2): with DECIMAL(4, 2) any average of
#   100 or more does not fit and would surface as NULL (non-ANSI mode), which
#   is indistinguishable from "no flights that month".
spark.sql("""
SELECT * FROM (
    SELECT destination, CAST(SUBSTRING(date, 1, 2) AS int) AS month, delay
        FROM departureDelays WHERE origin = 'SEA'
)
PIVOT (
    CAST(AVG(delay) AS DECIMAL(6, 2)) AS AvgDelay, MAX(delay) AS MaxDelay
    FOR month IN (1 JAN, 2 FEB)
)
ORDER BY destination
""").show()

[Stage 32:>                                                         (0 + 4) / 4]

+-----------+------------+------------+------------+------------+
|destination|JAN_AvgDelay|JAN_MaxDelay|FEB_AvgDelay|FEB_MaxDelay|
+-----------+------------+------------+------------+------------+
|        ABQ|       19.86|         316|       11.42|          69|
|        ANC|        4.44|         149|        7.90|         141|
|        ATL|       11.98|         397|        7.73|         145|
|        AUS|        3.48|          50|       -0.21|          18|
|        BOS|        7.84|         110|       14.58|         152|
|        BUR|       -2.03|          56|       -1.89|          78|
|        CLE|       16.00|          27|        null|        null|
|        CLT|        2.53|          41|       12.96|         228|
|        COS|        5.32|          82|       12.18|         203|
|        CVG|       -0.50|           4|        null|        null|
|        DCA|       -1.15|          50|        0.07|          34|
|        DEN|       13.13|         425|       12.95|         625|
|        D

                                                                                