# Spark with Scala


Here is a list of warm-up exercises.

------------------

### Getting the Spark session & the Spark context

In [None]:
import org.apache.spark.sql.SparkSession

// Entry point to the DataFrame / SQL API. getOrCreate() hands back the session that is
// already running in this kernel if there is one, otherwise it builds a new one.
val spark: SparkSession =
  SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

// Lower-level context owned by the session above.
val sc = spark.sparkContext

import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1c5551a
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6492b417


### Loading the data

In [None]:
// TODO: load the file located in the project /data/Airplane_Crashes_and_Fatalities_Since_1908.csv and show the dataset
// The first line of the CSV is used as column names (header = true).
val airplaneCrashed = spark.read
  .option("header", "true")
  .csv("./data/Airplane_Crashes_and_Fatalities_Since_1908.csv")

// Print the first rows as a text table.
airplaneCrashed.show()

+----------+-----+--------------------+--------------------+--------+-------------+--------------------+------------+-----+------+----------+------+--------------------+
|      Date| Time|            Location|            Operator|Flight #|        Route|                Type|Registration|cn/In|Aboard|Fatalities|Ground|             Summary|
+----------+-----+--------------------+--------------------+--------+-------------+--------------------+------------+-----+------+----------+------+--------------------+
|09/17/1908|17:18| Fort Myer, Virginia|Military - U.S. Army|    null|Demonstration|    Wright Flyer III|        null|    1|     2|         1|     0|During a demonstr...|
|07/12/1912|06:30|AtlantiCity, New ...|Military - U.S. Navy|    null|  Test flight|           Dirigible|        null| null|     5|         5|     0|First U.S. dirigi...|
|08/06/1913| null|Victoria, British...|             Private|       -|         null|    Curtiss seaplane|        null| null|     1|         1|     0|Th

airplaneCrashed: org.apache.spark.sql.DataFrame = [Date: string, Time: string ... 11 more fields]


### Basic statistics

In [None]:
// TODO: count the number of airplanes that crashed
// One row of the dataset per crash, so the row count is the answer.
airplaneCrashed.count()

res8: Long = 5268


In [None]:
// TODO: count the number of crashes, and the number of Fatalities by operator
// Group by "Operator" (the task asks per operator, not per aircraft "Type").
// - "Total Crashes": every row of a group is one crash, so count rows (count(lit(1)))
//   rather than only the rows whose "Fatalities" value is non-null.
// - "Fatalities": total number of fatalities, i.e. a sum and not an average.
//   The CSV is loaded without schema inference, so the column is cast before summing.
// NOTE: re-run this cell; output recorded from an earlier version that grouped by "Type" is stale.
airplaneCrashed
  .groupBy(col("Operator"))
  .agg(
    count(lit(1)).as("Total Crashes"),
    sum(col("Fatalities").cast("long")).as("Fatalities")
  )
  .show()

+--------------------+-------------+------------------+
|                Type|Total Crashes|        Fatalities|
+--------------------+-------------+------------------+
|      Dornier Merkur|            3|3.6666666666666665|
|  De Havilland DH.80|            1|               3.0|
|        Junkers W-33|            1|               2.0|
|        Douglas C-47|           61|17.147540983606557|
|   Consolidated B-24|            2|              17.0|
|Lockheed L-649 Co...|            1|               1.0|
|Lockheed L-749A-7...|            1|               1.0|
|Embraer 110C Band...|            9| 9.777777777777779|
|      Boeing 727-286|            1|               7.0|
|Vickers Viscount ...|            1|              27.0|
|Fokker Universal ...|            1|               2.0|
|Ford 5-AT-C Tri-M...|            2|               4.0|
|    de Havilland 110|            1|               1.0|
|Bristol 170 Freig...|            4|             15.75|
|     Boeing B-727-64|            3|            

In [None]:
// TODO: filter the dataset to only have military planes
// Keep the rows whose "Operator" value contains the text "Military"
// (e.g. "Military - U.S. Army"), then display them: filter alone only
// builds a new DataFrame and prints no rows.
airplaneCrashed
  .filter(col("Operator").contains("Military"))
  .show()

+----------+-----+--------------------+--------------------+--------+-------------+--------------------+------------+-----+------+----------+------+--------------------+
|      Date| Time|            Location|            Operator|Flight #|        Route|                Type|Registration|cn/In|Aboard|Fatalities|Ground|             Summary|
+----------+-----+--------------------+--------------------+--------+-------------+--------------------+------------+-----+------+----------+------+--------------------+
|09/17/1908|17:18| Fort Myer, Virginia|Military - U.S. Army|    null|Demonstration|    Wright Flyer III|        null|    1|     2|         1|     0|During a demonstr...|
|07/12/1912|06:30|AtlantiCity, New ...|Military - U.S. Navy|    null|  Test flight|           Dirigible|        null| null|     5|         5|     0|First U.S. dirigi...|
|09/09/1913|18:30|  Over the North Sea|Military - German...|    null|         null|Zeppelin L-1 (air...|        null| null|    20|        14|     0|Th

In [None]:
// Convert the date and the time into a timestamp object
// Hint: to_timestamp turns a string into a timestamp according to a pattern, e.g.
//   import org.apache.spark.sql.functions.to_timestamp
//   to_timestamp(someTimeString, "dd-MM-yyyy HH:mm")
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter.ofPattern
import org.apache.spark.sql.functions.to_timestamp

// "Date" and "Time" are glued together with a single space so that the combined
// text can be parsed with the "MM/dd/yyyy H:mm" pattern into the new "Datetime" column.
val airplaneCrashedWithDateTime = airplaneCrashed.withColumn(
  "Datetime",
  to_timestamp(concat(col("Date"), lit(" "), col("Time")), "MM/dd/yyyy H:mm")
)
airplaneCrashedWithDateTime.show()

+----------+-----+--------------------+--------------------+--------+-------------+--------------------+------------+-----+------+----------+------+--------------------+-------------------+
|      Date| Time|            Location|            Operator|Flight #|        Route|                Type|Registration|cn/In|Aboard|Fatalities|Ground|             Summary|           Datetime|
+----------+-----+--------------------+--------------------+--------+-------------+--------------------+------------+-----+------+----------+------+--------------------+-------------------+
|09/17/1908|17:18| Fort Myer, Virginia|Military - U.S. Army|    null|Demonstration|    Wright Flyer III|        null|    1|     2|         1|     0|During a demonstr...|1908-09-17 17:18:00|
|07/12/1912|06:30|AtlantiCity, New ...|Military - U.S. Navy|    null|  Test flight|           Dirigible|        null| null|     5|         5|     0|First U.S. dirigi...|1912-07-12 06:30:00|
|08/06/1913| null|Victoria, British...|           

import java.time.LocalDateTime
import java.time.format.DateTimeFormatter.ofPattern
import org.apache.spark.sql.functions.to_timestamp
airplaneCrashedWithDateTime: org.apache.spark.sql.DataFrame = [Date: string, Time: string ... 12 more fields]


In [None]:
// Find the hour of the day with the most crashes
// Tips
// - to get the hour out of a date column: hour(col("Datetime"))
// - you can drop rows containing NA: df.na.drop()
airplaneCrashedWithDateTime
  .withColumn("Hour of Day", hour(col("Datetime")))
  .groupBy("Hour of Day")
  .count()
  // After the aggregation only the group whose hour is null can contain a null value.
  .na.drop()
  // Busiest hour first.
  .orderBy(desc("count"))
  .show()

+-----------+-----+
|Hour of Day|count|
+-----------+-----+
|          9|  183|
|         19|  181|
|         15|  171|
|         11|  171|
|         10|  167|
|         14|  166|
|         12|  165|
|         13|  157|
|         16|  156|
|         17|  155|
|         20|  154|
|         18|  146|
|          8|  141|
|          7|  132|
|         23|  111|
|         21|  108|
|         22|  105|
|          6|   85|
|          2|   79|
|          1|   66|
+-----------+-----+
only showing top 20 rows

