# Test with Scala

## Application

In [None]:
// NOTE: this cell does not compile; the recorded output below is
// "identifier expected but string literal found".
// A function literal cannot carry a result-type annotation after its parameter list.
// To give the function an explicit type, annotate the val instead, e.g.
//   val test: String => String = x => "truc"
val test = (x : String): String => "truc"
test

<console>: 2: error: identifier expected but string literal found.

In [None]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions.Window


// Functions

/** Counts the number of crashes for each airplane type (Zeppelin, dirigible...).
  *
  * @param df crash records; must contain a "Type" column
  * @return one row per distinct "Type" value with its row count in "Total Crashes"
  */
def crashCountPerAirplaneType(df : DataFrame) : DataFrame = {
  val crashesByType = df.groupBy("Type")
  val totalCrashes = count("*").alias("Total Crashes")
  crashesByType.agg(totalCrashes)
}

/** Computes the mean of the "Aboard" column over the whole DataFrame.
  *
  * @param df crash records; must contain an "Aboard" column
  * @return a single-row DataFrame with one column, "Average Passenger Count"
  */
def averagePassengerCount(df : DataFrame): DataFrame = {
  val meanAboard = avg(col("Aboard")).alias("Average Passenger Count")
  // A global aggregation is a group-by over zero grouping columns.
  df.groupBy().agg(meanAboard)
}

/** Ranks the words used in the "Summary" column by how often they occur.
  *
  * Punctuation characters `,.;?!$` are stripped, each summary is split on single
  * spaces, and words of two characters or fewer are discarded.
  *
  * Implemented with Column functions only, so it needs no implicit Encoder in scope
  * and avoids the deprecated `Dataset.explode`. Rows whose "Summary" is null are
  * skipped instead of raising a NullPointerException.
  *
  * @param df crash records; must contain a string "Summary" column
  * @return a DataFrame with columns "word" and "count", most frequent word first
  */
def mostUsedDescriptionWords(df : DataFrame) : DataFrame = {
  // Inside a character class, '.', '?' and '$' are literal, so no escaping is needed.
  val withoutPunctuation = regexp_replace(col("Summary"), "[,.;?!$]", "")

  // One output row per word; null summaries are dropped up front.
  val wordsDF = df
    .where(col("Summary").isNotNull)
    .select(explode(split(withoutPunctuation, " ")).as("word"))

  // Count occurrences of each word; groupBy(...).count() already names its column "count".
  val wordCountDF = wordsDF.groupBy("word").count()

  // The length filter also discards the empty strings produced by consecutive spaces.
  wordCountDF
    .filter(length(col("word")) > 2)
    .orderBy(desc("count"))
}

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.9:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1607983407004)
SparkSession available as 'spark'


import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions.Window
crashCountPerAirplaneType: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
averagePassengerCount: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
mostUsedDescriptionWords: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


## Test

In [None]:

// Tests
// Arrange / Act / Assert check of the three aggregation functions against a small
// fixture CSV. Every assertion carries a message so a failure identifies the broken
// expectation without having to decode the REPL stack trace.

// Arrange
//val fixtureDataset = spark.read.option("header","true").csv("s3://data-engineering-v2/M03-ETL/D03-Unit-Testing/Airplane_Crashes_and_Fatalities_Since_1908_fixture.csv")
// All columns are read as strings: only the header option is set, no schema inference.
val fixtureDataset = spark.read.option("header","true")
    .csv("/media/data-nvme/dev/src/Jedha/LEAD_PROGRAM/M03-ETL/D03-Unit_testing/01-Exercises/src/Airplane_Crashes_and_Fatalities_Since_1908_fixture.csv")

// Act
val result = crashCountPerAirplaneType(fixtureDataset)
val result2 = averagePassengerCount(fixtureDataset)
val result3 = mostUsedDescriptionWords(fixtureDataset)

// Assert
assert(result.count == 2, "crashCountPerAirplaneType: expected 2 distinct airplane types")
assert(
  result.filter(col("Type") === "Dirigible").select(col("Total Crashes")).first().getLong(0) == 1,
  "crashCountPerAirplaneType: expected exactly 1 crash for type 'Dirigible'")
assert(result2.count == 1, "averagePassengerCount: expected a single result row")
// Compare the double with a tolerance rather than exact equality.
assert(
  math.abs(result2.select(col("Average Passenger Count")).first().getDouble(0) - 3.5) < 1e-9,
  "averagePassengerCount: expected an average of 3.5 passengers aboard")
assert(
  result3.select(col("word")).first().getString(0) == "flight",
  "mostUsedDescriptionWords: expected 'flight' to be the most frequent word")

println("Success !")

Success !


fixtureDataset: org.apache.spark.sql.DataFrame = [Date: string, Time: string ... 11 more fields]
result: org.apache.spark.sql.DataFrame = [Type: string, Total Crashes: bigint]
result2: org.apache.spark.sql.DataFrame = [Average Passenger Count: double]
result3: org.apache.spark.sql.DataFrame = [word: string, count: bigint]
