## Ejemplo M&M's Chapter 2

In [None]:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
/**
 * M&M count example, adapted from the standalone program
 * `MnMcount <mnm_file_dataset>`.
 * In this notebook version the `object`/`main` wrapper and the command-line
 * argument check are not used; the data set path is hard-coded below.
 */
// A SparkSession can create a DataFrame or register a DataFrame as a table.
val spark = SparkSession.builder.appName("MnMCount").getOrCreate()

// Location of the M&M data set (CSV file with a header row)
val mnmFile = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/mnm_dataset.csv"

// Load the CSV into a DataFrame
val mnmDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true") // let Spark derive the column types from the data
  .csv(mnmFile)

// Count the non-null "Count" entries per (State, Color), largest groups first
val countMnMDF = mnmDF
  .select("State", "Color", "Count")
  .groupBy("State", "Color")
  .agg(count("Count").as("Total"))
  .sort(col("Total").desc)

// Print up to 60 aggregated rows, followed by the number of groups
countMnMDF.show(60)
val totalRows = countMnMDF.count()
println(s"Total Rows = $totalRows")
println()

// Same aggregation, restricted to California before grouping
val caCountMnMDF = mnmDF
  .select("State", "Color", "Count")
  .filter(col("State") === "CA")
  .groupBy("State", "Color")
  .agg(count("Count").as("Total"))
  .sort(col("Total").desc)

// Print up to 10 aggregated rows for California
caCountMnMDF.show(10)

// Stop the SparkSession
spark.stop()


In [None]:
import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession
// Obtain (or reuse) a SparkSession for this example
val spark = SparkSession.builder.appName("AuthorsAges").getOrCreate()
// Sample (name, age) records
val authorAges = Seq(("Brooke", 20), ("Brooke", 25), ("Denny", 31), ("Jules", 30), ("TD", 35))
// Turn the records into a DataFrame with columns "name" and "age"
val dataDF = spark.createDataFrame(authorAges).toDF("name", "age")
// One row per name, holding the average of that name's ages
val avgDF = dataDF.groupBy("name").avg("age")
// Print the aggregated result
avgDF.show()

 ## Ejemplo de creación de schema y DataFrame

Antes de ponernos a escribir la tabla, definimos la estructura de la misma para ahorrar trabajo a Spark y facilitarnos el trabajo a la hora de corregir errores al haber definido ya los tipos de datos.

In [None]:
import org.apache.spark.sql.types._
// Schema with three non-nullable columns: author, title and pages
val schema = new StructType()
 .add("author", StringType, nullable = false)
 .add("title", StringType, nullable = false)
 .add("pages", IntegerType, nullable = false)

// The same columns can be declared more simply with a DDL (Data Definition Language) string:
// val schema = "author STRING, title STRING, pages INT"

In [None]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

 // Obtain (or reuse) the SparkSession for this example
 val spark = SparkSession
 .builder
 .appName("Example-3_7")
 .getOrCreate()
 // The standalone version took the JSON path as a command-line argument
 // (usage: Example3_7 <file path to blogs.json>); in the notebook it is hard-coded.
 val jsonFile = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/blogs.json"

 // Define our schema programmatically
 val schema = StructType(Array(StructField("Id", IntegerType, false),
 StructField("First", StringType, false),
 StructField("Last", StringType, false),
 StructField("Url", StringType, false),
 StructField("Published", StringType, false),
 StructField("Hits", IntegerType, false),
 StructField("Campaigns", ArrayType(StringType), false)))
 // Create a DataFrame by reading from the JSON file
 // with the predefined schema
 val blogsDF = spark.read.schema(schema).json(jsonFile)
 // Show the DataFrame contents without truncating long values
 blogsDF.show(false)

 // Print the schema as a tree. printSchema() writes to stdout itself and
 // returns Unit, so wrapping it in println would print a stray "()".
 blogsDF.printSchema()
 // Print the schema's StructType representation
 println(blogsDF.schema)


## Columns and Expressions

In [None]:
import org.apache.spark.sql.functions._
// Names of all columns of the DataFrame
blogsDF.columns

// Look up a single column by name; the result is a Column
blogsDF("Id")

// Double the hits, once through a SQL expression string and once through the Column API
val doubledViaExpr = expr("Hits * 2")
val doubledViaCol = col("Hits") * 2
blogsDF.select(doubledViaExpr).show(2)
blogsDF.select(doubledViaCol).show(2)

In [None]:
// Flag blogs with more than 10000 hits:
// the boolean expression below becomes a new column named "Big Hitters"
val isBigHitter = expr("Hits > 10000")
blogsDF.withColumn("Big Hitters", isBigHitter).show()

In [None]:
// Build "AuthorsId" by concatenating First, Last and Id,
// then show only that new column for the first 4 rows
val authorsId = concat(expr("First"), expr("Last"), expr("Id"))
blogsDF
 .withColumn("AuthorsId", authorsId)
 .select("AuthorsId")
 .show(4)

In [None]:
// Selecting "Hits" through expr, through col, or by plain name
// returns the same values
Seq(expr("Hits"), col("Hits")).foreach(hits => blogsDF.select(hits).show(2))
blogsDF.select("Hits").show(2)


In [None]:
// Show the blogs ordered by "Id", highest first:
// once with col(...), once with the $ interpolator
blogsDF.orderBy(col("Id").desc).show()
blogsDF.orderBy($"Id".desc).show()

// The $ before the column name is a Spark function that converts the column named Id to a Column.


## Row

A row in Spark is a generic Row object, containing one or more columns. Each column
may be of the same data type (e.g., integer or string), or they can have different
types (integer, string, map, array, etc.). Because Row is an object in Spark and an
ordered collection of fields, you can instantiate a Row in each of Spark’s supported
languages and access its fields by an index starting at 0.

In [None]:
import org.apache.spark.sql.Row
// Build a Row by listing its field values in order
val blogRow = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015",
 Array("twitter", "LinkedIn"))
// Read a single field by its 0-based position
blogRow.get(1)

Row objects can be used to create DataFrames if you need them for quick interactivity
and exploration:


In [None]:
// (author, state) pairs used as the DataFrame's rows
val rows: Seq[(String, String)] = Seq(
 ("Matei Zaharia", "CA"),
 ("Reynold Xin", "CA"))
// Name the two tuple positions "Author" and "State"
val authorsDF = rows.toDF("Author", "State")
authorsDF.show()

## Using DataFrameReader and DataFrameWriter

In [None]:
// Explicit schema for the SF fire calls CSV; every column is nullable
// (StructType.add defaults to nullable = true)
val fireSchema = new StructType()
  .add("CallNumber", IntegerType)
  .add("UnitID", StringType)
  .add("IncidentNumber", IntegerType)
  .add("CallType", StringType)
  .add("CallDate", StringType)
  .add("WatchDate", StringType)
  .add("CallFinalDisposition", StringType)
  .add("AvailableDtTm", StringType)
  .add("Address", StringType)
  .add("City", StringType)
  .add("Zipcode", IntegerType)
  .add("Battalion", StringType)
  .add("StationArea", StringType)
  .add("Box", StringType)
  .add("OriginalPriority", StringType)
  .add("Priority", StringType)
  .add("FinalPriority", IntegerType)
  .add("ALSUnit", BooleanType)
  .add("CallTypeGroup", StringType)
  .add("NumAlarms", IntegerType)
  .add("UnitType", StringType)
  .add("UnitSequenceInCallDispatch", IntegerType)
  .add("FirePreventionDistrict", StringType)
  .add("SupervisorDistrict", StringType)
  .add("Neighborhood", StringType)
  .add("Location", StringType)
  .add("RowID", StringType)
  .add("Delay", FloatType)
// Load the CSV (first line is a header) with the schema above
val sfFireFile = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/sf-fire-calls.csv"
val fireDF = spark.read
  .schema(fireSchema)
  .option("header", "true")
  .format("csv")
  .load(sfFireFile)


## Saving a DataFrame as a Parquet file or SQL table

In [None]:
// Write fireDF to the given path in Parquet format
val parquetPath = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/parquet/fire_df2.parquet"
fireDF.write.parquet(parquetPath)

In [None]:
// Save fireDF as a table stored in Parquet format
val parquetTable = "fireTable" // name of the table
val parquetWriter = fireDF.write.format("parquet")
parquetWriter.saveAsTable(parquetTable)

## Projections and filters

A projection in relational parlance is a way to return only a chosen subset of columns, while a filter returns only the rows matching a certain relational condition. In Spark, projections are done with the select() method, while filters can be expressed using the filter() or
where() method. We can use this technique to examine specific aspects of our SF Fire Department data set:

In [None]:
// Keep three columns and drop the rows whose call type is "Medical Incident"
val fewFireDF = fireDF
 .select("IncidentNumber", "AvailableDtTm", "CallType")
 .filter(!($"CallType" === "Medical Incident"))
// Print the first 5 rows without truncating values
fewFireDF.show(5, false)


What if we want to know how many distinct CallTypes were recorded as the causes
of the fire calls?

In [None]:
import org.apache.spark.sql.functions._
// Count how many distinct non-null call types appear in the data set.
// Columns are referenced with col(...)/as(String) rather than Scala symbol
// literals ('CallType), which are deprecated since Scala 2.13.
fireDF
 .select("CallType")
 .where(col("CallType").isNotNull)
 .agg(countDistinct(col("CallType")).as("DistinctCallTypes"))
 .show()

In [None]:
// List up to 10 distinct non-null call types, without truncating the values
val nonNullCallTypes = fireDF.select("CallType").filter($"CallType".isNotNull)
nonNullCallTypes.distinct().show(10, false)

## Renaming, adding, and dropping columns

You could selectively rename columns with the withColumnRenamed() method. For instance, let’s change the name of our Delay column to ResponseDelayedinMins 

In [None]:
// Rename the "Delay" column, then show 5 values of the renamed column greater than 5
val newFireDF = fireDF.withColumnRenamed("Delay", "ResponseDelayedinMins")
val slowResponses = newFireDF
 .select("ResponseDelayedinMins")
 .filter($"ResponseDelayedinMins" > 5)
slowResponses.show(5, false)

1. Convert the existing column’s data type from string to a Spark-supported timestamp.

2. Use the new format specified in the format string "MM/dd/yyyy" or "MM/dd/yyyy hh:mm:ss a" where appropriate.

3. After converting to the new data type, drop() the old column and append the new one specified in the first argument to the withColumn() method.

4. Assign the new modified DataFrame to fireTsDF.

The queries result in three new columns

In [None]:
// Patterns used to parse the string date columns
val dateOnlyFormat = "MM/dd/yyyy"
val dateTimeFormat = "MM/dd/yyyy hh:mm:ss a"
// Replace each string date column with a timestamp column under a new name
val fireTsDF = newFireDF
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), dateOnlyFormat))
 .drop("CallDate")
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), dateOnlyFormat))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), dateTimeFormat))
 .drop("AvailableDtTm")
// Show the three converted columns
val convertedColumns = fireTsDF.select("IncidentDate", "OnWatchDate", "AvailableDtTS")
convertedColumns.show(5, false)

Now that we have modified the dates, we can query using functions from
org.apache.spark.sql.functions like month(), year(), and dayofmonth():

In [None]:
// List the distinct years found in IncidentDate, in ascending order
val incidentYear = year($"IncidentDate")
fireTsDF
 .select(incidentYear)
 .distinct()
 .sort(incidentYear)
 .show()

## Aggregations

In [None]:
// Number of calls per non-null call type; show the 10 most frequent
val callsPerType = fireTsDF
 .select("CallType")
 .filter(col("CallType").isNotNull)
 .groupBy("CallType")
 .count()
callsPerType.sort(col("count").desc).show(10, false)

## Other common DataFrame operations

In [None]:
import org.apache.spark.sql.{functions => F}
// Total number of alarms plus average, minimum and maximum response delay
val delayColumn = "ResponseDelayedinMins"
fireTsDF
 .select(
  F.sum("NumAlarms"),
  F.avg(delayColumn),
  F.min(delayColumn),
  F.max(delayColumn))
 .show()

For more advanced statistical needs common with data science workloads, read the API documentation for methods like stat(), describe(), correlation(), covariance(), sampleBy(), approxQuantile(), frequentItems(), and so on.

## End-to-End DataFrame Example : Fire Calls

**1) What were all the different types of fire calls in 2018?**

In [None]:
// Distinct non-null call types among the calls whose IncidentDate falls in 2018.
// The year filter needs the timestamp column, so the query runs on fireTsDF.
fireTsDF
 .filter(year($"IncidentDate") === 2018)
 .select("CallType")
 .where(col("CallType").isNotNull)
 .distinct()
 .show(10, false)

In [None]:
// Number of distinct call types recorded in 2018.
// Null call types are excluded so that null is not counted as a type.
fireTsDF
 .filter(year($"IncidentDate") === 2018)
 .select("CallType")
 .where(col("CallType").isNotNull)
 .distinct()
 .count()

**2) What months within the year 2018 saw the highest number of fire calls?**

In [None]:
import org.apache.spark.sql.{functions => F}
// Calls per month of 2018, busiest month first
val incidentsOf2018 = fireTsDF.where(year($"IncidentDate") === 2018)
incidentsOf2018
 .groupBy(month($"IncidentDate"))
 .count()
 .sort(col("count").desc)
 .show()

**3) Which neighborhood in San Francisco generated the most fire calls in 2018?**

Este código me saca todos los vecindarios agrupados por el número de llamadas  

In [None]:
// Calls per neighborhood in 2018, busiest neighborhood first
fireTsDF
 .where(year($"IncidentDate") === 2018)
 .groupBy("Neighborhood")
 .count()
 .sort(col("count").desc)
 .show()

Para quedarme sólo con uno bastaría con:

In [None]:
// Same ranking as above, printing only the top row
val neighborhoodRanking2018 = fireTsDF
 .where(year($"IncidentDate") === 2018)
 .groupBy("Neighborhood")
 .count()
 .sort(col("count").desc)
neighborhoodRanking2018.show(1)

**Which neighborhoods had the worst response times to fire calls in 2018?**

In [None]:
// The 10 longest response delays of 2018, with their neighborhoods.
// The year filter is applied before the projection, because IncidentDate is
// not among the selected columns.
fireTsDF
 .filter(year($"IncidentDate") === 2018)
 .select("Neighborhood", "ResponseDelayedinMins")
 .orderBy(desc("ResponseDelayedinMins"))
 .show(10, false)

**Is there a correlation between neighborhood, zip code, and number of fire calls?**

In [None]:
// Calls per (Neighborhood, Zipcode) pair, most frequent pairs first.
// The column is spelled "Zipcode", as declared in the fire calls schema, so the
// query also resolves when spark.sql.caseSensitive is enabled.
fireTsDF
 .select("Neighborhood", "Zipcode")
 .groupBy("Neighborhood", "Zipcode")
 .count()
 .orderBy(desc("count"))
 .show(10, false)

## Typed Objects, Untyped Objects, and Generic Rows

In [None]:
import org.apache.spark.sql.Row
// Generic Row holding an Int, a Boolean, a String and a null field
val row = Row.fromSeq(Seq(350, true, "Learning Spark 2E", null))


In [None]:
// Read field 0 of the Row through the Int-typed getter
row.getInt(0)

In [None]:
// Read field 1 of the Row through the Boolean-typed getter
row.getBoolean(1)

In [None]:
// Read field 2 of the Row through the String-typed getter
row.getString(2)

## Creating Datasets

In [None]:
/** One IoT device reading; used as the element type of the typed Dataset. */
case class DeviceIoTData(
  battery_level: Long,
  c02_level: Long,
  cca2: String,
  cca3: String,
  cn: String,
  device_id: Long,
  device_name: String,
  humidity: Long,
  ip: String,
  latitude: Double,
  lcd: String,
  longitude: Double,
  scale: String,
  temp: Long,
  timestamp: Long
)

In [None]:
// Read the IoT device readings into a typed Dataset[DeviceIoTData].
// The original pointed at the whole Ejemplos_Spark directory, which also holds
// the CSV data sets, blogs.json and the Parquet output used by other cells;
// reading it as JSON would mix those files into the Dataset.
// NOTE(review): the file name iot_devices.json is an assumption — confirm the
// actual name of the IoT JSON file in that directory.
val iotFile = "C:/Users/sara.arribas/Downloads/Ejemplos_Spark/iot_devices.json"
val ds = spark.read
.json(iotFile)
.as[DeviceIoTData]

In [None]:
// Print the first 5 rows of the Dataset without truncating values
ds.show(5, false)

## Dataset Operations

In [None]:
// Keep the readings with temp above 30 and humidity above 70
val hotAndHumid = ds("temp") > 30 && ds("humidity") > 70
val filterTempDS = ds.where(hotAndHumid)
filterTempDS.show(5, false)

In [None]:
/** Four-field projection of a device reading: temperature, device name, device id and cca3 code. */
case class DeviceTempByCountry(
  temp: Long,
  device_name: String,
  device_id: Long,
  cca3: String
)

In [None]:
// Readings with temp above 25, projected to four columns and typed as DeviceTempByCountry
val dsTemp = ds
 .where(ds("temp") > 25)
 .select("temp", "device_name", "device_id", "cca3")
 .as[DeviceTempByCountry]
dsTemp.show(5, false)

In [None]:
// Fetch the first element of dsTemp and print it
val device = dsTemp.head()
println(device)

## End-to-End Dataset Example

**1. Detect failing devices with battery levels below a threshold.**

In [None]:
// Devices with battery_level below 8, ordered by ascending c02_level
val lowBattery = ds
 .select($"battery_level", $"c02_level", $"device_name")
 .filter($"battery_level" < 8)
lowBattery.orderBy($"c02_level").show(5, false)

**2. Identify offending countries with high levels of CO2 emissions.**

In [None]:
// Average c02_level per country over readings above 1300, highest average first
val newDS = ds
 .select("cn", "c02_level")
 .where(ds("c02_level") > 1300)
 .groupBy($"cn")
 .avg()
 .orderBy($"avg(c02_level)".desc)
newDS.show(10, false)

**3. Compute the min and max values for temperature, battery level, CO2, and humidity.**

In [None]:
// Minimum and maximum of each metric, in the order min(x), max(x) per column
val minMaxColumns = Seq("temp", "humidity", "c02_level", "battery_level")
  .flatMap(name => Seq(min(name), max(name)))
ds.select(minMaxColumns: _*).show()

**4. Sort and group by average temperature, CO2, humidity, and country**

In [None]:
// Average temp, c02_level and humidity per country, sorted by those averages
// in descending order. The original ended with .as("avg(humidity)"), which only
// gives the Dataset an alias; avg(humidity) is now the third sort key instead.
ds.select("temp", "c02_level", "humidity", "cn")
 .groupBy($"cn")
 .avg()
 .sort($"avg(temp)".desc, $"avg(c02_level)".desc, $"avg(humidity)".desc)
 .show(10, false)