#### Transforming Data Using Apache Spark

In [None]:
%scala
// RDD Transformation examples

// Sample input: a local list of city names. It intentionally contains repeated
// names ("San Jose", "Miami", "Austin") and one whitespace-only entry (" ").
val cities = Seq(
  "New York", "New Jersey", "San Francisco", "Phoenix",
  "Seattle", "Austin", "Atlanta", "Miami",
  "Salt Lake City", "Tempe", "San Jose", "Chicago",
  "San Jose", "Miami", " ", "Austin"
)

// Create the RDD by distributing the local collection through the SparkContext.
val rdd = spark.sparkContext.parallelize(cities)

// Map transformation: turn every city into a (city, 1) pair.
val maprdd = rdd.map(city => city -> 1)
maprdd.collect().foreach(println)

// FlatMap transformation: split each name on single spaces and flatten the
// resulting arrays into one RDD of individual words.
val fmrdd = rdd.flatMap(_.split(" "))
fmrdd.collect().foreach(println)

// Filter transformation: keep only the entries that contain a space.
val filterrdd = rdd.filter(_.contains(" "))
filterrdd.collect().foreach(println)

// Filtering out empty entries.
// The sample data's only "empty" city is " " (a single space). A plain
// nonEmpty check keeps it, so trim first: this drops both truly empty and
// whitespace-only entries.
val emptystrfilterrdd = rdd.filter(_.trim.nonEmpty)
emptystrfilterrdd.collect().foreach(println)

// groupBy transformation: bucket the city names by their first character.
// Every entry in `cities` has at least one character, so charAt(0) is safe here.
val groupbyrdd = rdd.groupBy(_.charAt(0))
groupbyrdd.collect().foreach(println)

// Union transformation: concatenate two small integer RDDs.
val rdd1 = spark.sparkContext.parallelize(List(1, 2, 3))
val rdd2 = spark.sparkContext.parallelize(List(4, 5, 6))

// `++` is the operator form of RDD.union.
val unionrdd = rdd1 ++ rdd2
unionrdd.collect().foreach(println)

// Distinct transformation: collapse repeated city names to a single occurrence.
val distrinctrdd = rdd.distinct()
distrinctrdd.collect().foreach(println)


In [None]:
%scala
// Dataframe Transformation examples

import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}

// Sample driver records, one Row per driver, in the column order
// (firstName, middleName, lastName, id, location, gender, salary).
// The last row repeats the Daryl Jones record, so the data holds one exact duplicate.
val driverDetails = Seq(
  Row("Alice", "", "Hood", "100", "New York", "Female", 4100),
  Row("Bryan", "M", "Williams", "101", "New York", "Male", 4000),
  Row("Catherine", "Goodwin", "", "102", "California", "Female", 4300),
  Row("Daryl", "", "Jones", "103", "Florida", "Male", 5500),
  Row("Jenny", "Anne", "Simons", "104", "Arizona", "Female", 3400),
  Row("Daryl", "", "Jones", "103", "Florida", "Male", 5500)
)

// Schema for the rows above: every column is a string except the integer salary.
val driverSchema = StructType(Seq(
  StructField("firstName", StringType),
  StructField("middleName", StringType),
  StructField("lastName", StringType),
  StructField("id", StringType),
  StructField("location", StringType),
  StructField("gender", StringType),
  StructField("salary", IntegerType)
))

// Build the DataFrame from an RDD[Row] plus the explicit schema, then print
// its schema and its (untruncated) contents.
val driverDf = spark.createDataFrame(spark.sparkContext.parallelize(driverDetails), driverSchema)
driverDf.printSchema()
driverDf.show(truncate = false)

// select transformation: project only the two name columns.
// The names are spelled exactly as declared in driverSchema ("firstName",
// "lastName") so the lookup does not depend on Spark's case-insensitive
// column resolution (spark.sql.caseSensitive = false).
driverDf.select("firstName", "lastName").show()

// filter transformation: keep only the drivers whose location is Florida.
// The column is referenced through driverDf("location") instead of the symbol
// literal 'location. The symbol form needs the implicit Symbol-to-Column
// conversion from spark.implicits._ (never imported explicitly in this cell)
// and is deprecated syntax since Scala 2.13.
driverDf.filter(driverDf("location") === "Florida").show(false)

// distinct transformation: drop rows that are exact duplicates of another row.
driverDf.distinct().show(truncate = false)

// sort: order by last name, then by first name.
// Column names match the casing declared in driverSchema so the lookup also
// works when spark.sql.caseSensitive is enabled.
driverDf.sort("lastName", "firstName").show(truncate = false)

// orderBy: order the rows by location.
driverDf.orderBy("location").show(truncate = false)

// groupBy transformation: average salary per location.
driverDf.groupBy("location").avg("salary").show(truncate = false)

// Join
// The join needs a second DataFrame, so build driverRating: one Row per
// (driver id, rating). Driver "103" appears twice, with ratings 5 and 4.
val driverRating = Seq(
  Row("100", 5),
  Row("101", 4),
  Row("102", 3),
  Row("103", 5),
  Row("104", 2),
  Row("103", 4)
)

// Schema for the rating rows: a string id and an integer rating.
val ratingSchema = StructType(Seq(
  StructField("id", StringType),
  StructField("rating", IntegerType)
))

// Build the ratings DataFrame, then print its schema and its contents.
val ratingDf = spark.createDataFrame(spark.sparkContext.parallelize(driverRating), ratingSchema)
ratingDf.printSchema()
ratingDf.show(truncate = false)

// Inner join of the drivers with their ratings on the shared "id" column.
// Joining by column name (Seq("id")) keeps a single "id" column in the result.
// An equality expression such as driverDf("id") === ratingDf("id") would carry
// both id columns into the output, making any later reference to "id" ambiguous.
driverDf.join(ratingDf, Seq("id"), "inner").show(false)
