## Group 8 Assignment Phase 3 : Multilayer Perceptron - Peter

* Do the followings in HDFS:

* Remove any folder/files in /tmp that starts with flightData_,

* Create folder /tmp/flightData_in/,

* Put the parquet dataset file into /tmp/flightData_in/,

* Make sure put was successfull (it should have the same size as the local file)!

In [13]:
! hadoop fs -chmod -R 777 hdfs://localhost:9000/tmp
! hadoop fs -rm    -r  hdfs://localhost:9000/tmp/flightData_*
! hadoop fs -mkdir -p  hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -put   -p  flightDelay.parquet hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -ls        hdfs://localhost:9000/tmp/flightData_in/

20/06/08 10:23:35 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.


Deleted hdfs://localhost:9000/tmp/flightData_in


Found 1 items


drwxrwxr-x   - root root          0 2020-05-24 03:38 hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet




In [14]:
!hdfs getconf -confKey fs.defaultFS

hdfs://localhost:9000



## Load Requisite Libraries and Start a Spark Session

In [15]:
//Start a simple Spark Session
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

//Feature Processing Classes
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,VectorIndexer,OneHotEncoder, PCA}

//Linear Algebra Data Structures
import org.apache.spark.ml.linalg.{Vector,Vectors}

//Model Building Pipeline
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}

//Binary Classification
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel,
                                           RandomForestClassifier, GBTClassifier,
                                           DecisionTreeClassifier, DecisionTreeClassificationModel}
//Model Training
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, 
                                   ParamGridBuilder, TrainValidationSplit}

//Model Evaluation
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator,MulticlassClassificationEvaluator}

//Neural Network
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel

//Optional: Use the following code below to set the Error reporting
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

val spark = SparkSession.builder().appName("Group 8 ML Phase 3").getOrCreate()

import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder, PCA}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Multic...

In [16]:
def getConfusionMatrix(predictionDF: DataFrame): Unit = {
    
    println("========================Model Assessment Metrics==================================================\n")
    // Define Binary Classification Evaluator
    val binaryEval = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("rawPrediction")
    // Run Evaluation.  The area under the ROC curve ranges from 0.5 and 1.0 with larger values indicative of better fit
    println(s"Area under ROC: ${binaryEval.setMetricName("areaUnderROC").evaluate(predictionDF)}")
    // Define Multiclass Classification Evaluator
    val multiEval = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
    println(s"Accuracy: ${multiEval.setMetricName("accuracy").evaluate(predictionDF)}")
    println(s"Weighted Precision: ${multiEval.setMetricName("weightedPrecision").evaluate(predictionDF)}")
    println(s"Weighted Recall: ${multiEval.setMetricName("weightedRecall").evaluate(predictionDF)}")
    println(s"F1: ${multiEval.setMetricName("f1").evaluate(predictionDF)}")

    val TP = predictionDF.select("label", "prediction").filter("label = 1 and prediction = 1").count
    val TN = predictionDF.select("label", "prediction").filter("label = 0 and prediction = 0").count
    val FP = predictionDF.select("label", "prediction").filter("label = 0 and prediction = 1").count
    val FN = predictionDF.select("label", "prediction").filter("label = 1 and prediction = 0").count
    val total = predictionDF.select("label").count.toDouble
    // Unweighted Metrics
    val accuracy    = (TP + TN) / total
    val precision   = TP / (TP+FP).toDouble
    val recall      = TP / (TP+FN).toDouble
    val F1 = 2*precision*recall/(precision+recall)
    println(s"Accuracy: ${accuracy}")
    println(s"Precision: ${precision}")
    println(s"Recall: ${recall}")
    println(s"F1: ${F1}")

     // Confusion matrix
    printf(s"""|=================== Confusion Matrix ==========================
           |##########| %-15s                     %-15s
           |----------+----------------------------------------------------
           |Actual = 0| %-15d                     %-15d
           |Actual = 1| %-15d                     %-15d
           |===============================================================
         """.stripMargin, "Predicted = 0", "Predicted = 1", TN, FP, FN, TP)

    println("\n==================================================================================================")
}


getConfusionMatrix: (predictionDF: org.apache.spark.sql.DataFrame)Unit


## Read in a parquet file of flight delay, fuel-price and meteorological data

In [17]:
val flights = (spark
            .read.parquet("hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet")
            .withColumn("Month_Num1", $"Month_Num" cast "Int")
            //convert month and year to integer index starting Jan 2004
            .withColumn("Date_Num",  ($"Year"-2004)*12 + $"Month_Num1")
            .drop("Sectors_Flown", "Month_Num1", "Change")
            .withColumnRenamed("Departures_Delayed","label")
            .withColumnRenamed("Price","Fuel_Price")
            //drop NA's even though none were found!
            .na.drop()
              )

flights.printSchema()

root
 |-- Departing_Port: string (nullable = true)
 |-- Arriving_Port: string (nullable = true)
 |-- Airline: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month_Num: string (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- Departing_Port_station_ID: string (nullable = true)
 |-- Departing_Port_station_name: string (nullable = true)
 |-- Arriving_Port_station_ID: string (nullable = true)
 |-- Arriving_Port_station_name: string (nullable = true)
 |-- Mean_3pm_cloud_cover_oktas_Depart: double (nullable = true)
 |-- Mean_3pm_dew_point_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_relative_humidity_%_Depart: double (nullable = true)
 |-- Mean_3pm_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wet_bulb_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wind_speed_km/h_Depart: double (nullable = true)
 |-- Mean_9am_cloud_cover_okas_Depart: double (nullable 

flights: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


## Take a look at the proportion of lates in the dataset

In [18]:
val counts = flights.groupBy("label").count()

println("proportion of lates (label=1) in the sample")
counts.show()

proportion of lates (label=1) in the sample
+-----+-------+
|label|  count|
+-----+-------+
|    1|1072071|
|    0|5224826|
+-----+-------+



counts: org.apache.spark.sql.DataFrame = [label: int, count: bigint]


## Split The Data into training and testing dataframes

In [19]:
//Filter out the most recent 12 months of flight data as the test dataset
//Dates after March 2019 have Date_Num > 183
val rawTesting = flights.filter($"Date_Num"> 183).cache()
println(s"Test Set of the Most Recent 12 Months has ${rawTesting.count()} records")

val sample = 0.1
val testing = rawTesting.sample(false, sample)  
println(s"Sampled training set count: ${testing.count()}")

//Filter out rows prior to the most recent 12 months of flight data as the training dataset
val rawTraining = flights.filter($"Date_Num" < 184)



Test Set of the Most Recent 12 Months has 435479 records
Sampled training set count: 43802


rawTesting: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
sample: Double = 0.1
testing: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
rawTraining: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


## Down sample the Ontime Departures To Balance The Training data


In [20]:
val ontimeTrainingFlights = rawTraining.filter($"label"===0)
println(s"On time Training Flights: ${ontimeTrainingFlights.count()}")

val delayedTrainingFlights = rawTraining.filter($"label"===1)
println(s"Delayed Training Flights: ${delayedTrainingFlights.count()}")

//ontime:delayed approx 5:1 so take a random sample of size fifth of the ontime departures
val downSampleFraction = 0.2
val sampledOntimeTrainingFlights = ontimeTrainingFlights.sample(false, downSampleFraction)  

println(s"Down Sampled ontime Training Flights: ${sampledOntimeTrainingFlights.count()}")

//down sample resulting training set for the purposes of local testing
val localTestingSampleFraction = 0.1
//Concatenate rows of ontimeTrainingFlights and delayedTrainingFlights
val training = (sampledOntimeTrainingFlights
                .union(delayedTrainingFlights)
                .sample(false, localTestingSampleFraction)
                .cache())
               
val resampledCounts = training.groupBy("label").count()
println("proportion of lates (label=1) in the sample")
resampledCounts.show()

On time Training Flights: 4884963
Delayed Training Flights: 976455
Down Sampled ontime Training Flights: 977965
proportion of lates (label=1) in the sample
+-----+-----+
|label|count|
+-----+-----+
|    1|97614|
|    0|98162|
+-----+-----+



ontimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
delayedTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
downSampleFraction: Double = 0.2
sampledOntimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
localTestingSampleFraction: Double = 0.1
training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
resampledCounts: org.apache.spark.sql.DataFrame = [label: int, count: bigint]


## Set up Flight Data Feature Processing Pipleline Stages 

In [21]:
//////////////////////////////////////////////////
//// Setting Up DataFrame for Machine Learning ///
//////////////////////////////////////////////////

// Deal with Categorical Columns
val categoricalVariables = Array(
    "Departing_Port", "Arriving_Port", "Airline")
val categoricalIndexers = categoricalVariables
  .map(i => new StringIndexer().setInputCol(i).setOutputCol(i+"_Index"))
val categoricalEncoders = categoricalVariables
  .map(e => new OneHotEncoder().setInputCol(e + "_Index").setOutputCol(e + "_Vec"))


// columns that need to be added to the features vector
val cols = Array("Date_Num",  "Airline_Vec", "Fuel_Price",
    "Departing_Port_Vec", 
        "Mean_daily_wind_run_km_Depart", "Mean_rainfall_mm_Depart",
        "Mean_number_of_days_of_rain_Depart","Mean_number_of_days_>_40_Degrees_C_Depart",
    "Arriving_Port_Vec")

// Assemble everything together to be ("label","features") format
val assembler = (new VectorAssembler()
                 .setInputCols(cols)
                 .setOutputCol("indexedFeatures") )

// principal component analysis - set equal to number of features in final version
val pca = new PCA().setInputCol("indexedFeatures").setOutputCol("features").setK(9)

// specify layers for the neural network:
// input layer of size 9 (features), two intermediate of size equal to input layer
// and output of size 2 (labels)
val layers = Array[Int](9,9,9,2)

val mlpc = new MultilayerPerceptronClassifier()
  .setLayers(layers)
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setBlockSize(128)
  .setSeed(12345)
  .setMaxIter(100)

//////////////////////////////////////////////
//   Define and construct the ML Pipeline  ///
//////////////////////////////////////////////

val stages: Array[PipelineStage] = categoricalIndexers ++ categoricalEncoders ++ Array(assembler,pca, mlpc)

// build the pipeline
val pipeline = new Pipeline().setStages(stages)


categoricalVariables: Array[String] = Array(Departing_Port, Arriving_Port, Airline)
categoricalIndexers: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_09ea1d8efff1, strIdx_5585acef6b38, strIdx_f6a89eb787b4)
categoricalEncoders: Array[org.apache.spark.ml.feature.OneHotEncoder] = Array(oneHot_817becda355c, oneHot_820441c94175, oneHot_8c0942701269)
cols: Array[String] = Array(Date_Num, Airline_Vec, Fuel_Price, Departing_Port_Vec, Mean_daily_wind_run_km_Depart, Mean_rainfall_mm_Depart, Mean_number_of_days_of_rain_Depart, Mean_number_of_days_>_40_Degrees_C_Depart, Arriving_Port_Vec)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_28f30fd8fd0d
pca: org.apache.spark.ml.feature.PCA = pca_23f095542e7f
layers: Array[Int] = Array(9, 9, 9, 2)
mlpc: org.apache...

## Train neural network model

In [22]:
val mlpc_model = pipeline.fit(training)

mlpc_model: org.apache.spark.ml.PipelineModel = pipeline_946f3b0b9db0


## Test neural network model

In [23]:
// Make predictions.
val mlpc_predictions = mlpc_model.transform(testing)



mlpc_predictions: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 83 more fields]


## Evaluate model with confusion matrix

In [24]:
getConfusionMatrix(mlpc_predictions)


Area under ROC: 0.5156893802653647
Accuracy: 0.7708095520752477
Weighted Precision: 0.6713791956691585
Weighted Recall: 0.7708095520752477
F1: 0.6892269623372006
Accuracy: 0.7708095520752477
Precision: 0.2813852813852814
Recall: 0.026984950700570835
F1: 0.04924708779240458
##########| Predicted = 0                       Predicted = 1  
----------+----------------------------------------------------
Actual = 0| 33503                               664            
Actual = 1| 9375                                260            
         


## Store model on hdfs

In [25]:
! hadoop fs -rm    -r  hdfs://localhost:9000/tmp/flightDelayModel_*
! hadoop fs -mkdir -p  hdfs://localhost:9000/tmp/flightDelayModel__out

20/06/08 10:27:58 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.


Deleted hdfs://localhost:9000/tmp/flightDelayModel__out




In [26]:
//Persist the Model to the hdfs
mlpc_model
    .write
    .overwrite()
    .save("hdfs://localhost:9000/tmp/flightDelayModel__out")