## Group 8 Assignment Phase 3 : Gradient Boosted Decision Tree - Wasnik

. Do the following in HDFS: <br>
. Remove any folders/files in /tmp whose names start with flightData_ <br>
. Put the parquet dataset file into /tmp/flightData_in/ <br>
. Make sure the put was successful (it should have the same size as the local file)! <br>


In [1]:
! hadoop fs -chmod -R 777 hdfs://localhost:9000/tmp
! hadoop fs -rm -r -f     hdfs://localhost:9000/tmp/flightData_*
! hadoop fs -mkdir -p     hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -put   -p     flightDelay.parquet hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -ls           hdfs://localhost:9000/tmp/flightData_in/
! hadoop fs -du -s -h     hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet

20/06/08 10:10:45 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.


Deleted hdfs://localhost:9000/tmp/flightData_in


Found 1 items


drwxrwxr-x   - root root          0 2020-05-24 03:38 hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet




In [2]:
!hdfs getconf -confKey fs.defaultFS

hdfs://localhost:9000



### Start a Spark session

In [None]:
sc

### Load required libraries

In [3]:
//Start a simple Spark Session
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

//Feature Processing Classes
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,VectorIndexer,OneHotEncoder, PCA}

//standard scaler Classes
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.feature.StandardScaler


//Linear Algebra Data Structures
import org.apache.spark.ml.linalg.{Vector,Vectors}

//Model Building Pipeline
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}

//Binary Classification
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel,
                                           RandomForestClassifier, GBTClassifier,
                                           DecisionTreeClassifier, DecisionTreeClassificationModel}
//Model Training
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, 
                                   ParamGridBuilder, TrainValidationSplit}

//Model Evaluation
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator,MulticlassClassificationEvaluator}

//Optional: Use the following code below to set the Error reporting
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)


// GBT libraries

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}


// getOrCreate() returns the session that is already running (if any) instead of building a second one;
// the app name only takes effect when a new session has to be created.
val spark = SparkSession.builder().appName("Group 8 ML Phase 3").getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://8c79e9d02429:4041
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1591611063888)
SparkSession available as 'spark'


import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder, PCA}
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.s...

### Read in a parquet file of flight delay, fuel-price and meteorological data


Loading the parquet files and tailoring the dataframes as required

In [4]:
// Share of rows kept by the random sample below (drawn without replacement) to reduce memory requirements
val sampleFraction = 0.2

// Flight-delay data joined with fuel price and weather, reduced to a reproducible sample.
val flights = {
  val raw = spark.read.parquet("flightDelay.parquet")

  // Date_Num is a running month index: (Year - 2004) * 12 + month, so January 2004 maps to 1
  // (assumes Month_Num holds 1-12 as text — TODO confirm). Month_Num1 is only a temporary integer copy.
  val withDateIndex = raw
    .withColumn("Month_Num1", $"Month_Num".cast("Int"))
    .withColumn("Date_Num", ($"Year" - 2004) * 12 + $"Month_Num1")

  val tidied = withDateIndex
    .drop("Sectors_Flown", "Month_Num1", "Change")
    .withColumnRenamed("Departures_Delayed", "label")
    .withColumnRenamed("Price", "Fuel_Price")

  // Fixed seed keeps the sample identical between runs; rows containing nulls are dropped afterwards.
  // The result is deliberately not cached.
  tidied
    .sample(withReplacement = false, fraction = sampleFraction, seed = 222)
    .na.drop()
}

flights.printSchema()

root
 |-- Departing_Port: string (nullable = true)
 |-- Arriving_Port: string (nullable = true)
 |-- Airline: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month_Num: string (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- Departing_Port_station_ID: string (nullable = true)
 |-- Departing_Port_station_name: string (nullable = true)
 |-- Arriving_Port_station_ID: string (nullable = true)
 |-- Arriving_Port_station_name: string (nullable = true)
 |-- Mean_3pm_cloud_cover_oktas_Depart: double (nullable = true)
 |-- Mean_3pm_dew_point_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_relative_humidity_%_Depart: double (nullable = true)
 |-- Mean_3pm_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wet_bulb_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wind_speed_km/h_Depart: double (nullable = true)
 |-- Mean_9am_cloud_cover_okas_Depart: double (nullable 

sampleFraction: Double = 0.2
flights: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


### Preparing for ML

### 1. Creating the training and test sets, down-sampling the on-time departures to balance the data

Building the test set (the down-sampling step for the test data is commented out below, so all test rows are kept)

In [5]:
// Hold out the most recent 12 months as the test set.
// Date_Num 183 is March 2019 ((2019 - 2004) * 12 + 3), so every value above it is April 2019 or later.
val testing = flights.where($"Date_Num" > 183)
println(s"Test Set of the Most Recent 12 Months has ${testing.count()} records")

// Split by class only to report the class sizes
val ontimeTestFlights = testing.where($"label" === 0)
println(s"On time Test Flights: ${ontimeTestFlights.count()}")

val delayedTestFlights = testing.where($"label" === 1)
println(s"Delayed Test FLights: ${delayedTestFlights.count()}")

// No down-sampling is applied to the test data: both classes are recombined in full
val test = ontimeTestFlights.union(delayedTestFlights)

Test Set of the Most Recent 12 Months has 87165 records
On time Test Flights: 67957
Delayed Test FLights: 19208


testing: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
ontimeTestFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
delayedTestFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


Balancing the training set by down-sampling the on-time flights

In [6]:
// Training data: every row before the 12-month test window (Date_Num 183 and earlier)
val rawTraining = flights.filter($"Date_Num" < 184)

val ontimeTrainingFlights = rawTraining.filter($"label" === 0)
println(s"On time Training Flights: ${ontimeTrainingFlights.count()}")

val delayedTrainingFlights = rawTraining.filter($"label" === 1)
println(s"Delayed Training Flights: ${delayedTrainingFlights.count()}")

// Down-sample the majority (on-time) class, without replacement, so the two classes end up
// roughly the same size. The seed is fixed so the training set, and therefore the fitted model
// and its metrics, are reproducible between runs, like the seeded sample taken when `flights` is loaded.
val sampledOntimeTrainingFlights =
  ontimeTrainingFlights.sample(withReplacement = false, fraction = 0.2, seed = 222)

println(s"Down Sampled ontime Training Flights: ${sampledOntimeTrainingFlights.count()}")

// Concatenate the down-sampled on-time rows with all delayed rows
val train = sampledOntimeTrainingFlights.union(delayedTrainingFlights)

On time Training Flights: 976756
Delayed Training Flights: 194421
Down Sampled ontime Training Flights: 195532


rawTraining: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
ontimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
delayedTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
sampledOntimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


### 2. Creating Confusion Matrix

In [7]:

/**
  * Prints assessment metrics and a 2x2 confusion matrix for binary-classifier predictions.
  *
  * Two groups of metrics are printed:
  *  - area under ROC plus Spark's accuracy / weighted precision / weighted recall / F1,
  *    computed by the ML evaluators;
  *  - accuracy, precision, recall and F1 for the positive class (label = 1), computed here
  *    from the confusion-matrix counts.
  *
  * Precision, recall and F1 are reported as 0.0 (rather than NaN) when their denominator is
  * zero, e.g. when the model never predicts the positive class.
  *
  * @param predictionDF predictions containing the columns `label`, `prediction` and
  *                     `rawPrediction` (the column names the evaluators below are set up with)
  */
def getConfusionMatrix(predictionDF: DataFrame): Unit = {
    // Local import keeps the function self-contained regardless of what the notebook session imported
    import org.apache.spark.sql.functions.{count, expr, lit, when}

    println("========================Model Assessment Metrics==================================================\n")
    // Define Binary Classification Evaluator
    val binaryEval = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("rawPrediction")
    // Run Evaluation.  The area under the ROC curve ranges from 0.5 and 1.0 with larger values indicative of better fit
    println(s"Area under ROC: ${binaryEval.setMetricName("areaUnderROC").evaluate(predictionDF)}")
    // Define Multiclass Classification Evaluator
    val multiEval = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
    println(s"Accuracy: ${multiEval.setMetricName("accuracy").evaluate(predictionDF)}")
    println(s"Weighted Precision: ${multiEval.setMetricName("weightedPrecision").evaluate(predictionDF)}")
    println(s"Weighted Recall: ${multiEval.setMetricName("weightedRecall").evaluate(predictionDF)}")
    println(s"F1: ${multiEval.setMetricName("f1").evaluate(predictionDF)}")

    // Tally the four confusion-matrix cells and the row total in a single aggregation, so the
    // predictions are computed once here instead of once per cell.
    // count(when(cond, 1)) counts only rows matching cond, because non-matching rows yield null.
    def tally(condition: String, name: String) = count(when(expr(condition), 1)).as(name)
    val cells = predictionDF.agg(
        tally("label = 1 and prediction = 1", "TP"),
        tally("label = 0 and prediction = 0", "TN"),
        tally("label = 0 and prediction = 1", "FP"),
        tally("label = 1 and prediction = 0", "FN"),
        count(lit(1)).as("total")
    ).head()
    val TP = cells.getAs[Long]("TP")
    val TN = cells.getAs[Long]("TN")
    val FP = cells.getAs[Long]("FP")
    val FN = cells.getAs[Long]("FN")
    val total = cells.getAs[Long]("total").toDouble

    // A ratio with a zero denominator is undefined; report 0.0 instead of NaN
    def ratio(numerator: Double, denominator: Double): Double =
        if (denominator == 0.0) 0.0 else numerator / denominator

    // Unweighted Metrics (positive class = 1)
    val accuracy    = ratio(TP + TN, total)
    val precision   = ratio(TP, TP + FP)
    val recall      = ratio(TP, TP + FN)
    val F1 = ratio(2 * precision * recall, precision + recall)
    println(s"Accuracy: ${accuracy}")
    println(s"Precision: ${precision}")
    println(s"Recall: ${recall}")
    println(s"F1: ${F1}")

    // Confusion matrix: rows are the actual class, columns the predicted class
    printf(s"""|=================== Confusion Matrix ==========================
           |##########| %-15s                     %-15s
           |----------+----------------------------------------------------
           |Actual = 0| %-15d                     %-15d
           |Actual = 1| %-15d                     %-15d
           |===============================================================
         """.stripMargin, "Predicted = 0", "Predicted = 1", TN, FP, FN, TP)

    println("\n==================================================================================================")
}

getConfusionMatrix: (predictionDF: org.apache.spark.sql.DataFrame)Unit


### 3. Preparing feature vectors for ML pipeline

##### Pre-processing continuous features: extracting the weather columns along with Date_Num

In [8]:
// Columns kept OUT of the continuous vector: route/airline identifiers, the label,
// the raw date parts, fuel price and weather-station metadata
val array = Array(
  "Departing_Port", "Arriving_Port", "Airline", "label", "Year", "Month_Num", "Fuel_Price",
  "Departing_Port_station_ID", "Departing_Port_station_name",
  "Arriving_Port_station_ID", "Arriving_Port_station_name"
)
val columns = flights.columns

// Every remaining column is treated as a continuous feature
// (per the section heading: the weather measurements plus Date_Num)
val feature_cols = columns.filterNot(name => array.contains(name))

// Pack the continuous columns into one vector column named "weather_features"
val weather_assembler = new VectorAssembler()
  .setOutputCol("weather_features")
  .setInputCols(feature_cols)

// Preview only: apply the assembler to the full data and show the first few vectors
val weathered_DF = weather_assembler.transform(flights)
val weather_features_df = weathered_DF.select("weather_features")
weather_features_df.show(numRows = 5)

+--------------------+
|    weather_features|
+--------------------+
|[3.9,7.4,43.0,21....|
|[5.8,23.1,69.0,29...|
|[5.8,23.1,69.0,29...|
|[5.8,23.1,69.0,29...|
|[3.9,7.4,43.0,21....|
+--------------------+
only showing top 5 rows



array: Array[String] = Array(Departing_Port, Arriving_Port, Airline, label, Year, Month_Num, Fuel_Price, Departing_Port_station_ID, Departing_Port_station_name, Arriving_Port_station_ID, Arriving_Port_station_name)
columns: Array[String] = Array(Departing_Port, Arriving_Port, Airline, label, Year, Month_Num, Fuel_Price, Departing_Port_station_ID, Departing_Port_station_name, Arriving_Port_station_ID, Arriving_Port_station_name, Mean_3pm_cloud_cover_oktas_Depart, Mean_3pm_dew_point_temperature_Degrees_C_Depart, Mean_3pm_relative_humidity_%_Depart, Mean_3pm_temperature_Degrees_C_Depart, Mean_3pm_wet_bulb_temperature_Degrees_C_Depart, Mean_3pm_wind_speed_km/h_Depart, Mean_9am_cloud_cover_okas_Depart, Mean_9am_dew_point_temperature_Degrees_C_Depart, Mean_9am_relative_humidity_%_Depart, Mean...

In [9]:
feature_cols

res2: Array[String] = Array(Mean_3pm_cloud_cover_oktas_Depart, Mean_3pm_dew_point_temperature_Degrees_C_Depart, Mean_3pm_relative_humidity_%_Depart, Mean_3pm_temperature_Degrees_C_Depart, Mean_3pm_wet_bulb_temperature_Degrees_C_Depart, Mean_3pm_wind_speed_km/h_Depart, Mean_9am_cloud_cover_okas_Depart, Mean_9am_dew_point_temperature_Degrees_C_Depart, Mean_9am_relative_humidity_%_Depart, Mean_9am_temperature_Degrees_C_Depart, Mean_9am_wet_bulb_temperature_Degrees_C_Depart, Mean_9am_wind_speed_km/h_Depart, Mean_daily_evaporation_mm_Depart, Mean_daily_ground_minimum_temperature_Degrees_C_Depart, Mean_daily_solar_exposure_MJ/m*m_Depart, Mean_daily_sunshine_hours_Depart, Mean_daily_wind_run_km_Depart, Mean_maximum_temperature_Degrees_C_Depart, Mean_minimum_temperature_Degrees_C_Depart, Mean_n...

##### Standardizing the weather features with StandardScaler so that the output can be passed to PCA to reduce the dimensionality of the weather features

In [10]:
import org.apache.spark.ml.feature.StandardScaler

// Scale every weather feature by its standard deviation; values are not mean-centred here
val std_scalar = new StandardScaler()
  .setWithMean(false)
  .setWithStd(true)
  .setInputCol("weather_features")
  .setOutputCol("standard_features")

// Project the scaled weather vector onto its first 14 principal components
val pca = new PCA()
  .setK(14)
  .setInputCol("standard_features")
  .setOutputCol("continuos_features")

import org.apache.spark.ml.feature.StandardScaler
std_scalar: org.apache.spark.ml.feature.StandardScaler = stdScal_a421f36905bd
pca: org.apache.spark.ml.feature.PCA = pca_51eb3c198604


##### Pre-processing categorical features: using VectorAssembler to create a single feature vector

In [11]:
// Categorical columns: each is indexed to "<name>_Index" and then one-hot encoded to "<name>_Vec"
val categoricalVariables = Array(
    "Departing_Port", "Arriving_Port", "Airline")

// handleInvalid = "keep": the indexers are fitted on the training data only, and the test window
// is later in time, so it may contain a port or airline the training data never saw. With the
// default ("error") transform would fail on such a row; "keep" puts unseen values in one extra bucket.
val categoricalIndexers = categoricalVariables
  .map(name => new StringIndexer()
    .setInputCol(name)
    .setOutputCol(name + "_Index")
    .setHandleInvalid("keep"))
val categoricalEncoders = categoricalVariables
  .map(name => new OneHotEncoder().setInputCol(name + "_Index").setOutputCol(name + "_Vec"))


// Inputs of the second feature group: the three one-hot vectors plus the numeric Fuel_Price column
val explanatoryFields = Array("Airline_Vec", "Fuel_Price","Departing_Port_Vec","Arriving_Port_Vec")

// Combine them into a single vector column named "categorical_features"
val categorical_assembler = new VectorAssembler()
                 .setInputCols(explanatoryFields)
                 .setOutputCol("categorical_features")


categoricalVariables: Array[String] = Array(Departing_Port, Arriving_Port, Airline)
categoricalIndexers: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_23c18795f8ca, strIdx_1512bce97506, strIdx_252fd093f5b3)
categoricalEncoders: Array[org.apache.spark.ml.feature.OneHotEncoder] = Array(oneHot_1991c27cd51e, oneHot_96229569f44c, oneHot_2c6aa07560b7)
explanatoryFields: Array[String] = Array(Airline_Vec, Fuel_Price, Departing_Port_Vec, Arriving_Port_Vec)
categorical_assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_8362291e5029


##### Using VectorAssembler again to finally bind categorical and continuous features as features 

In [12]:
// Final model input: the PCA-reduced weather components followed by the categorical vector
val assembler = {
  val inputVectors = Array("continuos_features", "categorical_features")
  new VectorAssembler()
    .setOutputCol("features")
    .setInputCols(inputVectors)
}


assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_0d2732528944


### Building stages for Pipeline

In [13]:
// Feature-processing stages in execution order: continuous branch (assemble, scale, PCA),
// categorical branch (index, one-hot encode, assemble), then the final merge into "features"

val featureProcessingStages: Array[PipelineStage] = {
  val continuousStages: Array[PipelineStage] = Array(weather_assembler, std_scalar, pca)
  val assemblerStages = Array(categorical_assembler, assembler)
  continuousStages ++ categoricalIndexers ++ categoricalEncoders ++ assemblerStages
}

featureProcessingStages: Array[org.apache.spark.ml.PipelineStage] = Array(vecAssembler_2213e5398914, stdScal_a421f36905bd, pca_51eb3c198604, strIdx_23c18795f8ca, strIdx_1512bce97506, strIdx_252fd093f5b3, oneHot_1991c27cd51e, oneHot_96229569f44c, oneHot_2c6aa07560b7, vecAssembler_8362291e5029, vecAssembler_0d2732528944)


### 4.  Estimator Components

In [14]:
// Gradient-boosted tree classifier reading the assembled "features" vector and the "label" column
val gbt = new GBTClassifier()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setFeatureSubsetStrategy("auto")
  .setMaxBins(10)
  .setMaxIter(10)

// Full pipeline: all feature-processing stages followed by the classifier as the last stage
val pipeline = new Pipeline()
  .setStages(featureProcessingStages :+ gbt)



gbt: org.apache.spark.ml.classification.GBTClassifier = gbtc_4699d751885c
pipeline: org.apache.spark.ml.Pipeline = pipeline_139cc9016693


### 5. Fitting the model to the data

In [17]:
// Fit the whole pipeline (feature-processing stages, then the GBT classifier) on the training set
val model = pipeline.fit(train)

model: org.apache.spark.ml.PipelineModel = pipeline_139cc9016693


In [18]:
// Apply the fitted pipeline to the held-out test set to obtain its predictions
val prediction = model.transform(test)

prediction: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 86 more fields]


### 6. Model Assessment

In [19]:
// Print the assessment metrics and the confusion matrix for the test-set predictions

getConfusionMatrix(prediction)


Area under ROC: 0.5942017544573059
Accuracy: 0.4623874261458154
Weighted Precision: 0.7062940886620209
Weighted Recall: 0.4623874261458154
F1: 0.49474766315920726
Accuracy: 0.4623874261458154
Precision: 0.2515944736889384
Recall: 0.7290712203248646
F1: 0.3740934165008214
##########| Predicted = 0                       Predicted = 1  
----------+----------------------------------------------------
Actual = 0| 26300                               41657          
Actual = 1| 5204                                14004          
         
