## Group 8 Assignment Phase 3 Logistic Regression with PCA for Weather and CV (Tod)

* Do the followings in HDFS:

* Remove any folder/files in /tmp that starts with flightData_,

* Create folder /tmp/flightData_in/,

* Put the parquet dataset file into /tmp/flightData_in/,

* Make sure put was successfull (it should have the same size as the local file)!

In [1]:
! hadoop fs -chmod -R 777 hdfs://localhost:9000/tmp
! hadoop fs -rm    -r  hdfs://localhost:9000/tmp/flightData_*
! hadoop fs -mkdir -p  hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -put   -p  flightDelay.parquet hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -ls        hdfs://localhost:9000/tmp/flightData_in/

rm: `hdfs://localhost:9000/tmp/flightData_*': No such file or directory


Found 1 items


drwxrwxr-x   - root root          0 2020-05-24 03:38 hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet




In [2]:
!hdfs getconf -confKey fs.defaultFS

hdfs://localhost:9000



## Load Requisite Libraries and Start a Spark Session

In [3]:
//Start a simple Spark Session
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

import org.apache.spark.ml.attribute._

//Feature pre-Processing Classes
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,StandardScaler,
                                    VectorIndexer,OneHotEncoder, PCA, Normalizer}


//Linear Algebra Data Structures
import org.apache.spark.ml.linalg.{Vector,Vectors}

//Model Building Pipeline
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}

//Binary Classification
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, BinaryLogisticRegressionSummary}
//Model Training
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}

//Model Evaluation
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator,MulticlassClassificationEvaluator}

//Optional: Use the following code below to set the Error reporting
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

val spark = SparkSession.builder().appName("Group 8 ML Phase 3").getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://8c79e9d02429:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1591610792470)
SparkSession available as 'spark'


import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, StandardScaler, VectorIndexer, OneHotEncoder, PCA, Normalizer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, BinaryLogisticRegressionSummary}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.ml.evaluation.{BinaryCl...

## Read in a parquet file of flight delay, fuel-price and meteorological data

In [4]:
val rawFlights = (spark
            .read.parquet("flightDelay.parquet") //Delete in production mode
            //.read.parquet("hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet")
            .withColumn("Month_Num1", $"Month_Num" cast "Int")
            //convert month and year to integer index starting Jan 2004
            .withColumn("Date_Num",  ($"Year"-2004)*12 + $"Month_Num1")
            .drop("Sectors_Flown", "Month_Num1", "Change")
            .withColumnRenamed("Departures_Delayed","label")
            .withColumnRenamed("Price","Fuel_Price")
            .sample(false, 0.2) //delete this in production mode
            //.na.drop()
            //.cache
              )

//rawFlights.printSchema()

rawFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


## index rows by volume of traffic, routeID = 1, has the least traffic

In [5]:
val routes = rawFlights
    .groupBy("Departing_Port","Arriving_Port")
    .agg(expr("count(*) as numFlights"))
    .orderBy(desc("numFlights"))
    .withColumn("routeID",row_number().over(Window.orderBy("numFlights")))

val flights = rawFlights.join(
    routes, 
    rawFlights(("Departing_Port")) <=> routes(("Departing_Port"))
        && rawFlights(("Arriving_Port")) <=> routes(("Arriving_Port")),
    "left"
).drop(rawFlights.col("Departing_Port")).drop(rawFlights.col("Arriving_Port"))


//flights.show(2,false)

flights.select("Date_Num",  "Departing_Port", "Arriving_Port", "Airline", "routeID", "numFlights", "Fuel_Price","Mean_daily_wind_run_km_Depart", "Mean_rainfall_mm_Depart",
    "Mean_number_of_days_of_rain_Depart","Mean_number_of_days_>_40_Degrees_C_Depart").show(5,false)

+--------+--------------+-------------+----------------+-------+----------+----------+-----------------------------+-----------------------+----------------------------------+-----------------------------------------+
|Date_Num|Departing_Port|Arriving_Port|Airline         |routeID|numFlights|Fuel_Price|Mean_daily_wind_run_km_Depart|Mean_rainfall_mm_Depart|Mean_number_of_days_of_rain_Depart|Mean_number_of_days_>_40_Degrees_C_Depart|
+--------+--------------+-------------+----------------+-------+----------+----------+-----------------------------+-----------------------+----------------------------------+-----------------------------------------+
|1       |Cairns        |Sydney       |Qantas          |114    |10109     |1.3       |323.0                        |399.6                  |18.5                              |0.0                                      |
|64      |Wagga Wagga   |Sydney       |Regional Express|103    |9073      |1.92      |241.0                        |40.4        

routes: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 2 more fields]
flights: org.apache.spark.sql.DataFrame = [Airline: string, label: int ... 74 more fields]


## Split the data into training and testing dataframes

In [6]:
//Filter out the most recent 12 months of flight data as the test dataset
//Take a sample of Dates after March 2019 have Date_Num > 183 for the testing data
val testing = flights.filter($"Date_Num"> 183).cache()
println(s"Test Set of the Most Recent 12 Months has ${testing.count()} records")

//Filter out rows prior to the most recent 12 months of flight data as the training dataset
val rawTraining = flights.filter($"Date_Num" < 184)

Test Set of the Most Recent 12 Months has 87268 records


testing: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Airline: string, label: int ... 74 more fields]
rawTraining: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Airline: string, label: int ... 74 more fields]


## Down sample the Ontime Departures To Balance The Training data

In [7]:
val ontimeTrainingFlights = rawTraining.filter($"label"===0)
println(s"On time Training Flights: ${ontimeTrainingFlights.count()}")

val delayedTrainingFlights = rawTraining.filter($"label"===1)
println(s"Delayed Training Flights: ${delayedTrainingFlights.count()}")

//ontime:delayed approx 5:1 so take a random sample of size fifth of the ontime departures
val downSampleFraction = 0.2
val sampledOntimeTrainingFlights = ontimeTrainingFlights.sample(false, downSampleFraction)  

println(s"Down Sampled ontime Training Flights: ${sampledOntimeTrainingFlights.count()}")

//down sample resulting training set for the purposes of local testing
val localTestingSampleFraction = 0.1
//Concatenate rows of ontimeTrainingFlights and delayedTrainingFlights
val training = (sampledOntimeTrainingFlights
                .union(delayedTrainingFlights)
                .sample(false, localTestingSampleFraction)
                .cache())
               
val resampledCounts = training.groupBy("label").count()
println("proportion of lates (label=1) in the sample")
resampledCounts.show()

On time Training Flights: 975678
Delayed Training Flights: 195349
Down Sampled ontime Training Flights: 195630
proportion of lates (label=1) in the sample
+-----+-----+
|label|count|
+-----+-----+
|    1|19464|
|    0|19381|
+-----+-----+



ontimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Airline: string, label: int ... 74 more fields]
delayedTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Airline: string, label: int ... 74 more fields]
downSampleFraction: Double = 0.2
sampledOntimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Airline: string, label: int ... 74 more fields]
localTestingSampleFraction: Double = 0.1
training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Airline: string, label: int ... 74 more fields]
resampledCounts: org.apache.spark.sql.DataFrame = [label: int, count: bigint]


## Contruct a Confusion Matrix for Model Assessment

In [8]:
def getConfusionMatrix(predictionDF: DataFrame): Unit = {
    
    println("========================Model Assessment Metrics==================================================\n")
    // Define Binary Classification Evaluator
    val binaryEval = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("rawPrediction")
    // Run Evaluation.  The area under the ROC curve ranges from 0.5 and 1.0 with larger values indicative of better fit
    println(s"Area under ROC: ${binaryEval.setMetricName("areaUnderROC").evaluate(predictionDF)}")
    // Define Multiclass Classification Evaluator
    val multiEval = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
    println(s"Accuracy: ${multiEval.setMetricName("accuracy").evaluate(predictionDF)}")
    println(s"Weighted Precision: ${multiEval.setMetricName("weightedPrecision").evaluate(predictionDF)}")
    println(s"Weighted Recall: ${multiEval.setMetricName("weightedRecall").evaluate(predictionDF)}")
    println(s"F1: ${multiEval.setMetricName("f1").evaluate(predictionDF)}")

    val TP = predictionDF.select("label", "prediction").filter("label = 1 and prediction = 1").count
    val TN = predictionDF.select("label", "prediction").filter("label = 0 and prediction = 0").count
    val FP = predictionDF.select("label", "prediction").filter("label = 0 and prediction = 1").count
    val FN = predictionDF.select("label", "prediction").filter("label = 1 and prediction = 0").count
    val total = predictionDF.select("label").count.toDouble
    // Unweighted Metrics
    val accuracy    = (TP + TN) / total
    val precision   = TP / (TP+FP).toDouble
    val recall      = TP / (TP+FN).toDouble
    val F1 = 2*precision*recall/(precision+recall)
    println(s"Accuracy: ${accuracy}")
    println(s"Precision: ${precision}")
    println(s"Recall: ${recall}")
    println(s"F1: ${F1}")

    //predictionDF.select( $"label",$"prediction" cast "Int").orderBy("label").groupBy("label").pivot("prediction",Seq("0","1")).count.show()

    // Confusion matrix
    printf(s"""|=================== Confusion Matrix ==========================
           |##########| %-15s                     %-15s
           |----------+----------------------------------------------------
           |Actual = 0| %-15d                     %-15d
           |Actual = 1| %-15d                     %-15d
           |===============================================================
         """.stripMargin, "Predicted = 0", "Predicted = 1", TN, FP, FN, TP)

    println("\n==================================================================================================")
}


getConfusionMatrix: (predictionDF: org.apache.spark.sql.DataFrame)Unit


## Conduct a PCA dimension reduction on weather columns

In [10]:
// listing non-weather related columns and extracting as requried
val nonWeather = Array("Departing_Port", "Arriving_Port", "Airline", "routeID","Date_Num", "label", "Year", "Month_Num", "Fuel_Price", 
                  "Departing_Port_station_ID", "Departing_Port_station_name", "Arriving_Port_station_ID", 
                  "Arriving_Port_station_name")
val columns = flights.columns

val weatherCols = columns.filter(!nonWeather.contains(_))


// Using VectorAssembler to create single vector feature out of feature_cols

val weather_assembler = new VectorAssembler()
                        .setInputCols(weatherCols)
                        .setOutputCol("weather_features")

val weathered_DF = weather_assembler.transform(flights)

//weathered_DF.printSchema
val weather_features_df = weathered_DF.select("weather_features")
weather_features_df.show(5)

// creating standardarized features
val std_scalar = new StandardScaler() 
                .setInputCol("weather_features")
                .setOutputCol("standard_features")
                .setWithStd(true)
                .setWithMean(false)

// implementing to PCA
val pca = new PCA()
          .setInputCol("standard_features")
          .setOutputCol("continuosWeatherFeatures")
          .setK(1) 

+--------------------+
|    weather_features|
+--------------------+
|[5.3,22.6,66.0,29...|
|[3.9,7.4,43.0,21....|
|[5.3,22.6,66.0,29...|
|[5.8,23.1,69.0,29...|
|[3.9,7.4,43.0,21....|
+--------------------+
only showing top 5 rows



nonWeather: Array[String] = Array(Departing_Port, Arriving_Port, Airline, routeID, Date_Num, label, Year, Month_Num, Fuel_Price, Departing_Port_station_ID, Departing_Port_station_name, Arriving_Port_station_ID, Arriving_Port_station_name)
columns: Array[String] = Array(Airline, label, Year, Month_Num, Fuel_Price, Departing_Port_station_ID, Departing_Port_station_name, Arriving_Port_station_ID, Arriving_Port_station_name, Mean_3pm_cloud_cover_oktas_Depart, Mean_3pm_dew_point_temperature_Degrees_C_Depart, Mean_3pm_relative_humidity_%_Depart, Mean_3pm_temperature_Degrees_C_Depart, Mean_3pm_wet_bulb_temperature_Degrees_C_Depart, Mean_3pm_wind_speed_km/h_Depart, Mean_9am_cloud_cover_okas_Depart, Mean_9am_dew_point_temperature_Degrees_C_Depart, Mean_9am_relative_humidity_%_Depart, Mean_9am_te...

## Set up Flight Data Feature Processing Pipleline Stages

In [11]:
// Deal with Categorical Columns
val categoricalVariables = Array("Airline")
val categoricalIndexers = categoricalVariables
  .map(i => new StringIndexer().setInputCol(i).setOutputCol(i+"_Index"))
val categoricalEncoders = categoricalVariables
  .map(e => new OneHotEncoder().setInputCol(e + "_Index").setOutputCol(e + "_Vec"))


// selecting non weather related features
val nonWeatherExplanatoryFields = Array("Airline_Vec", "Fuel_Price", "routeID", "Date_Num")

// Assemble everything together to be ("label","features") format
val categorical_assembler = new VectorAssembler()
                 .setInputCols(nonWeatherExplanatoryFields)
                 //.setOutputCol("indexedFeatures")
                 .setOutputCol("nonWeatherFeatureVectors")


// creating final features

val assembler = new VectorAssembler()
                .setInputCols(Array("continuosWeatherFeatures", "nonWeatherFeatureVectors"))
                .setOutputCol("features")

///////////////////////////////////////////////////////////////////////////
//   Define Feature Preprocessing Stages suitable for all candidate models  ///
///////////////////////////////////////////////////////////////////////////
val featureProcessingStages: Array[PipelineStage] = Array(weather_assembler, std_scalar, pca) ++ categoricalIndexers++categoricalEncoders++Array(categorical_assembler,assembler) 

categoricalVariables: Array[String] = Array(Airline)
categoricalIndexers: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_f5c387684d1a)
categoricalEncoders: Array[org.apache.spark.ml.feature.OneHotEncoder] = Array(oneHot_51df60eaa705)
nonWeatherExplanatoryFields: Array[String] = Array(Airline_Vec, Fuel_Price, routeID, Date_Num)
categorical_assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_ed97c79dffa1
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_b42c6bb8597f
featureProcessingStages: Array[org.apache.spark.ml.PipelineStage] = Array(vecAssembler_831770cfd00d, stdScal_0f2b457f131a, pca_fc5c281591b0, strIdx_f5c387684d1a, oneHot_51df60eaa705, vecAssembler_ed97c79dffa1, vecAssembler_b42c6bb8597f)


## Setup a Cross Validated Logistic Regression Pipeline

In [12]:
// Define the Logistic Regression Estimator.
val lr = new LogisticRegression()
        .setFeaturesCol(assembler.getOutputCol)
        //.setFeaturesCol(pca.getOutputCol)
        .setLabelCol("label")

// Print out the parameters, documentation, and any default values.
//println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n")

// We use a ParamGridBuilder to construct a grid of parameters to search over.
val lrParamGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01))
  .addGrid(lr.threshold, (for (i <- 53 to 53) yield i.toDouble / 100).toArray)
  .addGrid(lr.tol, Array(0.000001))
  .addGrid(lr.elasticNetParam, Array(0.0))
  .build()

// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric is areaUnderROC.
val cv = new CrossValidator()
  .setEstimator(new Pipeline().setStages(featureProcessingStages ++ Array(lr)))
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(lrParamGrid)
  .setNumFolds(10)  // Use 3+ in practice
  //.setParallelism(2)  // Evaluate up to 2 parameter settings in parallel

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_382506d1ca79
lrParamGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_382506d1ca79-elasticNetParam: 0.0,
	logreg_382506d1ca79-regParam: 0.01,
	logreg_382506d1ca79-threshold: 0.53,
	logreg_382506d1ca79-tol: 1.0E-6
})
cv: org.apache.spark.ml.tuning.CrossValidator = cv_4c614d61b6da


## Train the Logistic Regression Model using Cross Validation Tuning for Hyperparameters

In [13]:
// Run cross-validation, and choose the best set of parameters.
val pipelineModel = cv.fit(training)


val bestModel = pipelineModel.bestModel match {
  case pm: PipelineModel => Some(pm)
  case _ => None
}

val ml = bestModel
    .map(_.stages.collect { case ml: LogisticRegressionModel => ml })
    .flatMap(_.headOption)

// Get fitted logistic regression model
val lrModel = ml.get.asInstanceOf[LogisticRegressionModel]

//Get Coeffs of the Best Logistic Regression Model
//println(s"Intercept: ${lrModel.intercept}")
//println(s"Coefficients: ${lrModel.coefficients}")
println(s"ElasticNetParam: ${lrModel.getElasticNetParam}")
println(s"Threshold: ${lrModel.getThreshold}")


val lrModelSummary = ml.get.summary.asInstanceOf[BinaryLogisticRegressionSummary]
println(s"areaUnderCurve: ${lrModelSummary.areaUnderROC}")
val fMeasure = lrModelSummary.fMeasureByThreshold
val maxFMeasure = fMeasure.agg("F-Measure" -> "Max").head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0)
println(s"MaxFMeasure: $maxFMeasure & bestThreshold: $bestThreshold")


2020-06-08 10:08:46,268 WARN  [Executor task launch worker for task 4676] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-06-08 10:08:46,269 WARN  [Executor task launch worker for task 4676] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2020-06-08 10:08:46,858 WARN  [Thread-4] netlib.LAPACK (LAPACK.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2020-06-08 10:08:46,858 WARN  [Thread-4] netlib.LAPACK (LAPACK.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
ElasticNetParam: 0.0
Threshold: 0.53
areaUnderCurve: 0.5747764894063114
MaxFMeasure: 0.6676809765037431 & bestThreshold: 0.38238365779767464


pipelineModel: org.apache.spark.ml.tuning.CrossValidatorModel = cv_4c614d61b6da
bestModel: Option[org.apache.spark.ml.PipelineModel] = Some(pipeline_d8f7fd8f2a6a)
ml: Option[org.apache.spark.ml.classification.LogisticRegressionModel] = Some(LogisticRegressionModel: uid = logreg_382506d1ca79, numClasses = 2, numFeatures = 13)
lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_382506d1ca79, numClasses = 2, numFeatures = 13
lrModelSummary: org.apache.spark.ml.classification.BinaryLogisticRegressionSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@4f297075
fMeasure: org.apache.spark.sql.DataFrame = [threshold: double, F-Measure: double]
maxFMeasure: Double = 0.6676809765037431
bestThreshold: Doub...

## Test the Logistic Regression Pipeline and Report on the performance

In [14]:
//Test the model
val lrPredictions = pipelineModel.transform(testing)

// Get output schema of our fitted pipeline
val schema = lrPredictions.schema
// Extract the attributes of the input (features) column to our logistic regression model

val featureAttrs = AttributeGroup.fromStructField(schema(lrModel.getFeaturesCol)).attributes.get

val features = featureAttrs.map(_.name.get)

// Add "(Intercept)" to list of feature names if the model was fit with an intercept
val featureNames: Array[String] = if (lrModel.getFitIntercept) {
  Array("(Intercept)") ++ features
} else {
  features
}

// Get array of coefficients
val lrModelCoeffs = lrModel.coefficients.toArray
val coeffs = if (lrModel.getFitIntercept) 
        {lrModelCoeffs ++ Array(lrModel.intercept)} 
    else 
        {lrModelCoeffs}

// Print feature names & coefficients together
//println("Coefficient   Feature")
println("==============================================================================")

val lrFeatureContributions = sc.parallelize(featureNames.zip(coeffs))
    .toDF("feature", "logistic regression coeff")
    .sort(desc("logistic regression coeff"))
val numRows = lrFeatureContributions.count().toInt
lrFeatureContributions.show(numRows,truncate=false)
//Save Coefficients to File for the Group Report
lrFeatureContributions.coalesce(1) //Join all partitions into one file
      .write
      .option("header","true")
      .option("sep",",")
      .mode("overwrite")
      .csv("lrCoeffs.csv")
//lrPredictions.select("prediction", "label", "features").show(20)
getConfusionMatrix(lrPredictions)



+---------------------------------------------------------------------------+-------------------------+
|feature                                                                    |logistic regression coeff|
+---------------------------------------------------------------------------+-------------------------+
|nonWeatherFeatureVectors_Airline_Vec_QantasLink                            |0.3791988268950665       |
|nonWeatherFeatureVectors_Airline_Vec_Qantas                                |0.3581107404817141       |
|nonWeatherFeatureVectors_Airline_Vec_Virgin Australia - ATR/F100 Operations|0.31936727784922636      |
|nonWeatherFeatureVectors_Airline_Vec_Skywest                               |0.256363087172204        |
|nonWeatherFeatureVectors_Airline_Vec_Jetstar                               |0.031460741155436935     |
|nonWeatherFeatureVectors_Fuel_Price                                        |9.613348562595535E-4     |
|nonWeatherFeatureVectors_routeID                               

lrPredictions: org.apache.spark.sql.DataFrame = [Airline: string, label: int ... 84 more fields]
schema: org.apache.spark.sql.types.StructType = StructType(StructField(Airline,StringType,true), StructField(label,IntegerType,true), StructField(Year,IntegerType,true), StructField(Month_Num,StringType,true), StructField(Fuel_Price,DoubleType,true), StructField(Departing_Port_station_ID,StringType,true), StructField(Departing_Port_station_name,StringType,true), StructField(Arriving_Port_station_ID,StringType,true), StructField(Arriving_Port_station_name,StringType,true), StructField(Mean_3pm_cloud_cover_oktas_Depart,DoubleType,true), StructField(Mean_3pm_dew_point_temperature_Degrees_C_Depart,DoubleType,true), StructField(Mean_3pm_relative_humidity_%_Depart,DoubleType,true), StructField(Mea...

## Store The Best CV Trained Logistic Model to the hdfs

In [15]:
! hadoop fs -rm    -r  hdfs://localhost:9000/tmp/flightDelayModel_*
! hadoop fs -mkdir -p  hdfs://localhost:9000/tmp/flightDelayModel__out

rm: `hdfs://localhost:9000/tmp/flightDelayModel_*': No such file or directory




In [16]:
//Persist the Model to the hdfs
pipelineModel
    .write
    .overwrite()
    .save("hdfs://localhost:9000/tmp/flightDelayModel__out")


In [17]:
//Check the stored model, by reading it back in, and running a prediciton
val results: DataFrame = CrossValidatorModel
.load("hdfs://localhost:9000/tmp/flightDelayModel__out")
.transform(testing)
.select(
    col("features"),
    col("label"),
    col("prediction")
)

results.show(2,truncate = false)

+------------------------------------------------------------+-----+----------+
|features                                                    |label|prediction|
+------------------------------------------------------------+-----+----------+
|(13,[0,4,10,11,12],[11.78978030458197,1.0,2.78,103.0,184.0])|1    |1.0       |
|(13,[0,4,10,11,12],[11.78978030458197,1.0,2.78,103.0,184.0])|1    |1.0       |
+------------------------------------------------------------+-----+----------+
only showing top 2 rows



results: org.apache.spark.sql.DataFrame = [features: vector, label: int ... 1 more field]
