In [100]:
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import expr
from pyspark.sql import functions as F
from pyspark.ml.stat import Summarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from helpers.helper_functions import translate_to_file_string

In [101]:
inputFile = translate_to_file_string("../data/Flight_Delay_Jan_2020_ontime.csv")

In [102]:
spark = (SparkSession
       .builder
       .appName("FlightDelay")
       .getOrCreate())

In [103]:
pysparkDF = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .option("delimiter", ",") \
        .csv(inputFile)

pysparkDF.printSchema()

root
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEP_DEL15: double (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- ARR_DEL15: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- _c21: string (nullable = true)



### Remove faulty features

In [104]:
pysparkDF = pysparkDF.drop('_c21')
pysparkDF.printSchema()

root
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEP_DEL15: double (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- ARR_DEL15: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- DISTANCE: double (nullable = true)



### Remove records containing NULL values

In [105]:
pysparkDF_nonull = pysparkDF.dropna()
f"Removed {pysparkDF.count()-pysparkDF_nonull.count()} records containing NULL values"

'Removed 8078 records containing NULL values'

### Build String indexer for TAIL_NUM

In [106]:
tailNum_Indexer = StringIndexer().setInputCol("TAIL_NUM").setOutputCol("TAIL_NUM_ID").fit(pysparkDF_nonull)
pysparkDF_indexed = tailNum_Indexer.transform(pysparkDF_nonull)

### Define label columns

In [107]:
labelCols = ["DEP_DEL15","ARR_DEL15"]
labelCols

['DEP_DEL15', 'ARR_DEL15']

### Remove redundant features and labels for unconditional prediction
-> Unconditional is referring to predicting each of the labels without having information on the current status of the flight (Use-Case: Checking the day before)

In [108]:
# Remove strings from id/string pairs (redundant)
# Remark: since in this dataset both string and id exist already, no further preprocessing via string_indexer is necessary. Otherwise, strings would have first been converted to ids via string_indexer.
featureCols_unconditional = pysparkDF_indexed.columns.copy()
featureCols_unconditional.remove("TAIL_NUM") # -> TAIL_NUM_ID
featureCols_unconditional.remove("OP_UNIQUE_CARRIER") # -> OP_CARRIER_AIRLINE_ID
featureCols_unconditional.remove("OP_CARRIER") # -> OP_CARRIER_AIRLINE_ID
featureCols_unconditional.remove("ORIGIN") # -> ORIGIN_AIRPORT_ID
featureCols_unconditional.remove("ORIGIN_AIRPORT_SEQ_ID") # -> ORIGIN_AIRPORT_ID
featureCols_unconditional.remove("DEST") # -> DEST_AIRPORT_SEQ_ID
featureCols_unconditional.remove("DEST_AIRPORT_SEQ_ID") # -> DEST_AIRPORT_SEQ_ID
featureCols_unconditional.remove("DEP_TIME_BLK") # -> preliminary elimination, check if model works better with binned values or not

for label in labelCols:
    featureCols_unconditional.remove(label)
                                     
featureCols_unconditional                         

['DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'OP_CARRIER_AIRLINE_ID',
 'OP_CARRIER_FL_NUM',
 'ORIGIN_AIRPORT_ID',
 'DEST_AIRPORT_ID',
 'DEP_TIME',
 'ARR_TIME',
 'CANCELLED',
 'DIVERTED',
 'DISTANCE',
 'TAIL_NUM_ID']

In [109]:
featureCols_unconditional.remove("CANCELLED")
featureCols_unconditional.remove("DIVERTED")

### Remove redundant features and labels for conditional prediction
-> Conditional is referring to predicting each of the labels considering available real-time information on the current status of the flight (Use-Case: Checking while at the airport, pre-flight)

One would expect that prediction performance is increased when the model is aware of the current flight status (=DEP_DEL15)

Example: If the model is aware that the flight has departure delay, it might be able to better predict whether it will also be delayed at arrival

In [110]:
# Remove strings from id/string pairs (redundant)
# Remark: since in this dataset both string and id exist already, no further preprocessing via string_indexer is necessary. Otherwise, strings would have first been converted to ids via string_indexer.
featureCols_conditional = pysparkDF_indexed.columns.copy()
featureCols_conditional.remove("TAIL_NUM") # -> TAIL_NUM_ID
featureCols_conditional.remove("OP_UNIQUE_CARRIER") # -> OP_CARRIER_AIRLINE_ID
featureCols_conditional.remove("OP_CARRIER") # -> OP_CARRIER_AIRLINE_ID
featureCols_conditional.remove("ORIGIN") # -> ORIGIN_AIRPORT_ID
featureCols_conditional.remove("ORIGIN_AIRPORT_SEQ_ID") # -> ORIGIN_AIRPORT_ID
featureCols_conditional.remove("DEST") # -> DEST_AIRPORT_SEQ_ID
featureCols_conditional.remove("DEST_AIRPORT_SEQ_ID") # -> DEST_AIRPORT_SEQ_ID
featureCols_conditional.remove("DEP_TIME_BLK") # -> preliminary elimination, check if model works better with binned values or not

for label in [label for label in labelCols if label!="DEP_DEL15"]:
    featureCols_conditional.remove(label)
    
featureCols_conditional

['DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'OP_CARRIER_AIRLINE_ID',
 'OP_CARRIER_FL_NUM',
 'ORIGIN_AIRPORT_ID',
 'DEST_AIRPORT_ID',
 'DEP_TIME',
 'DEP_DEL15',
 'ARR_TIME',
 'CANCELLED',
 'DIVERTED',
 'DISTANCE',
 'TAIL_NUM_ID']

### Build and apply feature column assembler for both featureCols

In [111]:
assembler_unconditional =  VectorAssembler(outputCol="features", inputCols=list(featureCols_unconditional))
assembler_conditional =  VectorAssembler(outputCol="features", inputCols=list(featureCols_conditional))

featureSet_unconditional = assembler_unconditional.transform(pysparkDF_indexed)
featureSet_conditional = assembler_conditional.transform(pysparkDF_indexed)

# Define same base-scaler for both feature cols
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True, 
                        withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel_unconditional = scaler.fit(featureSet_unconditional)
scalerModel_conditional = scaler.fit(featureSet_conditional)

scaledFeatureSet_unconditional = scalerModel_unconditional.transform(featureSet_unconditional)
scaledFeatureSet_conditional = scalerModel_conditional.transform(featureSet_conditional)

In [112]:
def getRatio(df):
    delayedDf = df.filter("DEP_DEL15=1.0")
    sampleRatio = delayedDf.count() / df.count()
    return sampleRatio

In [113]:
ratioOfDelayed = getRatio(training_unconditional)
delayedWeight  = 1 - ratioOfDelayed
nonDelayedWeight = ratioOfDelayed
print(delayedWeight)
print(nonDelayedWeight)

0.8643811376033455
0.13561886239665455


In [114]:
featureSet_unconditional

DataFrame[DAY_OF_MONTH: int, DAY_OF_WEEK: int, OP_UNIQUE_CARRIER: string, OP_CARRIER_AIRLINE_ID: int, OP_CARRIER: string, TAIL_NUM: string, OP_CARRIER_FL_NUM: int, ORIGIN_AIRPORT_ID: int, ORIGIN_AIRPORT_SEQ_ID: int, ORIGIN: string, DEST_AIRPORT_ID: int, DEST_AIRPORT_SEQ_ID: int, DEST: string, DEP_TIME: int, DEP_DEL15: double, DEP_TIME_BLK: string, ARR_TIME: int, ARR_DEL15: double, CANCELLED: double, DIVERTED: double, DISTANCE: double, TAIL_NUM_ID: double, features: vector]

In [115]:
weightedDF = scaledFeatureSet_unconditional.withColumn("DEP_DEL15_weighted", F.when(scaledFeatureSet_unconditional["DEP_DEL15"]==("1.0"),delayedWeight).otherwise(nonDelayedWeight))
weightedDF

DataFrame[DAY_OF_MONTH: int, DAY_OF_WEEK: int, OP_UNIQUE_CARRIER: string, OP_CARRIER_AIRLINE_ID: int, OP_CARRIER: string, TAIL_NUM: string, OP_CARRIER_FL_NUM: int, ORIGIN_AIRPORT_ID: int, ORIGIN_AIRPORT_SEQ_ID: int, ORIGIN: string, DEST_AIRPORT_ID: int, DEST_AIRPORT_SEQ_ID: int, DEST: string, DEP_TIME: int, DEP_DEL15: double, DEP_TIME_BLK: string, ARR_TIME: int, ARR_DEL15: double, CANCELLED: double, DIVERTED: double, DISTANCE: double, TAIL_NUM_ID: double, features: vector, scaledFeatures: vector, DEP_DEL15_weighted: double]

## Modelling
### Split data into training and test set
Die Aufteilung der Daten erfolgt in 80% Trainingsdaten und 20% Testdaten.

In [116]:
splits_unconditional = weightedDF.randomSplit([0.8, 0.2], 12345)
training_unconditional = splits_unconditional[0]
test_unconditional = splits_unconditional[1]

splits_conditional= scaledFeatureSet_conditional.randomSplit([0.8, 0.2], 12345)
training_conditional = splits_conditional[0]
test_conditional = splits_conditional[1]

In [117]:
training_unconditional.printSchema()

root
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEP_DEL15: double (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- ARR_DEL15: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- TAIL_NUM_ID: double (nullable = false)
 |-- features: vecto

### Build and train the Logistic Regression Model

Um eine Logistische Regression durchführen zu können, muss zunächst das zugehörige Element `LogisticRegression` aus der Bibliothek `pyspark.ml.classification` importiert werden.

In [118]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

In [119]:
model_instance = LogisticRegression(
               featuresCol="scaledFeatures",
               labelCol="DEP_DEL15",
               standardization=False)
paramGrid = ParamGridBuilder().addGrid(model_instance.maxIter, [20])\
                 .addGrid(model_instance.regParam, [0.7]) \
                 .addGrid(model_instance.elasticNetParam, [1.0]) \
                 .build()
params = ["maxIter","regParam","elasticNetParam"]

evaluator = BinaryClassificationEvaluator(labelCol="DEP_DEL15")
cv = CrossValidator(estimator=model_instance, evaluator=evaluator, \
              estimatorParamMaps=paramGrid, numFolds=5, parallelism=2)
cvModel = cv.fit(training_unconditional)

# Predict and evaluate
predictions = cvModel.transform(test_unconditional)
predictionAndLabels = predictions.select(predictions.prediction, "DEP_DEL15")

countcorrect = predictionAndLabels.filter(f"DEP_DEL15 == prediction").count()
countincorrect = predictionAndLabels.filter(f"DEP_DEL15 != prediction").count()
countall = predictionAndLabels.count()
accuracy = countcorrect/countall
print(f"count correct: {countcorrect}")
print(f"count incorrect: {countincorrect}")
print(f"count all: {countall}")
print(f"accuracy: {accuracy}")
print(f"Test Error {1-accuracy}")

predictionAndLabels = predictions.select("prediction", label).rdd.map(lambda p: [p[0], float(p[1])]) # Map to RDD prediction|label
metrics =  MulticlassMetrics(predictionAndLabels)
confusion = metrics.confusionMatrix()            
print("Confusion matrix: \n" , confusion)

count correct: 103594
count incorrect: 16456
count all: 120050
accuracy: 0.862923781757601
Test Error 0.13707621824239902
Confusion matrix: 
 DenseMatrix([[103423.,      0.],
             [ 16627.,      0.]])


In [120]:
training_unconditional.head(10)

[Row(DAY_OF_MONTH=1, DAY_OF_WEEK=3, OP_UNIQUE_CARRIER='9E', OP_CARRIER_AIRLINE_ID=20363, OP_CARRIER='9E', TAIL_NUM='N131EV', OP_CARRIER_FL_NUM=4867, ORIGIN_AIRPORT_ID=11423, ORIGIN_AIRPORT_SEQ_ID=1142307, ORIGIN='DSM', DEST_AIRPORT_ID=13487, DEST_AIRPORT_SEQ_ID=1348702, DEST='MSP', DEP_TIME=1347, DEP_DEL15=1.0, DEP_TIME_BLK='1300-1359', ARR_TIME=1445, ARR_DEL15=0.0, CANCELLED=0.0, DIVERTED=0.0, DISTANCE=232.0, TAIL_NUM_ID=1192.0, features=DenseVector([1.0, 3.0, 20363.0, 4867.0, 11423.0, 13487.0, 1347.0, 1445.0, 232.0, 1192.0]), scaledFeatures=DenseVector([0.1109, 1.569, 54.271, 2.672, 7.4914, 8.8442, 2.71, 2.7502, 0.3943, 0.8524]), DEP_DEL15_weighted=0.8643811376033455),
 Row(DAY_OF_MONTH=1, DAY_OF_WEEK=3, OP_UNIQUE_CARRIER='9E', OP_CARRIER_AIRLINE_ID=20363, OP_CARRIER='9E', TAIL_NUM='N131EV', OP_CARRIER_FL_NUM=5049, ORIGIN_AIRPORT_ID=13487, ORIGIN_AIRPORT_SEQ_ID=1348702, ORIGIN='MSP', DEST_AIRPORT_ID=11423, DEST_AIRPORT_SEQ_ID=1142307, DEST='DSM', DEP_TIME=1215, DEP_DEL15=1.0, DEP_TIM

In [121]:
model_instance = LogisticRegression(
               featuresCol="scaledFeatures",
               labelCol="DEP_DEL15",
               weightCol="DEP_DEL15_weighted")
paramGrid = ParamGridBuilder().addGrid(model_instance.maxIter, [50])\
                 .addGrid(model_instance.regParam, [0.1]) \
                 .addGrid(model_instance.elasticNetParam, [0.0]) \
                 .build()
params = ["maxIter","regParam","elasticNetParam"]

evaluator = BinaryClassificationEvaluator(labelCol="DEP_DEL15")
cv = CrossValidator(estimator=model_instance, evaluator=evaluator, \
              estimatorParamMaps=paramGrid, numFolds=5, parallelism=2)
cvModel = cv.fit(training_unconditional)

# Predict and evaluate
predictions = cvModel.transform(test_unconditional)
predictionAndLabels = predictions.select(predictions.prediction, "DEP_DEL15")

countcorrect = predictionAndLabels.filter(f"DEP_DEL15 == prediction").count()
countincorrect = predictionAndLabels.filter(f"DEP_DEL15 != prediction").count()
countall = predictionAndLabels.count()
accuracy = countcorrect/countall
print(f"count correct: {countcorrect}")
print(f"count incorrect: {countincorrect}")
print(f"count all: {countall}")
print(f"accuracy: {accuracy}")
print(f"Test Error {1-accuracy}")

predictionAndLabels = predictions.select("prediction", label).rdd.map(lambda p: [p[0], float(p[1])]) # Map to RDD prediction|label
metrics =  MulticlassMetrics(predictionAndLabels)
confusion = metrics.confusionMatrix()            
print("Confusion matrix: \n" , confusion)

count correct: 71268
count incorrect: 48782
count all: 120050
accuracy: 0.5936526447313619
Test Error 0.4063473552686381
Confusion matrix: 
 DenseMatrix([[60583., 42840.],
             [ 6883.,  9744.]])


In [122]:
from pyspark.sql.types import *
schema = StructType([
      StructField('model', StringType(), True),
      StructField('target label', StringType(), True),
      StructField('mode', StringType(), True),
      StructField('param_config', StringType(), True),
      StructField('accuracy', FloatType(), True),
      StructField('test error', FloatType(), True)
  ])

evalDF = spark.createDataFrame([], schema)
evalDF.show()

+-----+------------+----+------------+--------+----------+
|model|target label|mode|param_config|accuracy|test error|
+-----+------------+----+------------+--------+----------+
+-----+------------+----+------------+--------+----------+



In [123]:



models = ["LogisticRegression"]

for model in models:
    for label in labelCols:
        for test, train, mode, features in zip([test_unconditional, test_conditional],[training_unconditional,training_conditional],["Unconditional","Conditional"],[featureCols_unconditional,featureCols_conditional]):
            # Skip invalid combinations
            if label=="DEP_DEL15" and mode=="Conditional":
                continue

            # Print Model Spec
            print("\n\n----------MODEL SPEC----------")
            print(f"Model Type: {model}")
            print(f"Target Label: {label}")
            print(f"Prediction Mode: {mode}")

            # Define Model
            if model=="LogisticRegression":
                # Define LogisticRegression Classifier acc. to current param
                model_instance = LogisticRegression(
                               featuresCol="scaledFeatures",
                               labelCol=label,
                               standardization=False)
                paramGrid = ParamGridBuilder().addGrid(model_instance.maxIter, [20])\
                                 .addGrid(model_instance.regParam, [0.7]) \
                                 .addGrid(model_instance.elasticNetParam, [1.0]) \
                                 .build()
                params = ["maxIter","regParam","elasticNetParam"]
            if model=="SVM":
                # Define SVM Classifier acc. to current param
                pass

            evaluator = BinaryClassificationEvaluator(labelCol=label)
            cv = CrossValidator(estimator=model_instance, evaluator=evaluator, \
                          estimatorParamMaps=paramGrid, numFolds=5, parallelism=2)
            cvModel = cv.fit(train)
            model_best = cvModel.bestModel
            param_print = '\n'.join([line for line in model_best.explainParams().split('\n') if line.split(":")[0] in params])
            print("Chosen parameters: \n" + param_print)

            print(str(model) + " Coefficients: " + str(dict(zip(features,["{:.4f}".format(a) for a in model_best.coefficients]))))
            print(str(model) + " Intercept: " + "{:.4f}".format(model_best.intercept))

            # Predict and evaluate
            predictions = cvModel.transform(test)
            predictionAndLabels = predictions.select(predictions.prediction, label)

            countcorrect = predictionAndLabels.filter(f"{label} == prediction").count()
            countincorrect = predictionAndLabels.filter(f"{label} != prediction").count()
            countall = predictionAndLabels.count()
            accuracy = countcorrect/countall
            print(f"count correct: {countcorrect}")
            print(f"count incorrect: {countincorrect}")
            print(f"count all: {countall}")
            print(f"accuracy: {accuracy}")
            print(f"Test Error {1-accuracy}")
            
            predictionAndLabels = predictions.select("prediction", label).rdd.map(lambda p: [p[0], float(p[1])]) # Map to RDD prediction|label
            metrics =  MulticlassMetrics(predictionAndLabels)
            confusion = metrics.confusionMatrix()            
            print("Confusion matrix: \n" , confusion)
            
            print("--------------------")

            newRow = spark.createDataFrame([(model,label,mode,param_print,accuracy,1-accuracy)], schema)
            evalDF = evalDF.union(newRow)



----------MODEL SPEC----------
Model Type: LogisticRegression
Target Label: DEP_DEL15
Prediction Mode: Unconditional
Chosen parameters: 
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 1.0)
maxIter: max number of iterations (>= 0). (default: 100, current: 20)
regParam: regularization parameter (>= 0). (default: 0.0, current: 0.7)
LogisticRegression Coefficients: {'DAY_OF_MONTH': '0.0000', 'DAY_OF_WEEK': '0.0000', 'OP_CARRIER_AIRLINE_ID': '0.0000', 'OP_CARRIER_FL_NUM': '0.0000', 'ORIGIN_AIRPORT_ID': '0.0000', 'DEST_AIRPORT_ID': '0.0000', 'DEP_TIME': '0.0000', 'ARR_TIME': '0.0000', 'DISTANCE': '0.0000', 'TAIL_NUM_ID': '0.0000'}
LogisticRegression Intercept: -1.8522
count correct: 103594
count incorrect: 16456
count all: 120050
accuracy: 0.862923781757601
Test Error 0.13707621824239902
Confusion matrix: 
 DenseMatrix([[103594.,      0.],
             [ 16456.,     

In [124]:
evalDF.show(20,False)

+------------------+------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----------+
|model             |target label|mode         |param_config                                                                                                                                                                                                                                                                                                                  |accuracy  |test error|
+------------------+------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [125]:
#spark.stop()