In [27]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Spark configuration for this notebook: a single local master and the local
# file system as default FS (so relative paths resolve on local disk).
conf = SparkConf()
conf.setAppName("ML Data Preparation")
conf.setMaster("local")
conf.set("spark.hadoop.fs.defaultFS", "file:///")

# Build the session from the one configuration object instead of creating the
# context and the session separately with two different application names.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep the `sc` handle available for any cell that uses the raw SparkContext.
sc = spark.sparkContext

# Set the log level once. Previously it was set to "ERROR" and then immediately
# overridden with "WARN" on the very same context, so "WARN" was the effective level.
sc.setLogLevel("WARN")

In [4]:
# Location of the Parquet data set used throughout this notebook
PROCESSED_DATA_PATH = "data_processed_job/2007"

flights = spark.read.parquet(PROCESSED_DATA_PATH)

# Spark ML estimators read the target from a column named "label" by default,
# so copy "ArrDelay" into "label" and drop the original column.
dataframe = flights.withColumn("label", col("ArrDelay")).drop("ArrDelay")

In [5]:
# Print column names, types and nullability to confirm that the target column is now called "label"
dataframe.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- DepTimeT: integer (nullable = true)
 |-- CRSDepTimeT: integer (nullable = true)
 |-- CRSArrTimeT: integer (nullable = true)
 |-- PunctualCarrier: integer (nullable = true)
 |-- AverageCarrier: integer (nullable = true)
 |-- label: integer (nullable = true)



First, we split our data into a *train set* (80%) and *test set* (20%).

In [6]:
# Relative weights of the train and test partitions (80% / 20%)
SPLIT_WEIGHTS = [0.8, 0.2]
# Fixed seed so the random split is repeatable
SPLIT_SEED = 3

train_data, test_data = dataframe.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

# Linear Regression

## On Raw Features

In [7]:
# Input columns, in the order in which they end up in the feature vector
FEATURE_COLS = [
    "Month",
    "DayofMonth",
    "DepTimeT",
    "CRSDepTimeT",
    "CRSArrTimeT",
    "DepDelay",
    "CRSElapsedTime",
    "PunctualCarrier",
    "AverageCarrier",
]

# Stage 1: pack the unscaled columns into a single "features" vector column
vector_assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")

# Stage 2: linear regression with its default parameters
lr = LinearRegression()

pipeline_lr = Pipeline(stages=[vector_assembler, lr])

# Train only on the columns the pipeline needs (features + target)
train_data_lr = train_data.select(*FEATURE_COLS, "label")
model_lr = pipeline_lr.fit(train_data_lr)

24/12/17 21:43:29 WARN Instrumentation: [02995743] regParam is zero, which might cause numerical instability and overfitting.
24/12/17 21:43:40 WARN Instrumentation: [02995743] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

## On Normalized Features

In [8]:
# ---- Column groups, one per preprocessing strategy ----
# Min-max scaling: features with an approximately uniform distribution
MINMAX_COLS = ["Month", "DayofMonth", "DepTimeT", "CRSDepTimeT", "CRSArrTimeT"]
# Standard scaling: features which deviate from a mean
STANDARD_COLS = ["DepDelay", "CRSElapsedTime"]
# No scaling: one-hot encoding was already performed in the preparation phase
ONE_HOT_COLS = ["PunctualCarrier", "AverageCarrier"]
# Concatenation order matches the order of the entries in the final feature vector
ALL_COLS = [*MINMAX_COLS, *STANDARD_COLS, *ONE_HOT_COLS]

# ---- Pipeline stages ----
# Min-max branch: assemble the columns into a vector, then rescale it
minmax_assembler = VectorAssembler(
    inputCols=MINMAX_COLS,
    outputCol="minmax_features",
)
minmax_scaler = MinMaxScaler(
    inputCol="minmax_features",
    outputCol="scaled_minmax_features",
)

# Standardization branch: assemble, then center and scale to unit standard deviation
standard_assembler = VectorAssembler(
    inputCols=STANDARD_COLS,
    outputCol="standard_features",
)
standard_scaler = StandardScaler(
    inputCol="standard_features",
    outputCol="scaled_standard_features",
    withMean=True,
    withStd=True,
)

# Merge both scaled vectors and the untouched one-hot columns into "features"
final_inputs = ["scaled_minmax_features", "scaled_standard_features"] + ONE_HOT_COLS
final_assembler = VectorAssembler(inputCols=final_inputs, outputCol="features")

lr_normalized = LinearRegression()

# ---- Pipeline ----
stages_lr_normalized = [
    minmax_assembler,
    minmax_scaler,
    standard_assembler,
    standard_scaler,
    final_assembler,
    lr_normalized,
]
pipeline_lr_normalized = Pipeline(stages=stages_lr_normalized)

# ---- Training data: only the feature columns and the target ----
train_data_lr_normalized = train_data.select(*ALL_COLS, "label")

# ---- Fit ----
model_lr_normalized = pipeline_lr_normalized.fit(train_data_lr_normalized)

24/12/17 21:44:17 WARN Instrumentation: [155a8349] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

## Interpretation & Evaluation

Display the coefficients learned by the models (raw + normalized features).

In [14]:
# The fitted LinearRegressionModel is the last stage of each fitted pipeline.
model_lr_model = model_lr.stages[-1]
model_lr_normalized_model = model_lr_normalized.stages[-1]

# Coefficients are positional. The raw model was trained on FEATURE_COLS while
# the normalized model was trained on ALL_COLS, and the table below labels both
# with ALL_COLS. Fail loudly instead of printing a silently mislabelled table
# if the two column orders ever diverge.
assert list(FEATURE_COLS) == list(ALL_COLS), (
    "FEATURE_COLS and ALL_COLS differ; coefficients of the raw and normalized "
    "models cannot be shown side by side under the same feature names"
)

# Header and intercept row
print(f"{'Feature':<15}{'Raw Features':>15}{'Normalized Features':>20}")
print(f"{'Intercept':<15}{model_lr_model.intercept:>15.2f}{model_lr_normalized_model.intercept:>20.2f}")

# One row per feature: name, raw-feature coefficient, normalized-feature coefficient
for i, feature_name in enumerate(ALL_COLS):
    raw_coef = model_lr_model.coefficients[i]
    normalized_coef = model_lr_normalized_model.coefficients[i]
    print(f"{feature_name:<15}{raw_coef:>15.2f}{normalized_coef:>20.2f}")

Feature           Raw Features Normalized Features
Intercept                 0.92             9986.72
Month                    -0.03               -0.29
DayofMonth                0.01                0.18
DepTimeT                  0.00                5.29
CRSDepTimeT              -0.01               -7.40
CRSArrTimeT               0.00                0.99
DepDelay                  1.02               36.59
CRSElapsedTime           -0.01               -0.82
PunctualCarrier           0.18            -9975.59
AverageCarrier           -0.18            -9975.94


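Based on the LR using the **raw features**, we can see that one additional minute of `DepDelay` corresponds to about one additional minute of `ArrDelay` (coefficient 1.02).
The coefficients of the Linear Regression with **normalized features** tell us that, among the scaled features, `DepDelay` has by far the strongest influence on `ArrDelay`. (This coincides with our findings from the correlation matrix, as both consider linear relationships.)
Note that the very large, nearly identical coefficients of `PunctualCarrier` and `AverageCarrier` (≈ -9975) are almost exactly cancelled by the intercept (≈ 9987). They are most likely an artifact of collinearity between the one-hot carrier columns and the intercept (compare the "singular covariance matrix" warning emitted while fitting the raw-feature model) and should not be interpreted as feature importance.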

In [18]:
# Score the held-out test set with the raw-feature linear regression pipeline
test_data_lr = test_data.select(*FEATURE_COLS, "label")
predictions = model_lr.transform(test_data_lr)

# Compute RMSE, MAE and R² with one evaluator, overriding the metric per call
evaluator = RegressionEvaluator()
metrics = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("rmse", "mae", "r2")
}
rmse, mae, r2 = metrics["rmse"], metrics["mae"], metrics["r2"]

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

[Stage 31:>                                                         (0 + 1) / 1]

RMSE: 14.2144
MAE: 9.4322
R²: 0.8695


                                                                                

To evaluate our Linear Regression model, we compute three metrics: Root Mean Squared Error (**RMSE**), Mean Absolute Error (**MAE**), and the coefficient of determination (**R²**).

The RMSE is the most common evaluation metric for regression models; because the errors are squared, it penalizes large errors more heavily than small ones. We will use it later to compare the performance of our models and choose the best-performing one.

The MAE is more robust to outliers. Our LR model's predictions are off by 9.43 minutes on average.
R² measures the proportion of the variance in `ArrDelay` that is explained by our model—about 87%; the remaining variance is not captured by the model.

# Decision Tree

In [15]:
# Input columns for the decision tree (unscaled)
FEATURE_COLS = [
    "Month",
    "DayofMonth",
    "DepTimeT",
    "CRSDepTimeT",
    "CRSArrTimeT",
    "DepDelay",
    "CRSElapsedTime",
    "PunctualCarrier",
    "AverageCarrier",
]

# Two-stage pipeline: assemble the feature vector, then fit a regression tree
vector_assembler = VectorAssembler(
    inputCols=FEATURE_COLS,
    outputCol="features",
)
dt = DecisionTreeRegressor()
pipeline_dt = Pipeline(stages=[vector_assembler, dt])

# Restrict the training frame to the feature columns and the target
train_data_dt = train_data.select(*FEATURE_COLS, "label")
model_dt = pipeline_dt.fit(train_data_dt)

24/12/17 21:48:17 WARN MemoryStore: Not enough space to cache rdd_110_0 in memory! (computed 105.0 MiB so far)
24/12/17 21:48:17 WARN BlockManager: Persisting block rdd_110_0 to disk instead.
24/12/17 21:48:24 WARN MemoryStore: Not enough space to cache rdd_110_0 in memory! (computed 249.6 MiB so far)
24/12/17 21:48:26 WARN MemoryStore: Not enough space to cache rdd_110_0 in memory! (computed 249.6 MiB so far)
24/12/17 21:48:28 WARN MemoryStore: Not enough space to cache rdd_110_0 in memory! (computed 249.6 MiB so far)
24/12/17 21:48:31 WARN MemoryStore: Not enough space to cache rdd_110_0 in memory! (computed 249.6 MiB so far)
24/12/17 21:48:33 WARN MemoryStore: Not enough space to cache rdd_110_0 in memory! (computed 249.6 MiB so far)
                                                                                

In [None]:
# The fitted tree is the last stage of the pipeline; print its textual description
tree_model = model_dt.stages[-1]
print(tree_model.toDebugString)

In [None]:
# Score the held-out test set with the decision tree pipeline
test_data_dt = test_data.select(*FEATURE_COLS, "label")
predictions = model_dt.transform(test_data_dt)

# Compute RMSE, MAE and R² with one evaluator, overriding the metric per call
evaluator = RegressionEvaluator()
metrics = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("rmse", "mae", "r2")
}
rmse, mae, r2 = metrics["rmse"], metrics["mae"], metrics["r2"]

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

[Stage 57:>                                                         (0 + 1) / 1]

RMSE: 19.2007
MAE: 10.7199
R²: 0.7619


                                                                                

# Random Forest

In [24]:
# Input columns for the random forest (unscaled)
FEATURE_COLS = ["Month", "DayofMonth", "DepTimeT", "CRSDepTimeT", "CRSArrTimeT", "DepDelay", "CRSElapsedTime", "PunctualCarrier", "AverageCarrier"]

# A random forest is a stochastic estimator. Pin the seed so that
# "Restart Kernel -> Run All" reproduces the same model and metrics.
# The value matches the seed already used for the train/test split.
# NOTE(review): without an explicit seed PySpark falls back to a default that
# is presumably derived from a string hash and may change between Python
# processes - confirm for the installed version.
# NOTE(review): metrics recorded from an earlier, unseeded run may differ
# slightly from what this seeded model produces.
RF_SEED = 3

# Two-stage pipeline: assemble the feature vector, then fit the forest
vector_assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
rf = RandomForestRegressor(seed=RF_SEED)
pipeline_rf = Pipeline(stages=[vector_assembler, rf])

# Restrict the training frame to the feature columns and the target
train_data_rf = train_data.select(*(FEATURE_COLS + ["label"]))
model_rf = pipeline_rf.fit(train_data_rf)

24/12/17 21:59:34 WARN MemoryStore: Not enough space to cache rdd_271_0 in memory! (computed 93.8 MiB so far)
24/12/17 21:59:34 WARN BlockManager: Persisting block rdd_271_0 to disk instead.
24/12/17 21:59:47 WARN MemoryStore: Not enough space to cache rdd_271_0 in memory! (computed 316.5 MiB so far)
24/12/17 21:59:54 WARN MemoryStore: Not enough space to cache rdd_271_0 in memory! (computed 316.5 MiB so far)
24/12/17 22:00:03 WARN MemoryStore: Not enough space to cache rdd_271_0 in memory! (computed 316.5 MiB so far)
24/12/17 22:00:15 WARN MemoryStore: Not enough space to cache rdd_271_0 in memory! (computed 316.5 MiB so far)
24/12/17 22:00:27 WARN MemoryStore: Not enough space to cache rdd_271_0 in memory! (computed 316.5 MiB so far)
                                                                                

In [26]:
# Score the held-out test set with the random forest pipeline
test_data_rf = test_data.select(*FEATURE_COLS, "label")
predictions = model_rf.transform(test_data_rf)

# Compute RMSE, MAE and R² with one evaluator, overriding the metric per call
evaluator = RegressionEvaluator()
metrics = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("rmse", "mae", "r2")
}
rmse, mae, r2 = metrics["rmse"], metrics["mae"], metrics["r2"]

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

[Stage 77:>                                                         (0 + 1) / 1]

RMSE: 20.6262
MAE: 12.1371
R²: 0.7252


                                                                                

# Gradient-boosted Trees

In [28]:
# Input columns for the gradient-boosted trees (unscaled)
FEATURE_COLS = ["Month", "DayofMonth", "DepTimeT", "CRSDepTimeT", "CRSArrTimeT", "DepDelay", "CRSElapsedTime", "PunctualCarrier", "AverageCarrier"]

# Two-stage pipeline: assemble the feature vector, then fit the boosted ensemble
vector_assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
gbt = GBTRegressor()
pipeline_gbt = Pipeline(stages=[vector_assembler, gbt])

# Restrict the training frame to the feature columns and the target
train_data_gbt = train_data.select(*(FEATURE_COLS + ["label"]))

# Fit on the frame prepared for this model. The cell previously passed
# `train_data_rf`, which left `train_data_gbt` unused and made this cell
# fail on a fresh kernel unless the Random Forest cell had been run first.
model_gbt = pipeline_gbt.fit(train_data_gbt)

24/12/17 22:48:18 WARN MemoryStore: Not enough space to cache rdd_360_0 in memory! (computed 104.8 MiB so far)
24/12/17 22:48:18 WARN BlockManager: Persisting block rdd_360_0 to disk instead.
24/12/17 22:48:24 WARN MemoryStore: Not enough space to cache rdd_360_0 in memory! (computed 353.7 MiB so far)
24/12/17 22:48:26 WARN MemoryStore: Not enough space to cache rdd_360_0 in memory! (computed 8.4 MiB so far)
24/12/17 22:48:26 WARN MemoryStore: Not enough space to cache rdd_362_0 in memory! (computed 16.0 MiB so far)
24/12/17 22:48:26 WARN BlockManager: Persisting block rdd_362_0 to disk instead.
24/12/17 22:48:27 WARN MemoryStore: Not enough space to cache rdd_362_0 in memory! (computed 16.0 MiB so far)
24/12/17 22:48:30 WARN MemoryStore: Not enough space to cache rdd_360_0 in memory! (computed 353.7 MiB so far)
24/12/17 22:48:30 WARN MemoryStore: Not enough space to cache rdd_362_0 in memory! (computed 16.0 MiB so far)
24/12/17 22:48:33 WARN MemoryStore: Not enough space to cache rdd_

In [29]:
# Score the held-out test set with the gradient-boosted trees pipeline
test_data_gbt = test_data.select(*FEATURE_COLS, "label")
predictions = model_gbt.transform(test_data_gbt)

# Compute RMSE, MAE and R² with one evaluator, overriding the metric per call
evaluator = RegressionEvaluator()
metrics = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("rmse", "mae", "r2")
}
rmse, mae, r2 = metrics["rmse"], metrics["mae"], metrics["r2"]

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

[Stage 284:>                                                        (0 + 1) / 1]

RMSE: 16.9685
MAE: 10.1631
R²: 0.8141


                                                                                