In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Spark configuration for this notebook: a single local master, with the local
# filesystem as the default Hadoop filesystem for paths given without a scheme.
conf = (
    SparkConf()
    .setAppName("ML Data Preparation")
    .setMaster("local")
    .set("spark.hadoop.fs.defaultFS", "file:///")
)

# Build the session from the same conf rather than from a second builder that
# carries its own application name, so the context and the session agree on
# one configuration. `sc` is the context owned by that session.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Set the log level once. Setting "ERROR" and then "WARN" on the same context
# leaves "WARN" in effect, so only that call is kept.
sc.setLogLevel("WARN")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/15 23:37:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the Parquet data stored under data_processed_job/2007 into a Spark DataFrame.
dataframe = spark.read.format("parquet").load("data_processed_job/2007")

In [3]:
# Show column names, types and nullability of the loaded DataFrame.
dataframe.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- DepTimeT: integer (nullable = true)
 |-- CRSDepTimeT: integer (nullable = true)
 |-- CRSArrTimeT: integer (nullable = true)
 |-- PunctualCarrier: integer (nullable = true)
 |-- AverageCarrier: integer (nullable = true)



In [4]:
# Feature columns, grouped by the preprocessing each group receives below.
TARGET_COL = "ArrDelay"
MINMAX_COLS = [
    "Month",
    "DayofMonth",
    "DepTimeT",
    "CRSDepTimeT",
    "CRSArrTimeT",
]
STANDARD_COLS = [
    "DepDelay",
    "CRSElapsedTime",
]
# Note: no encoder stage is applied to these columns in this pipeline; they are
# appended to the final feature vector unchanged.
ONE_HOT_COLS = [
    "PunctualCarrier",
    "AverageCarrier",
]
ALL_COLS = [*MINMAX_COLS, *STANDARD_COLS, *ONE_HOT_COLS]

# ++++ Pipeline stages ++++
# Branch 1: assemble the min-max group into one vector, then rescale it.
minmax_assembler = VectorAssembler(
    inputCols=MINMAX_COLS,
    outputCol="minmax_features",
)
minmax_scaler = MinMaxScaler(
    inputCol="minmax_features",
    outputCol="scaled_minmax_features",
)

# Branch 2: assemble the standardisation group, then centre and scale it.
standard_assembler = VectorAssembler(
    inputCols=STANDARD_COLS,
    outputCol="standard_features",
)
standard_scaler = StandardScaler(
    inputCol="standard_features",
    outputCol="scaled_standard_features",
    withMean=True,
    withStd=True,
)

# Concatenate both scaled vectors and the remaining columns. The order here
# (min-max group, standard group, ONE_HOT_COLS) is the same order as ALL_COLS.
final_assembler = VectorAssembler(
    inputCols=["scaled_minmax_features", "scaled_standard_features", *ONE_HOT_COLS],
    outputCol="features",
)

lr = LinearRegression(featuresCol="features", labelCol="label")

# ++++ Pipeline ++++
pipeline = Pipeline(
    stages=[
        minmax_assembler,
        minmax_scaler,
        standard_assembler,
        standard_scaler,
        final_assembler,
        lr,
    ]
)

# ++++ Data ++++
# Keep only the feature columns plus the target, exposing the target as "label".
# No null filtering is applied here.
data = dataframe.select(*ALL_COLS, TARGET_COL).withColumnRenamed(TARGET_COL, "label")

# ++++ Split and fit ++++
# 80/20 split with a fixed seed; the pipeline is fitted on the training part only.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1000)
model = pipeline.fit(train_data)

24/12/15 23:38:10 WARN Instrumentation: [128b613a] regParam is zero, which might cause numerical instability and overfitting.
24/12/15 23:38:24 WARN Instrumentation: [128b613a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

Display the coefficients learned by the model.

In [5]:
# The fitted regression is the last stage of the fitted pipeline.
lr_model = model.stages[-1]

# Pair every coefficient with its column name; the coefficient order follows ALL_COLS.
rows = [("Intercept", lr_model.intercept)]
rows += [(ALL_COLS[idx], weight) for idx, weight in enumerate(lr_model.coefficients)]

print(f"{'Feature':<15}{'Coefficient':>10}")
for name, value in rows:
    print(f"{name:<15}{value:>10.2f}")

Feature        Coefficient
Intercept           10.94
Month               -0.31
DayofMonth           0.19
DepTimeT             5.41
CRSDepTimeT         -7.52
CRSArrTimeT          1.02
DepDelay            36.57
CRSElapsedTime      -0.81
PunctualCarrier      0.18
AverageCarrier      -0.18


Repeat the fit and the coefficient listing, but this time without scaling the features.

In [6]:
# Baseline without scaling: the raw columns go straight into one feature vector.
# Note: this cell rebinds lr, pipeline, data, train_data, test_data and model,
# replacing the objects created for the scaled run.
TARGET_COL = "ArrDelay"
FEATURE_COLS = [
    "Month",
    "DayofMonth",
    "DepTimeT",
    "CRSDepTimeT",
    "CRSArrTimeT",
    "DepDelay",
    "CRSElapsedTime",
    "PunctualCarrier",
    "AverageCarrier",
]

vector_assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
lr = LinearRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[vector_assembler, lr])

# Keep only the feature columns plus the target, exposing the target as "label".
data = dataframe.select(*FEATURE_COLS, TARGET_COL).withColumnRenamed(TARGET_COL, "label")

# Same 80/20 weights and seed as the scaled run.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1000)
model = pipeline.fit(train_data)

24/12/15 23:40:40 WARN Instrumentation: [efd37cfa] regParam is zero, which might cause numerical instability and overfitting.
24/12/15 23:40:49 WARN Instrumentation: [efd37cfa] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [7]:
# The fitted regression is the last stage of the fitted pipeline.
lr_model = model.stages[-1]

# Pair every coefficient with its column name; the coefficient order follows FEATURE_COLS.
rows = [("Intercept", lr_model.intercept)]
rows += [(FEATURE_COLS[idx], weight) for idx, weight in enumerate(lr_model.coefficients)]

print(f"{'Feature':<15}{'Coefficient':>10}")
for name, value in rows:
    print(f"{name:<15}{value:>10.2f}")

Feature        Coefficient
Intercept            0.91
Month               -0.03
DayofMonth           0.01
DepTimeT             0.00
CRSDepTimeT         -0.01
CRSArrTimeT          0.00
DepDelay             1.02
CRSElapsedTime      -0.01
PunctualCarrier      0.18
AverageCarrier      -0.18


In [7]:
# TODO: Interpretation of results

In [8]:
# TODO: Evaluate the model with RMSE, R^2