## Importing the relevant libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression, FMRegressor, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor, GeneralizedLinearRegression, IsotonicRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml.pipeline import Pipeline
from pyspark.sql.functions import hour, month, year, day, minute, second, weekday, weekofyear, dayofweek, dayofmonth, dayofyear, col, corr, format_number, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

In [0]:
spark = SparkSession.builder.appName("steel_energy_prediction").getOrCreate() # Initiate a Spark session

## Loading the dataset

In [0]:
data = spark.read.csv("file:/Workspace/Users/n01606417@humber.ca/Steel_industry_data.csv",inferSchema=True,header=True)
data.printSchema()

root
 |-- date: string (nullable = true)
 |-- Usage_kWh: double (nullable = true)
 |-- Lagging_Current_Reactive.Power_kVarh: double (nullable = true)
 |-- Leading_Current_Reactive_Power_kVarh: double (nullable = true)
 |-- CO2(tCO2): double (nullable = true)
 |-- Lagging_Current_Power_Factor: double (nullable = true)
 |-- Leading_Current_Power_Factor: double (nullable = true)
 |-- NSM: integer (nullable = true)
 |-- WeekStatus: string (nullable = true)
 |-- Day_of_week: string (nullable = true)
 |-- Load_Type: string (nullable = true)



In [0]:
data.count()

35040

In [0]:
data.show(5)

+----------------+---------+------------------------------------+------------------------------------+---------+----------------------------+----------------------------+----+----------+-----------+----------+
|            date|Usage_kWh|Lagging_Current_Reactive.Power_kVarh|Leading_Current_Reactive_Power_kVarh|CO2(tCO2)|Lagging_Current_Power_Factor|Leading_Current_Power_Factor| NSM|WeekStatus|Day_of_week| Load_Type|
+----------------+---------+------------------------------------+------------------------------------+---------+----------------------------+----------------------------+----+----------+-----------+----------+
|01/01/2018 00:15|     3.17|                                2.95|                                 0.0|      0.0|                       73.21|                       100.0| 900|   Weekday|     Monday|Light_Load|
|01/01/2018 00:30|      4.0|                                4.46|                                 0.0|      0.0|                       66.77|                   

## Exploratory Data Analysis (EDA)

In [0]:
data.groupBy("Load_Type").count().show()

+------------+-----+
|   Load_Type|count|
+------------+-----+
| Medium_Load| 9696|
|Maximum_Load| 7272|
|  Light_Load|18072|
+------------+-----+



In [0]:
data.groupBy("Day_of_week").count().show()

+-----------+-----+
|Day_of_week|count|
+-----------+-----+
|  Wednesday| 4992|
|    Tuesday| 4992|
|     Friday| 4992|
|   Thursday| 4992|
|   Saturday| 4992|
|     Monday| 5088|
|     Sunday| 4992|
+-----------+-----+



In [0]:
data.groupBy("WeekStatus").count().show()

+----------+-----+
|WeekStatus|count|
+----------+-----+
|   Weekday|25056|
|   Weekend| 9984|
+----------+-----+



In [0]:
data.groupBy("Day_of_week").agg({"Usage_kWh": "avg"}).orderBy("avg(Usage_kWh)",ascending=False).withColumnRenamed("avg(Usage_kWh)","avg_energy_consumption").select(["Day_of_week",format_number("avg_energy_consumption",2).alias("avg_energy_consumption")]).show()

+-----------+----------------------+
|Day_of_week|avg_energy_consumption|
+-----------+----------------------+
|   Thursday|                 35.11|
|    Tuesday|                 34.43|
|     Friday|                 33.20|
|     Monday|                 33.14|
|  Wednesday|                 32.25|
|   Saturday|                 15.92|
|     Sunday|                  7.55|
+-----------+----------------------+



Thursday has the highest average energy consumption while Sunday has the lowest average energy consumption.

In [0]:
data.groupBy("Load_Type").agg({"Usage_kWh": "avg"}).orderBy("avg(Usage_kWh)",ascending=False).withColumnRenamed("avg(Usage_kWh)","avg_energy_consumption").select(["Load_Type",format_number("avg_energy_consumption",2).alias("avg_energy_consumption")]).show()

+------------+----------------------+
|   Load_Type|avg_energy_consumption|
+------------+----------------------+
|Maximum_Load|                 59.27|
| Medium_Load|                 38.45|
|  Light_Load|                  8.63|
+------------+----------------------+



As expected, maximum load has the highest energy consumption (in kWh) whereas light load has the least energy load.

In [0]:
data.groupBy("WeekStatus").agg({"Usage_kWh": "avg"}).orderBy("avg(Usage_kWh)",ascending=False).withColumnRenamed("avg(Usage_kWh)","avg_energy_consumption").select(["WeekStatus",format_number("avg_energy_consumption",2).alias("avg_energy_consumption")]).show()

+----------+----------------------+
|WeekStatus|avg_energy_consumption|
+----------+----------------------+
|   Weekday|                 33.62|
|   Weekend|                 11.73|
+----------+----------------------+



Average energy consumed is significantly higher on weekdays as compared to the weekends.

In [0]:
data.groupBy("Load_Type") \
    .agg({"NSM": "avg"}) \
    .orderBy("avg(NSM)", ascending=False) \
    .withColumnRenamed("avg(NSM)", "avg_nsm") \
    .select(
        "Load_Type",
        format_number("avg_nsm", 2).alias("avg_nsm")
    ) \
    .show()

+------------+---------+
|   Load_Type|  avg_nsm|
+------------+---------+
| Medium_Load|61,810.40|
|Maximum_Load|53,036.14|
|  Light_Load|28,384.66|
+------------+---------+



Medium Load has the maximum NSM value while Light Load has the minimum NSM value.

## Data Visualization

In [0]:
data.createOrReplaceTempView("steel_energy") # Create a temporary view of the Spark dataframe

In [0]:
%sql
select * from steel_energy;

date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
01/01/2018 00:15,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load
01/01/2018 01:30,3.28,3.56,0.0,0.0,67.76,100.0,5400,Weekday,Monday,Light_Load
01/01/2018 01:45,3.6,4.14,0.0,0.0,65.62,100.0,6300,Weekday,Monday,Light_Load
01/01/2018 02:00,3.6,4.28,0.0,0.0,64.37,100.0,7200,Weekday,Monday,Light_Load
01/01/2018 02:15,3.28,3.64,0.0,0.0,66.94,100.0,8100,Weekday,Monday,Light_Load
01/01/2018 02:30,3.78,4.72,0.0,0.0,62.51,100.0,9000,Weekday,Monday,Light_Load


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

The target feature "Usage_kWh" has a highly right skewed distribution. In addition, the feature "CO2" has a right skewed distribution as well.

NSM (Near-Surface Mounted) is considerably higher on weekdays in comparison to the weekends.

In [0]:
data.select(corr("CO2(tCO2)","Usage_kWh")).show()

+--------------------------+
|corr(CO2(tCO2), Usage_kWh)|
+--------------------------+
|        0.9881797716789519|
+--------------------------+



There is a substantial positive correlation between C02 and energy consumption(in kWh).

In [0]:
# Rename the columns for better readability
cols_to_be_renamed = {
    "Lagging_Current_Reactive.Power_kVarh": "Lagging_Current_Reactive_Power_kVarh",
    "CO2(tCO2)": "CO2"
}

for key, val in cols_to_be_renamed.items():
    data = data.withColumnRenamed(key,val)

## Splitting the dataset into train and test sets

In [0]:
train, test = data.randomSplit([0.75,0.25],seed=64)

In [0]:
train.count(), test.count()

(26247, 8793)

## Creating an end-to-end feature engineering and machine learning pipeline

In [0]:
day_of_week_indexer = StringIndexer(inputCol='Day_of_week',outputCol='Day_of_week_index')
load_type_indexer = StringIndexer(inputCol='Load_Type',outputCol='Load_Type_index')
week_status_indexer = StringIndexer(inputCol='WeekStatus',outputCol='WeekStatus_index')
assembler = VectorAssembler(inputCols=["Lagging_Current_Reactive_Power_kVarh","Leading_Current_Reactive_Power_kVarh","CO2",
"Lagging_Current_Power_Factor","Leading_Current_Power_Factor","NSM","Day_of_week_index","Load_Type_index","WeekStatus_index"], outputCol="features", handleInvalid="skip")
scaler = StandardScaler(inputCol="features",outputCol="scaledFeatures")
lr = LinearRegression(featuresCol="scaledFeatures",labelCol="Usage_kWh")

In [0]:
pipeline = Pipeline(stages=[
    day_of_week_indexer,load_type_indexer,week_status_indexer,assembler,scaler,lr
])
pipeline

Pipeline_fc85af8c15e8

## Model Training & Evaluation

In [0]:
fitted_pipelines = []
model_names = []
r2_scores = []
rmse_scores = []
mae_scores = []
mse_scores = []
explained_variance_scores = []

In [0]:
def data_prep_ml_pipeline(model: Pipeline) -> None:
    model_names.append(str(model).split('(')[0])
    day_of_week_indexer = StringIndexer(inputCol='Day_of_week',outputCol='Day_of_week_index')
    load_type_indexer = StringIndexer(inputCol='Load_Type',outputCol='Load_Type_index')
    week_status_indexer = StringIndexer(inputCol='WeekStatus',outputCol='WeekStatus_index')
    assembler = VectorAssembler(inputCols=["Lagging_Current_Reactive_Power_kVarh","Leading_Current_Reactive_Power_kVarh","CO2",
    "Lagging_Current_Power_Factor","Leading_Current_Power_Factor","NSM","Day_of_week_index","Load_Type_index","WeekStatus_index"], outputCol="features", handleInvalid="skip")
    scaler = StandardScaler(inputCol="features",outputCol="scaledFeatures")
    pipeline = Pipeline(stages=[
        day_of_week_indexer,load_type_indexer,week_status_indexer,assembler,scaler,model
    ])
    model = pipeline.fit(train)
    predictions = model.transform(test)
    r2_eval = RegressionEvaluator(labelCol="Usage_kWh", predictionCol="prediction", metricName="r2")
    rmse_eval = RegressionEvaluator(labelCol="Usage_kWh", predictionCol="prediction", metricName="rmse")
    mae_eval = RegressionEvaluator(labelCol="Usage_kWh", predictionCol="prediction", metricName="mae")
    mse_eval = RegressionEvaluator(labelCol="Usage_kWh", predictionCol="prediction", metricName="mse")
    explained_var_eval = RegressionEvaluator(labelCol="Usage_kWh", predictionCol="prediction", metricName="var")
    r2 = r2_eval.evaluate(predictions)
    rmse = rmse_eval.evaluate(predictions)
    mae = mae_eval.evaluate(predictions)
    mse = mse_eval.evaluate(predictions)
    explained_var = explained_var_eval.evaluate(predictions)
    print("R2: %f" % r2)
    print("RMSE: %f" % rmse)
    print("MAE: %f" % mae)
    print("MSE: %f" % mse)
    print("Explained Variance: %f" % explained_var)
    fitted_pipelines.append(pipeline)
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)
    mse_scores.append(mse)
    explained_variance_scores.append(explained_var)

In [0]:
data_prep_ml_pipeline(LinearRegression(labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.978373
RMSE: 4.906425
MAE: 2.576966
MSE: 24.073003
Explained Variance: 1089.282180


In [0]:
data_prep_ml_pipeline(DecisionTreeRegressor(labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.987733
RMSE: 3.695226
MAE: 2.271536
MSE: 13.654693
Explained Variance: 1098.042644


In [0]:
data_prep_ml_pipeline(RandomForestRegressor(labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.977120
RMSE: 5.046593
MAE: 2.682973
MSE: 25.468103
Explained Variance: 1020.027518


In [0]:
data_prep_ml_pipeline(GBTRegressor(labelCol='Usage_kWh',featuresCol='scaledFeatures',predictionCol='prediction'))

R2: 0.992671
RMSE: 2.856312
MAE: 1.675349
MSE: 8.158519
Explained Variance: 1105.551382


In [0]:
data_prep_ml_pipeline(FMRegressor(labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.892570
RMSE: 10.935348
MAE: 8.589749
MSE: 119.581833
Explained Variance: 1300.018550


In [0]:
data_prep_ml_pipeline(GeneralizedLinearRegression(family='poisson',labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.944979
RMSE: 7.825857
MAE: 4.183788
MSE: 61.244035
Explained Variance: 1164.430605


In [0]:
data_prep_ml_pipeline(GeneralizedLinearRegression(family='tweedie',labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.978373
RMSE: 4.906425
MAE: 2.576966
MSE: 24.073003
Explained Variance: 1089.282180


In [0]:
data_prep_ml_pipeline(IsotonicRegression(labelCol='Usage_kWh',featuresCol='scaledFeatures'))

R2: 0.850908
RMSE: 12.882372
MAE: 8.059192
MSE: 165.955511
Explained Variance: 963.138828


## Baseline Models Performance Comparison

In [0]:
schema = StructType([
    StructField("Model", StringType(), True),
    StructField("R2", DoubleType(), True),
    StructField("MAE", DoubleType(), True),
    StructField("MSE", DoubleType(), True),
    StructField("RMSE", DoubleType(), True),
    StructField("Explained Variance", DoubleType(), True)
])

model_perfs = spark.createDataFrame([
    {'Model': model_names[i],
     'Pipeline': fitted_pipelines[i],
     'R2': r2_scores[i],
     'MAE': mae_scores[i],
     'MSE': mse_scores[i],
     'RMSE': rmse_scores[i],
     'Explained Variance': explained_variance_scores[i]}
    for i in range(len(fitted_pipelines))
], schema).orderBy('R2', ascending=False)

display(model_perfs)

Model,R2,MAE,MSE,RMSE,Explained Variance
GBTRegressor_7a4fda97ea1f,0.9926705167089124,1.6753489485535518,8.158519158796077,2.856312160600812,1105.5513824725174
DecisionTreeRegressor_af69e3515ef7,0.9877328416123976,2.271535957228031,13.654693346656387,3.695225750432088,1098.042644180742
GeneralizedLinearRegression_9354a937f47c,0.9783731987663332,2.576966078672984,24.07300285722692,4.906424651131098,1089.2821796010123
LinearRegression_f0b892982909,0.9783731987663332,2.576966078672984,24.07300285722692,4.906424651131098,1089.2821796010123
RandomForestRegressor_8a639155b428,0.9771198627076282,2.682973440584988,25.468103417697343,5.04659324868741,1020.0275176786478
GeneralizedLinearRegression_b9c0ea47a022,0.9449793374890748,4.183788207783968,61.2440347290067,7.825856804785448,1164.4306053175485
FMRegressor_7f60ad90a0d4,0.8925695909598338,8.589749364327886,119.5818334775757,10.935347890102795,1300.0185500435653
IsotonicRegression_e0b273559e70,0.850908220091087,8.059192051249958,165.95551070905063,12.882372091701535,963.1388280997936


The GBT regressor model has emerged as the best performing model having achieved a remarkable r2 score of more than 99% on the test set.

## Hyperparameter Tuning & Cross Validation

In [0]:
def tune_hyperparameters(model,param_grid):
    day_of_week_indexer = StringIndexer(inputCol='Day_of_week',outputCol='Day_of_week_index')
    load_type_indexer = StringIndexer(inputCol='Load_Type',outputCol='Load_Type_index')
    week_status_indexer = StringIndexer(inputCol='WeekStatus',outputCol='WeekStatus_index')
    assembler = VectorAssembler(inputCols=["Lagging_Current_Reactive_Power_kVarh","Leading_Current_Reactive_Power_kVarh","CO2",
    "Lagging_Current_Power_Factor","Leading_Current_Power_Factor","NSM","Day_of_week_index","Load_Type_index","WeekStatus_index"], outputCol="features", handleInvalid="skip")
    scaler = StandardScaler(inputCol="features",outputCol="scaledFeatures")
    pipeline = Pipeline(stages=[
        day_of_week_indexer,load_type_indexer,week_status_indexer,assembler,scaler,model
    ])
    train_val_split = TrainValidationSplit(estimator=pipeline,evaluator=RegressionEvaluator().setMetricName("r2").setLabelCol("Usage_kWh"),estimatorParamMaps=param_grid,trainRatio=0.8)
    optimized_model = train_val_split.fit(train)
    predictions = optimized_model.transform(test)
    r2 = RegressionEvaluator(metricName="r2",labelCol="Usage_kWh").evaluate(predictions)
    print("R2 score on test data = %g" % r2)
    rmse = RegressionEvaluator(metricName="rmse",labelCol="Usage_kWh").evaluate(predictions)
    print("RMSE on test data = %g" % rmse)
    mae = RegressionEvaluator(metricName="mae",labelCol="Usage_kWh").evaluate(predictions)
    print("MAE on test data = %g" % mae)
    mse = RegressionEvaluator(metricName="mse",labelCol="Usage_kWh").evaluate(predictions)
    print("MSE on test data = %g" % mse)
    explained_variance = RegressionEvaluator(metricName="var",labelCol="Usage_kWh").evaluate(predictions)
    print("Explained Variance on test data = %g" % explained_variance)
    fitted_pipelines.append(pipeline)
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)
    mse_scores.append(mse)
    explained_variance_scores.append(explained_variance)
    model_names.append(str(model).split("(")[0])

In [0]:
lr_param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.fitIntercept, [True, False]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

tune_hyperparameters(LinearRegression(labelCol='Usage_kWh',featuresCol='scaledFeatures'),lr_param_grid)

R2 score on test data = 0.978373
RMSE on test data = 4.90642
MAE on test data = 2.57697
MSE on test data = 24.073
Explained Variance on test data = 1089.28


In [0]:
dt = DecisionTreeRegressor(labelCol="Usage_kWh",featuresCol="scaledFeatures")
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2, 5, 10]) \
    .addGrid(dt.maxBins, [32, 64, 128]) \
    .build()

tune_hyperparameters(dt,dt_param_grid)

R2 score on test data = 0.994916
RMSE on test data = 2.37891
MAE on test data = 1.24336
MSE on test data = 5.65923
Explained Variance on test data = 1109.28


In [0]:
rf = RandomForestRegressor(labelCol="Usage_kWh",featuresCol="scaledFeatures")
rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 50]) \
    .addGrid(rf.featureSubsetStrategy, ['all','auto','onethird','sqrt','log2']) \
    .build()

tune_hyperparameters(rf,rf_param_grid)

R2 score on test data = 0.988212
RMSE on test data = 3.62238
MAE on test data = 2.2365
MSE on test data = 13.1216
Explained Variance on test data = 1097.88


In [0]:
fm = FMRegressor(labelCol="Usage_kWh",featuresCol="scaledFeatures")
fm_param_grid = ParamGridBuilder() \
    .addGrid(fm.stepSize, [0.001, 0.01, 0.1]) \
    .addGrid(fm.factorSize, [4, 8, 16]) \
    .build()

tune_hyperparameters(fm,fm_param_grid)

R2 score on test data = 0.981118
RMSE on test data = 4.58453
MAE on test data = 2.46729
MSE on test data = 21.0179
Explained Variance on test data = 1093


In [0]:
iso = IsotonicRegression(labelCol="Usage_kWh",featuresCol="scaledFeatures")
iso_param_grid = ParamGridBuilder() \
    .addGrid(iso.isotonic, [True, False]) \
    .build()

tune_hyperparameters(iso,iso_param_grid)

R2 score on test data = 0.850908
RMSE on test data = 12.8824
MAE on test data = 8.05919
MSE on test data = 165.956
Explained Variance on test data = 963.139


In [0]:
poisson = GeneralizedLinearRegression(family="poisson",labelCol="Usage_kWh",featuresCol="scaledFeatures")
poisson_param_grid = ParamGridBuilder() \
    .addGrid(poisson.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(poisson.fitIntercept, [True, False]) \
    .build()

tune_hyperparameters(poisson,poisson_param_grid)

R2 score on test data = 0.941045
RMSE on test data = 8.10086
MAE on test data = 4.28933
MSE on test data = 65.6239
Explained Variance on test data = 1162.66


In [0]:
tweedie = GeneralizedLinearRegression(family="tweedie",labelCol="Usage_kWh",featuresCol="scaledFeatures")
tweedie_param_grid = ParamGridBuilder() \
    .addGrid(tweedie.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(tweedie.fitIntercept, [True, False]) \
    .addGrid(tweedie.variancePower, [0.0, 1.0]) \
    .build()

tune_hyperparameters(tweedie,tweedie_param_grid)

R2 score on test data = 0.978385
RMSE on test data = 4.90513
MAE on test data = 2.5797
MSE on test data = 24.0603
Explained Variance on test data = 1088.85


In [0]:
gbt = GBTRegressor(labelCol="Usage_kWh",featuresCol="scaledFeatures")
gbt_param_grid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10]) \
    .addGrid(gbt.maxIter, [10, 20]) \
    .build()

tune_hyperparameters(gbt,gbt_param_grid)

R2 score on test data = 0.994676
RMSE on test data = 2.4343
MAE on test data = 1.27242
MSE on test data = 5.9258
Explained Variance on test data = 1107.46


## Optimized Models Performance Comparison

In [0]:
schema = StructType([
    StructField("Model", StringType(), True),
    StructField("R2", DoubleType(), True),
    StructField("MAE", DoubleType(), True),
    StructField("MSE", DoubleType(), True),
    StructField("RMSE", DoubleType(), True),
    StructField("Explained Variance", DoubleType(), True)
])

model_perfs = spark.createDataFrame([
    {'Model': model_names[i],
     'Pipeline': fitted_pipelines[i],
     'R2': r2_scores[i],
     'MAE': mae_scores[i],
     'MSE': mse_scores[i],
     'RMSE': rmse_scores[i],
     'Explained Variance': explained_variance_scores[i]}
    for i in range(len(fitted_pipelines))
], schema).orderBy('R2', ascending=False)

display(model_perfs)

Model,R2,MAE,MSE,RMSE,Explained Variance
DecisionTreeRegressor_e1e5c4a8091c,0.9949158417836778,1.2433593799294902,5.659225973630857,2.37891277133712,1109.2840043115586
GBTRegressor_87604b67930b,0.9946763555968908,1.2724169114831372,5.925800378070141,2.434296690641907,1107.461385343401
GBTRegressor_7a4fda97ea1f,0.9926705167089124,1.6753489485535518,8.158519158796077,2.856312160600812,1105.5513824725174
RandomForestRegressor_1d10587cb71d,0.9882117388691766,2.2365042143052407,13.12162815101312,3.622378797284061,1097.8848322916235
DecisionTreeRegressor_af69e3515ef7,0.9877328416123976,2.271535957228031,13.654693346656387,3.695225750432088,1098.042644180742
FMRegressor_5cd738db193e,0.9811178155972472,2.467293530756744,21.017943160755063,4.584533036281346,1093.0004290335037
GeneralizedLinearRegression_3137d0545e70,0.9783846055391034,2.579695540472431,24.060305867481443,4.905130565793478,1088.8479419609746
GeneralizedLinearRegression_9354a937f47c,0.9783731987663332,2.576966078672984,24.07300285722692,4.906424651131098,1089.2821796010123
LinearRegression_f0b892982909,0.9783731987663332,2.576966078672984,24.07300285722692,4.906424651131098,1089.2821796010123
LinearRegression_3497948a239d,0.9783731987663332,2.576966078672984,24.07300285722692,4.906424651131098,1089.2821796010123


So, finally, after completing hyperparameter tuning, the Decision Tree Regressor has turned out to be the best performing model as it has achieved an incredible r2 score of almost 99.5% on the test set.

In [0]:
bestModel, bestModelPipeline = model_names[9], fitted_pipelines[9]

## Cross validating the performance of the best performing model

In [0]:
dt_evaluator = RegressionEvaluator(labelCol="Usage_kWh",metricName="r2")

dt_cv = CrossValidator(estimator=bestModelPipeline,
                       evaluator=dt_evaluator,
                       estimatorParamMaps=dt_param_grid,
                       numFolds=3)

dt_cv_model = dt_cv.fit(train)
predictions = dt_cv_model.transform(test)
print("Cross validation R2 score:", dt_evaluator.evaluate(predictions))

Cross validation R2 score: 0.9953278843713657


## Saving the best performing model for deployment

In [0]:
bestModelPipeline.save("file:/Workspace/Users/n01606417@humber.ca/steel_energy_prediction_pipeline")