In [0]:
# Checking for the FaultDataset CSV After Import
# Lists the DBFS entry for the uploaded file (path, name, size, modification time)
# to confirm the CSV is present before it is read.

dbutils.fs.ls("/FileStore/tables/FaultDataset.csv")

Out[1]: [FileInfo(path='dbfs:/FileStore/tables/FaultDataset.csv', name='FaultDataset.csv', size=1703184, modificationTime=1679908939000)]

In [0]:
# Importing MLFlow & Enabling Auto-Log for Machine Learning Experiments
# With autologging enabled, the pyspark.ml fit() calls in this notebook are each
# tracked as an MLflow run (hyperparameters, metrics, model artifacts); the run
# IDs are reported in the log lines printed by the fitting cells.

import mlflow

mlflow.pyspark.ml.autolog()

In [0]:
# Importing Required Libraries

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.feature import RFormula
from pyspark.sql.functions import *

In [0]:
# Loading the FaultDataset CSV Into a Spark Dataframe
# The first row provides the column names and the column types are inferred
# from the file contents.

df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/FileStore/tables/FaultDataset.csv")
)

In [0]:
# Displaying the Dataframe to Explore the Distribution
# NOTE(review): this cell only renders the dataframe as a table; the boxplot
# mentioned in the original heading is presumably configured through the
# Databricks visualization UI on top of this output - confirm.

df.display()

1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,fault_detected
0.3503125,0.3496875,0.35,0.3459375,0.3475,0.3459375,0.341875,0.3434375,0.355,0.3553125,0.3459375,0.3525,0.3575,0.3590625,0.35875,0.3484375,0.3590625,0.35,0.3559375,0.3490625,0
0.5090625,0.484375,0.046875,0.071875,0.06,0.0634375,0.0575,0.0546875,0.0559375,0.058125,0.0628125,0.065625,0.0640625,0.0634375,0.0534375,0.084375,0.0615625,0.05375,0.076875,0.056875,0
0.0928125,0.0975,0.1096875,0.1025,0.09625,0.1053125,0.09875,0.098125,0.091875,0.0909375,0.09875,0.103125,0.1,0.1034375,0.1015625,0.0978125,0.0990625,0.10375,0.098125,0.1040625,0
0.09375,0.089375,0.091875,0.0996875,0.0909375,0.096875,0.0940625,0.096875,0.096875,0.099375,0.099375,0.0959375,0.0959375,0.0940625,0.09125,0.0996875,0.09375,0.0934375,0.0971875,0.094375,0
0.036875,0.0440625,0.038125,0.0428125,0.0353125,0.0340625,0.033125,0.0403125,0.0346875,0.036875,0.035625,0.03625,0.0409375,0.039375,0.035,0.040625,0.0384375,0.036875,0.04,0.0371875,0
0.135625,0.3034375,0.13875,0.140625,0.126875,0.130625,0.139375,0.143125,0.1290625,0.140625,0.1340625,0.1396875,0.1384375,0.1453125,0.1453125,0.1496875,0.1440625,0.1359375,0.1453125,0.14625,0
0.3446875,0.35125,0.3353125,0.3471875,0.34625,0.348125,0.3478125,0.3521875,0.3525,0.35125,0.3571875,0.360625,0.3640625,0.36625,0.3640625,0.3634375,0.3475,0.35375,0.1575,0.351875,0
0.036875,0.035625,0.03125,0.0375,0.0390625,0.034375,0.0315625,0.031875,0.0378125,0.0321875,0.0371875,0.038125,0.035,0.0353125,0.0325,0.03,0.0325,0.0321875,0.0321875,0.03125,0
0.0371875,0.039375,0.033125,0.04,0.04125,0.03875,0.035625,0.0384375,0.0378125,0.0365625,0.033125,0.0365625,0.03375,0.034375,0.0346875,0.04125,0.0365625,0.035,0.034375,0.0396875,0
0.3590625,0.3609375,0.360625,0.3590625,0.355,0.365,0.355625,0.358125,0.3575,0.3578125,0.355625,0.3584375,0.3521875,0.3459375,0.3521875,0.3509375,0.3525,0.35625,0.353125,0.3540625,0


Output can only be rendered in Databricks

In [0]:
# Summary Exploration of Data Using Describe
# Shows count, mean, stddev, min and max for every column.

df.describe().display()

summary,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,fault_detected
count,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0,9292.0
mean,0.3416233049935422,0.3426311612139467,0.3421213812957383,0.3421390712440796,0.3428434405940584,0.3428279366659492,0.3427147008179088,0.3430657756672401,0.3431729915518722,0.3439251842983209,0.3441075320167876,0.3439842404756772,0.3441183275936292,0.3448787599547991,0.3448999475355137,0.3458342525828662,0.3456348875376668,0.3457525290572536,0.3465221427034015,0.346670724817048,0.5
stddev,0.2891948948626078,0.2890875372793958,0.2891642249061693,0.2891635633310729,0.2889646554403878,0.2890889899729543,0.2891948159883224,0.2891918560806545,0.2893401858067147,0.289011538534877,0.2892001448749587,0.2890708129465896,0.289118047014631,0.2889821392646809,0.2891314011350137,0.2888285654988746,0.2889204033670731,0.2891502814843134,0.2887705775702368,0.2890013554393105,0.5000269070362092
min,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.024375,0.025,0.025,0.025,0.024375,0.024375,0.024375,0.024375,0.025,0.0
max,1.0809375,1.2134375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.2134375,1.0809375,1.2134375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.0809375,1.0


In [0]:
# Checking for Missing Values
# Counts, per column, the rows whose value is null OR NaN. isnull() on its own
# does not flag NaN, which Spark stores as an ordinary (non-null) floating-point
# value, so NaN entries would otherwise be reported as "0 missing".
# The column name passed as the value of when() is a plain string, which when()
# treats as a literal: the expression is non-null only for the flagged rows, and
# count() counts exactly those.

df.select(
    [count(when(isnull(a) | isnan(a), a)).alias(a) for a in df.columns]
).show()

+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+--------------+
|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20|fault_detected|
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+--------------+
|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|             0|
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+--------------+



In [0]:
# Identifying Frequency Count for Fault or No Fault
# One row per fault_detected value with the number of samples in that class,
# used to check whether the classes are balanced.

class_counts = df.groupBy("fault_detected").count()
display(class_counts)

fault_detected,count
1,4646
0,4646


In [0]:
# Identifying the Data Types
# Prints the inferred schema: column names, their types and nullability.

df.printSchema()

root
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)
 |-- 13: double (nullable = true)
 |-- 14: double (nullable = true)
 |-- 15: double (nullable = true)
 |-- 16: double (nullable = true)
 |-- 17: double (nullable = true)
 |-- 18: double (nullable = true)
 |-- 19: double (nullable = true)
 |-- 20: double (nullable = true)
 |-- fault_detected: integer (nullable = true)



In [0]:
# Utilizing RFormula to Pre-Process & Transform the Dataframe
# "fault_detected ~ ." uses fault_detected as the label and every other column
# as a feature; the transform appends a "features" vector column and a "label"
# column to the dataframe.
#
# This cell overwrites `df` with its own output. To keep it safe to re-run, any
# "features"/"label" columns left over from a previous execution are dropped
# first (drop() is a no-op when the columns are absent); otherwise a second run
# would feed the previous output columns back into the formula.

preprocess = RFormula(formula = "fault_detected ~ .")
source_df = df.drop("features", "label")
df = preprocess.fit(source_df).transform(source_df)

df.show(5)

2023/03/27 09:57:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '768c251cd5ff4e80b57c817fe80d064e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+
|        1|        2|        3|        4|        5|        6|        7|        8|        9|       10|       11|       12|       13|       14|       15|       16|       17|       18|       19|       20|fault_detected|            features|label|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+
|0.3

In [0]:
# Splitting the Data Into Training (70%) & Test (30%) Sets
# A fixed seed is passed to the random split.

trainingData, testData = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [0]:
# Creating the Evaluator
# A single accuracy evaluator, comparing the "label" column with the
# "prediction" column, is shared by the models evaluated in this notebook.

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [0]:
# Creating & Training the Decision Tree Classifier

# Base Version - Obtained Standard: 95.67 / runs:/a518bde4641e43c59494a06b53a9dd34/model
# Parameters - Impurity: Gini, MaxBins: 32, MaxDepth: 5

# The classifier reads the "features" and "label" columns produced by RFormula
# and is fitted on the training split only; no tree hyperparameters are set here.
dt = (
    DecisionTreeClassifier()
    .setLabelCol("label")
    .setFeaturesCol("features")
)

model = dt.fit(trainingData)

2023/03/27 09:57:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a518bde4641e43c59494a06b53a9dd34', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
2023/03/27 09:57:36 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


In [0]:
# Creating the Predictions on the Test Data
# The fitted decision tree adds rawPrediction, probability and prediction
# columns to the held-out test split; the first 20 rows are printed.

predictions = model.transform(testData)

predictions.show(n=20, truncate=True)

+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+--------------+--------------------+----------+
|        1|        2|        3|        4|        5|        6|        7|        8|        9|       10|       11|       12|       13|       14|       15|       16|       17|       18|       19|       20|fault_detected|            features|label| rawPrediction|         probability|prediction|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+--------------+--------------------+----------+
|0.0253125|0.0259375|0.0284375|0.0296875|0.0340625|0.0328125|0.0328125|   0.0325|    0.025|0.0334375| 0.035625|  0.02875|0.0334

In [0]:
# Evaluating the Accuracy of Predictions on Test Data
# The evaluator returns a fraction, so it is scaled to a percentage for printing.

accuracy = evaluator.evaluate(predictions)

print("Accuracy: {:.2f}".format(100 * accuracy))

Accuracy: 95.67


In [0]:
# Creating a Parameter Grid for Hyperparameter Tuning
# Versions 1 and 2 are kept commented out as a record of earlier experiments;
# only the Version 3 grid (2 x 3 x 3 = 18 combinations) is active.

# Version 1 - Obtained Standard: 95.67 / Best: 95.89 = 0.22 Higher Accuracy / runs:/c6f79eabd01b412ebeecb0da3acf1ba7/best_model
# Parameters - Impurity: Entropy, MaxBins: 40, MaxDepth: 10

# parameters = ParamGridBuilder()\
# .addGrid(dt.impurity,["gini", "entropy"])\
# .addGrid(dt.maxBins, [24,32,40])\
# .addGrid(dt.maxDepth, [5,10,15])\
# .build()

# Version 2 - Obtained Standard: 95.67 / Best: 96.15 = 0.48 Higher Accuracy / runs:/3ae81dae2c4b40619cbfd0fe6c1b9005/best_model
# Parameters - Impurity: Gini, MaxBins: 30, MaxDepth: 8

# parameters = ParamGridBuilder()\
# .addGrid(dt.impurity,["gini", "entropy"])\
# .addGrid(dt.maxBins, [5,10,15,20,30])\
# .addGrid(dt.maxDepth, [2,4,6,8,10])\
# .build()

# Version 3 - Obtained Standard: 95.67 / Best: 96.67 = 1.00 Higher Accuracy / runs:/8287f0625aaa46e3b8c7b815e5a05cb0/best_model
# Parameters - Impurity: Entropy, MaxBins: 50, MaxDepth: 14

parameters = (
    ParamGridBuilder()
    .addGrid(dt.impurity, ["gini", "entropy"])
    .addGrid(dt.maxBins, [40, 45, 50])
    .addGrid(dt.maxDepth, [10, 12, 14])
    .build()
)

In [0]:
# Creating a TVS Instance to Perform Hyperparameter Tuning for Decision Tree
# Every parameter map in the grid is fitted on 70% of the data passed to fit()
# and scored with the shared evaluator on the remaining 30%.

tvs = TrainValidationSplit(
    estimator=dt,
    estimatorParamMaps=parameters,
    evaluator=evaluator,
    trainRatio=0.7,
    seed=100,
)

In [0]:
# Training the Model by Grid Search
# Runs the decision-tree TrainValidationSplit on the training split only, so the
# test split is not used while selecting hyperparameters in this cell.

gridsearchModel = tvs.fit(trainingData)

2023/03/27 09:58:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8287f0625aaa46e3b8c7b815e5a05cb0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
2023/03/27 09:59:43 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().
2023/03/27 10:00:50 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


In [0]:
# Identifying the Optimal Parameters
# Reads the winning hyperparameters back from the best model found by the
# decision-tree grid search and prints them, one per line.

bestModel = gridsearchModel.bestModel

print("\n".join((
    "Parameters for the Best Model:",
    "Impurity Parameter: %s" % bestModel.getImpurity(),
    "MaxBins Parameter: %g" % bestModel.getMaxBins(),
    "MaxDepth Parameter: %g" % bestModel.getMaxDepth(),
)))

Parameters for the Best Model:
Impurity Parameter: entropy
MaxBins Parameter: 50
MaxDepth Parameter: 14


In [0]:
# Identifying the Evaluation Accuracy of Best ML Model
# Scores the tuned decision tree on the held-out test split and prints the
# accuracy as a percentage.

output = evaluator.evaluate(bestModel.transform(testData))
print("Accuracy: {:.2f}".format(100 * output))

Accuracy: 96.67


In [0]:
# Loading the Best ML Model to Create Predictions on the Test Data
# The model is fetched from MLflow by run URI rather than taken from the
# in-memory bestModel.
# NOTE(review): the run ID below is hard-coded. Each execution of the grid search
# is logged under a new run ID, so this URI presumably has to be updated by hand
# to load the model trained in the current session - confirm before relying on
# these predictions.

logged_model = 'runs:/8287f0625aaa46e3b8c7b815e5a05cb0/best_model'

loaded_model = mlflow.spark.load_model(logged_model)

loaded_predictions = loaded_model.transform(testData)

loaded_predictions.show()

2023/03/27 10:01:43 INFO mlflow.spark: 'runs:/43a3c62caabe4265b3071d6029fb01f8/best_model' resolved as 'dbfs:/databricks/mlflow-tracking/2739139740925745/43a3c62caabe4265b3071d6029fb01f8/artifacts/best_model'
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+-------------+-----------+----------+
|        1|        2|        3|        4|        5|        6|        7|        8|        9|       10|       11|       12|       13|       14|       15|       16|       17|       18|       19|       20|fault_detected|            features|label|rawPrediction|probability|prediction|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+-------------

In [0]:
# Final Prediction Output for Decision Tree
# Renders the predictions produced by the model loaded from MLflow.

loaded_predictions.display()

1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,fault_detected,features,label,rawPrediction,probability,prediction
0.0253125,0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,0.03875,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0253125, 0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035, 0.03875))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 7.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,0.03875,0.034375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035, 0.03875, 0.034375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(6.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.0, 0.0))",0.0
0.0259375,0.0328125,0.0340625,0.03375,0.0309375,0.0309375,0.034375,0.030625,0.029375,0.029375,0.026875,0.030625,0.0284375,0.03125,0.0271875,0.0278125,0.0290625,0.0278125,0.0309375,0.0290625,0,"Map(vectorType -> dense, length -> 20, values -> List(0.0259375, 0.0328125, 0.0340625, 0.03375, 0.0309375, 0.0309375, 0.034375, 0.030625, 0.029375, 0.029375, 0.026875, 0.030625, 0.0284375, 0.03125, 0.0271875, 0.0278125, 0.0290625, 0.0278125, 0.0309375, 0.0290625))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(48.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(1.0, 0.0))",0.0
0.02625,0.025625,0.0290625,0.0265625,0.0353125,0.0259375,0.0328125,0.0340625,0.03375,0.0309375,0.0309375,0.034375,0.030625,0.029375,0.029375,0.026875,0.030625,0.0284375,0.03125,0.0271875,0,"Map(vectorType -> dense, length -> 20, values -> List(0.02625, 0.025625, 0.0290625, 0.0265625, 0.0353125, 0.0259375, 0.0328125, 0.0340625, 0.03375, 0.0309375, 0.0309375, 0.034375, 0.030625, 0.029375, 0.029375, 0.026875, 0.030625, 0.0284375, 0.03125, 0.0271875))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 8.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.0265625,0.0253125,0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0253125, 0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 8.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.0265625,0.0284375,0.02625,0.0325,0.0296875,0.0353125,0.0328125,0.0265625,0.03125,0.0328125,0.029375,0.0284375,0.0334375,0.0321875,0.0296875,0.0325,0.0284375,0.0259375,0.03125,0.02875,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0284375, 0.02625, 0.0325, 0.0296875, 0.0353125, 0.0328125, 0.0265625, 0.03125, 0.0328125, 0.029375, 0.0284375, 0.0334375, 0.0321875, 0.0296875, 0.0325, 0.0284375, 0.0259375, 0.03125, 0.02875))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 7.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.0265625,0.02875,0.0321875,0.0290625,0.0271875,0.0265625,0.0296875,0.0278125,0.0284375,0.0271875,0.0284375,0.0309375,0.0309375,0.02875,0.0340625,0.0296875,0.03125,0.0359375,0.671875,0.0640625,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.02875, 0.0321875, 0.0290625, 0.0271875, 0.0265625, 0.0296875, 0.0278125, 0.0284375, 0.0271875, 0.0284375, 0.0309375, 0.0309375, 0.02875, 0.0340625, 0.0296875, 0.03125, 0.0359375, 0.671875, 0.0640625))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 12.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.0265625,0.0315625,0.0296875,0.0321875,0.0275,0.0315625,0.03625,0.03375,0.03375,0.0271875,0.030625,0.0334375,0.029375,0.02625,0.0353125,0.0346875,0.036875,0.0365625,0.03,0.0309375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0315625, 0.0296875, 0.0321875, 0.0275, 0.0315625, 0.03625, 0.03375, 0.03375, 0.0271875, 0.030625, 0.0334375, 0.029375, 0.02625, 0.0353125, 0.0346875, 0.036875, 0.0365625, 0.03, 0.0309375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 7.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.026875,0.02875,0.0359375,0.035,0.03875,0.034375,0.9184375,0.873125,0.8878125,0.8090625,0.033125,0.7809375,0.77875,0.0578125,0.80625,0.038125,0.7928125,0.0390625,0.775,0.82375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.026875, 0.02875, 0.0359375, 0.035, 0.03875, 0.034375, 0.9184375, 0.873125, 0.8878125, 0.8090625, 0.033125, 0.7809375, 0.77875, 0.0578125, 0.80625, 0.038125, 0.7928125, 0.0390625, 0.775, 0.82375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 129.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0
0.026875,0.0346875,0.03125,0.0303125,0.0315625,0.02875,0.0278125,0.034375,0.0325,0.0375,0.030625,0.03625,0.0334375,0.0346875,0.0284375,0.0290625,0.0296875,0.031875,0.0315625,0.0315625,0,"Map(vectorType -> dense, length -> 20, values -> List(0.026875, 0.0346875, 0.03125, 0.0303125, 0.0315625, 0.02875, 0.0278125, 0.034375, 0.0325, 0.0375, 0.030625, 0.03625, 0.0334375, 0.0346875, 0.0284375, 0.0290625, 0.0296875, 0.031875, 0.0315625, 0.0315625))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.0, 2.0))","Map(vectorType -> dense, length -> 2, values -> List(0.0, 1.0))",1.0


In [0]:
# Creating & Training the Random Forest Classifier

# Base Version - Obtained Standard: 96.86/ runs:/6f338ef88a784e1aa0778050077a3afd/model
# Parameters - Impurity: Gini, MaxBins: 32, MaxDepth: 5

# Same input columns and training split as the decision tree; no forest
# hyperparameters are set here.
rf = (
    RandomForestClassifier()
    .setLabelCol("label")
    .setFeaturesCol("features")
)

rf_model = rf.fit(trainingData)

2023/03/27 10:01:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6f338ef88a784e1aa0778050077a3afd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
2023/03/27 10:02:06 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


In [0]:
# Creating the Predictions on the Test Data
# The fitted random forest adds rawPrediction, probability and prediction
# columns to the held-out test split; the first 20 rows are printed.

rf_predictions = rf_model.transform(testData)

rf_predictions.show(n=20, truncate=True)

+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+--------------------+--------------------+----------+
|        1|        2|        3|        4|        5|        6|        7|        8|        9|       10|       11|       12|       13|       14|       15|       16|       17|       18|       19|       20|fault_detected|            features|label|       rawPrediction|         probability|prediction|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+--------------------+--------------------+----------+
|0.0253125|0.0259375|0.0284375|0.0296875|0.0340625|0.0328125|0.0328125|   0.0325|    0.025|0.0334375| 0.03562

In [0]:
# Evaluating the Accuracy of Predictions on Test Data
# The evaluator returns a fraction, so it is scaled to a percentage for printing.

rf_accuracy = evaluator.evaluate(rf_predictions)

print("Accuracy: {:.2f}".format(100 * rf_accuracy))

Accuracy: 96.86


In [0]:
# Displaying the Predictions for Random Forest
# Renders the test-split predictions of the untuned (base) random forest.

rf_predictions.display()

1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,fault_detected,features,label,rawPrediction,probability,prediction
0.0253125,0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,0.03875,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0253125, 0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035, 0.03875))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,0.03875,0.034375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035, 0.03875, 0.034375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.0259375,0.0328125,0.0340625,0.03375,0.0309375,0.0309375,0.034375,0.030625,0.029375,0.029375,0.026875,0.030625,0.0284375,0.03125,0.0271875,0.0278125,0.0290625,0.0278125,0.0309375,0.0290625,0,"Map(vectorType -> dense, length -> 20, values -> List(0.0259375, 0.0328125, 0.0340625, 0.03375, 0.0309375, 0.0309375, 0.034375, 0.030625, 0.029375, 0.029375, 0.026875, 0.030625, 0.0284375, 0.03125, 0.0271875, 0.0278125, 0.0290625, 0.0278125, 0.0309375, 0.0290625))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.02625,0.025625,0.0290625,0.0265625,0.0353125,0.0259375,0.0328125,0.0340625,0.03375,0.0309375,0.0309375,0.034375,0.030625,0.029375,0.029375,0.026875,0.030625,0.0284375,0.03125,0.0271875,0,"Map(vectorType -> dense, length -> 20, values -> List(0.02625, 0.025625, 0.0290625, 0.0265625, 0.0353125, 0.0259375, 0.0328125, 0.0340625, 0.03375, 0.0309375, 0.0309375, 0.034375, 0.030625, 0.029375, 0.029375, 0.026875, 0.030625, 0.0284375, 0.03125, 0.0271875))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.0265625,0.0253125,0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0253125, 0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.0265625,0.0284375,0.02625,0.0325,0.0296875,0.0353125,0.0328125,0.0265625,0.03125,0.0328125,0.029375,0.0284375,0.0334375,0.0321875,0.0296875,0.0325,0.0284375,0.0259375,0.03125,0.02875,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0284375, 0.02625, 0.0325, 0.0296875, 0.0353125, 0.0328125, 0.0265625, 0.03125, 0.0328125, 0.029375, 0.0284375, 0.0334375, 0.0321875, 0.0296875, 0.0325, 0.0284375, 0.0259375, 0.03125, 0.02875))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.0265625,0.02875,0.0321875,0.0290625,0.0271875,0.0265625,0.0296875,0.0278125,0.0284375,0.0271875,0.0284375,0.0309375,0.0309375,0.02875,0.0340625,0.0296875,0.03125,0.0359375,0.671875,0.0640625,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.02875, 0.0321875, 0.0290625, 0.0271875, 0.0265625, 0.0296875, 0.0278125, 0.0284375, 0.0271875, 0.0284375, 0.0309375, 0.0309375, 0.02875, 0.0340625, 0.0296875, 0.03125, 0.0359375, 0.671875, 0.0640625))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(14.315357669048606, 5.684642330951393))","Map(vectorType -> dense, length -> 2, values -> List(0.7157678834524303, 0.28423211654756964))",0.0
0.0265625,0.0315625,0.0296875,0.0321875,0.0275,0.0315625,0.03625,0.03375,0.03375,0.0271875,0.030625,0.0334375,0.029375,0.02625,0.0353125,0.0346875,0.036875,0.0365625,0.03,0.0309375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0315625, 0.0296875, 0.0321875, 0.0275, 0.0315625, 0.03625, 0.03375, 0.03375, 0.0271875, 0.030625, 0.0334375, 0.029375, 0.02625, 0.0353125, 0.0346875, 0.036875, 0.0365625, 0.03, 0.0309375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0
0.026875,0.02875,0.0359375,0.035,0.03875,0.034375,0.9184375,0.873125,0.8878125,0.8090625,0.033125,0.7809375,0.77875,0.0578125,0.80625,0.038125,0.7928125,0.0390625,0.775,0.82375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.026875, 0.02875, 0.0359375, 0.035, 0.03875, 0.034375, 0.9184375, 0.873125, 0.8878125, 0.8090625, 0.033125, 0.7809375, 0.77875, 0.0578125, 0.80625, 0.038125, 0.7928125, 0.0390625, 0.775, 0.82375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(2.1727927253547223, 17.82720727464528))","Map(vectorType -> dense, length -> 2, values -> List(0.10863963626773612, 0.8913603637322639))",1.0
0.026875,0.0346875,0.03125,0.0303125,0.0315625,0.02875,0.0278125,0.034375,0.0325,0.0375,0.030625,0.03625,0.0334375,0.0346875,0.0284375,0.0290625,0.0296875,0.031875,0.0315625,0.0315625,0,"Map(vectorType -> dense, length -> 20, values -> List(0.026875, 0.0346875, 0.03125, 0.0303125, 0.0315625, 0.02875, 0.0278125, 0.034375, 0.0325, 0.0375, 0.030625, 0.03625, 0.0334375, 0.0346875, 0.0284375, 0.0290625, 0.0296875, 0.031875, 0.0315625, 0.0315625))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(18.918536580289647, 1.0814634197103543))","Map(vectorType -> dense, length -> 2, values -> List(0.9459268290144823, 0.05407317098551771))",0.0


In [0]:
# Creating a Parameter Grid for Hyperparameter Tuning
# Versions 1 and 3 are kept commented out as a record of other experiments;
# only the Version 2 grid (2 x 3 x 3 x 3 = 54 combinations) is active.

# Version 1 - Obtained Standard: 96.86 / Best: 97.60 = 0.74 Higher Accuracy / runs:/e3f88bb187ee46229c413e2337aa01c6/best_model / Duration: 13 Minutes
# Parameters - Impurity: Entropy, MaxBins: 35, MaxDepth: 12, NumTrees: 30

# rf_parameters = (ParamGridBuilder()\
# .addGrid(rf.impurity,["gini", "entropy"])\
# .addGrid(rf.maxBins, [20, 30, 35, 40])\
# .addGrid(rf.maxDepth, [6, 8, 10, 12])\
# .addGrid(rf.numTrees, [20, 30, 35, 40])\
# .build())

# Version 2 - Obtained Standard: 96.86 / Best: 97.93 = 1.07 Higher Accuracy / runs:/4eb623aff73b4d16a4cccd160682307a/best_model / Duration: 23 Minutes
# Parameters - Impurity: Entropy, MaxBins: 55, MaxDepth: 20, NumTrees: 70

rf_parameters = (
    ParamGridBuilder()
    .addGrid(rf.impurity, ["gini", "entropy"])
    .addGrid(rf.maxBins, [25, 40, 55])
    .addGrid(rf.maxDepth, [20, 25, 30])
    .addGrid(rf.numTrees, [50, 70, 90])
    .build()
)

# Version 3 - Obtained Standard: 96.86 / Best: 97.56 = 0.70 Higher Accuracy / runs:/a64d433a38a74bf28974b73492832c20/best_model / Duration: 33 Minutes
# Parameters - Impurity: Entropy, MaxBins: 28, MaxDepth: 15, NumTrees: 30, Subsampling: 0.9, Subset: sqrt

# rf_parameters = (ParamGridBuilder()\
# .addGrid(rf.impurity,["gini", "entropy"])\
# .addGrid(rf.maxBins, [28, 32, 36])\
# .addGrid(rf.maxDepth, [5, 10, 15])\
# .addGrid(rf.numTrees, [20, 30, 40])\
# .addGrid(rf.subsamplingRate, [0.7, 0.8, 0.9])\
# .addGrid(rf.featureSubsetStrategy, ["sqrt", "log2"])\
# .build())

In [0]:
# Creating a TVS Instance to Perform Hyperparameter Tuning for Random Forest
# Every parameter map in the grid is fitted on 70% of the data passed to fit()
# and scored with the shared evaluator on the remaining 30%.

rf_tvs = TrainValidationSplit(
    estimator=rf,
    estimatorParamMaps=rf_parameters,
    evaluator=evaluator,
    trainRatio=0.7,
    seed=100,
)

In [0]:
# Training the Model by Grid Search
# Runs the random-forest TrainValidationSplit on the training split only. The
# recorded duration for the active grid is about 23 minutes (see the Version 2
# note in the parameter-grid cell), so expect this cell to be slow.

rf_gridsearchModel = rf_tvs.fit(trainingData)

2023/03/27 10:03:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4eb623aff73b4d16a4cccd160682307a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
2023/03/27 10:14:00 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().
2023/03/27 10:15:24 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


In [0]:
# Identifying the Optimal Parameters
# Reads the winning hyperparameters back from the best model found by the
# random-forest grid search and prints them, one per line.

rf_bestModel = rf_gridsearchModel.bestModel

print("\n".join((
    "Parameters for the Best Model:",
    "Impurity Parameter: %s" % rf_bestModel.getImpurity(),
    "MaxBins Parameter: %g" % rf_bestModel.getMaxBins(),
    "MaxDepth Parameter: %g" % rf_bestModel.getMaxDepth(),
    # getNumTrees is read as an attribute (no call parentheses), unlike the
    # getters above; the captured output confirms it formats as a number.
    "NumTrees Parameter: %g" % rf_bestModel.getNumTrees,
)))
# print("Subsampling Rate Parameter: %g" %rf_bestModel.getSubsamplingRate())
# print("Feature Subset Strategy Parameter: %s" %rf_bestModel.getFeatureSubsetStrategy())

Parameters for the Best Model:
Impurity Parameter: entropy
MaxBins Parameter: 55
MaxDepth Parameter: 20
NumTrees Parameter: 70


In [0]:
# Identifying the Evaluation Accuracy of Best ML Model
# Scores the tuned random forest on the held-out test split and prints the
# accuracy as a percentage.

rf_output = evaluator.evaluate(rf_bestModel.transform(testData))
print("Accuracy: {:.2f}".format(100 * rf_output))

Accuracy: 97.93


In [0]:
# Loading the Best ML Model to Create Predictions on the Test Data
# The model is fetched from MLflow by run URI rather than taken from the
# in-memory rf_bestModel.
# NOTE(review): the run ID below is hard-coded. Each execution of the grid search
# is logged under a new run ID, so this URI presumably has to be updated by hand
# to load the model trained in the current session - confirm before relying on
# these predictions.

rf_logged_model = 'runs:/4eb623aff73b4d16a4cccd160682307a/best_model'

rf_loaded_model = mlflow.spark.load_model(rf_logged_model)

rf_loaded_predictions = rf_loaded_model.transform(testData)

rf_loaded_predictions.show()

2023/03/27 10:16:41 INFO mlflow.spark: 'runs:/aa08681a7c894cd48c608e8ac680b4e1/best_model' resolved as 'dbfs:/databricks/mlflow-tracking/2739139740925745/aa08681a7c894cd48c608e8ac680b4e1/artifacts/best_model'
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------------+--------------------+-----+--------------------+--------------------+----------+
|        1|        2|        3|        4|        5|        6|        7|        8|        9|       10|       11|       12|       13|       14|       15|       16|       17|       18|       19|       20|fault_detected|            features|label|       rawPrediction|         probability|prediction|
+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+------

In [0]:
# Final Prediction Output for Random Forest

# Render the reloaded model's predictions as an interactive notebook table.
display(rf_loaded_predictions)

1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,fault_detected,features,label,rawPrediction,probability,prediction
0.0253125,0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,0.03875,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0253125, 0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035, 0.03875))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(38.0, 32.0))","Map(vectorType -> dense, length -> 2, values -> List(0.5428571428571428, 0.45714285714285713))",0.0
0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,0.03875,0.034375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035, 0.03875, 0.034375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(41.0, 29.0))","Map(vectorType -> dense, length -> 2, values -> List(0.5857142857142857, 0.4142857142857143))",0.0
0.0259375,0.0328125,0.0340625,0.03375,0.0309375,0.0309375,0.034375,0.030625,0.029375,0.029375,0.026875,0.030625,0.0284375,0.03125,0.0271875,0.0278125,0.0290625,0.0278125,0.0309375,0.0290625,0,"Map(vectorType -> dense, length -> 20, values -> List(0.0259375, 0.0328125, 0.0340625, 0.03375, 0.0309375, 0.0309375, 0.034375, 0.030625, 0.029375, 0.029375, 0.026875, 0.030625, 0.0284375, 0.03125, 0.0271875, 0.0278125, 0.0290625, 0.0278125, 0.0309375, 0.0290625))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(39.08333333333333, 30.916666666666664))","Map(vectorType -> dense, length -> 2, values -> List(0.5583333333333332, 0.44166666666666665))",0.0
0.02625,0.025625,0.0290625,0.0265625,0.0353125,0.0259375,0.0328125,0.0340625,0.03375,0.0309375,0.0309375,0.034375,0.030625,0.029375,0.029375,0.026875,0.030625,0.0284375,0.03125,0.0271875,0,"Map(vectorType -> dense, length -> 20, values -> List(0.02625, 0.025625, 0.0290625, 0.0265625, 0.0353125, 0.0259375, 0.0328125, 0.0340625, 0.03375, 0.0309375, 0.0309375, 0.034375, 0.030625, 0.029375, 0.029375, 0.026875, 0.030625, 0.0284375, 0.03125, 0.0271875))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(36.0, 34.0))","Map(vectorType -> dense, length -> 2, values -> List(0.5142857142857142, 0.4857142857142857))",0.0
0.0265625,0.0253125,0.0259375,0.0284375,0.0296875,0.0340625,0.0328125,0.0328125,0.0325,0.025,0.0334375,0.035625,0.02875,0.0334375,0.03,0.030625,0.026875,0.02875,0.0359375,0.035,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0253125, 0.0259375, 0.0284375, 0.0296875, 0.0340625, 0.0328125, 0.0328125, 0.0325, 0.025, 0.0334375, 0.035625, 0.02875, 0.0334375, 0.03, 0.030625, 0.026875, 0.02875, 0.0359375, 0.035))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(33.0, 37.0))","Map(vectorType -> dense, length -> 2, values -> List(0.4714285714285714, 0.5285714285714286))",1.0
0.0265625,0.0284375,0.02625,0.0325,0.0296875,0.0353125,0.0328125,0.0265625,0.03125,0.0328125,0.029375,0.0284375,0.0334375,0.0321875,0.0296875,0.0325,0.0284375,0.0259375,0.03125,0.02875,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0284375, 0.02625, 0.0325, 0.0296875, 0.0353125, 0.0328125, 0.0265625, 0.03125, 0.0328125, 0.029375, 0.0284375, 0.0334375, 0.0321875, 0.0296875, 0.0325, 0.0284375, 0.0259375, 0.03125, 0.02875))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(45.0, 25.0))","Map(vectorType -> dense, length -> 2, values -> List(0.6428571428571429, 0.35714285714285715))",0.0
0.0265625,0.02875,0.0321875,0.0290625,0.0271875,0.0265625,0.0296875,0.0278125,0.0284375,0.0271875,0.0284375,0.0309375,0.0309375,0.02875,0.0340625,0.0296875,0.03125,0.0359375,0.671875,0.0640625,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.02875, 0.0321875, 0.0290625, 0.0271875, 0.0265625, 0.0296875, 0.0278125, 0.0284375, 0.0271875, 0.0284375, 0.0309375, 0.0309375, 0.02875, 0.0340625, 0.0296875, 0.03125, 0.0359375, 0.671875, 0.0640625))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(8.25, 61.75))","Map(vectorType -> dense, length -> 2, values -> List(0.11785714285714285, 0.8821428571428571))",1.0
0.0265625,0.0315625,0.0296875,0.0321875,0.0275,0.0315625,0.03625,0.03375,0.03375,0.0271875,0.030625,0.0334375,0.029375,0.02625,0.0353125,0.0346875,0.036875,0.0365625,0.03,0.0309375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.0265625, 0.0315625, 0.0296875, 0.0321875, 0.0275, 0.0315625, 0.03625, 0.03375, 0.03375, 0.0271875, 0.030625, 0.0334375, 0.029375, 0.02625, 0.0353125, 0.0346875, 0.036875, 0.0365625, 0.03, 0.0309375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(55.0, 15.0))","Map(vectorType -> dense, length -> 2, values -> List(0.7857142857142857, 0.21428571428571427))",0.0
0.026875,0.02875,0.0359375,0.035,0.03875,0.034375,0.9184375,0.873125,0.8878125,0.8090625,0.033125,0.7809375,0.77875,0.0578125,0.80625,0.038125,0.7928125,0.0390625,0.775,0.82375,1,"Map(vectorType -> dense, length -> 20, values -> List(0.026875, 0.02875, 0.0359375, 0.035, 0.03875, 0.034375, 0.9184375, 0.873125, 0.8878125, 0.8090625, 0.033125, 0.7809375, 0.77875, 0.0578125, 0.80625, 0.038125, 0.7928125, 0.0390625, 0.775, 0.82375))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(3.0, 67.0))","Map(vectorType -> dense, length -> 2, values -> List(0.04285714285714286, 0.9571428571428572))",1.0
0.026875,0.0346875,0.03125,0.0303125,0.0315625,0.02875,0.0278125,0.034375,0.0325,0.0375,0.030625,0.03625,0.0334375,0.0346875,0.0284375,0.0290625,0.0296875,0.031875,0.0315625,0.0315625,0,"Map(vectorType -> dense, length -> 20, values -> List(0.026875, 0.0346875, 0.03125, 0.0303125, 0.0315625, 0.02875, 0.0278125, 0.034375, 0.0325, 0.0375, 0.030625, 0.03625, 0.0334375, 0.0346875, 0.0284375, 0.0290625, 0.0296875, 0.031875, 0.0315625, 0.0315625))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(50.0, 20.0))","Map(vectorType -> dense, length -> 2, values -> List(0.7142857142857143, 0.2857142857142857))",0.0
