In [0]:
%run ./Failure_Data_Analysis

In [0]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression

Prepare the Training data

A fundamental practice in machine learning is to calibrate and test your model parameters on data that has not been used to train the model.


Evaluation of the model requires splitting the available data into a training portion, a calibration portion and an evaluation portion.

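We split the data by time rather than at random, because we want the model to learn the time pattern of failures. Random splitting doesn't work here: it would place records from the same time period in both the training and the test set, so the model would be evaluated on periods it has effectively already seen.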

In [0]:
# Prepare the training and testing data.

# Name of the column that carries the known label.
label_var = ["label_e"]

# Key columns: these are kept alongside the label but are not model inputs.
key_cols = ["machineID", "df_time"]

In [0]:
# Start from every column name in the labeled data; the label, key and other
# unwanted columns are filtered out of this list in the following cells.
input_features = list(labeled_features.columns)

In [0]:
# Columns that must not be used as model inputs: the label column, the key
# columns, and the 'failure', 'model_encoded' and 'model' columns.
remove_names = [
    *label_var,
    *key_cols,
    'failure',
    'model_encoded',
    'model',
]
display(remove_names)

In [0]:
# Drop every unwanted name from the candidate feature list, preserving the
# original column order. The lookup set is built once up front instead of
# being rebuilt for every element inside the comprehension.
excluded_names = set(remove_names)
input_features = [name for name in input_features if name not in excluded_names]
display(input_features)

We assemble the features of the dataset into a single vector column here and then split the data into a training and a test set.
We use this split data to train the model on 9 months of data (training data), and evaluate on the remaining 3 months (test data) going forward.

In [0]:
# Pack all input feature columns into a single vector column named 'features',
# then keep only the key columns, the label and that vector.
# NOTE: this overwrites `labeled_features`, so the individual feature columns
# are gone afterwards; re-running this cell on its own will fail until the
# upstream data is reloaded.
va = VectorAssembler(inputCols=input_features, outputCol='features')
labeled_features = (
    va.transform(labeled_features)
    .select('machineID', 'df_time', 'label_e', 'features')
)
display(labeled_features)

In [0]:
# Index the assembled feature vector. With maxCategories=10, any feature that
# has more than 10 distinct values is treated as continuous rather than
# categorical.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=10,
).fit(labeled_features)
display(featureIndexer)

In [0]:
# The label indexer is fitted on the whole dataset (before the train/test
# split) so that every label value is present in the index.
labelIndexer = StringIndexer(
    inputCol="label_e",
    outputCol="indexedLabel",
).fit(labeled_features)
display(labelIndexer)

In [0]:
# Time-based split: records strictly before the split date are used for
# training, records on or after it are held out for testing.
split_date = "2015-10-30"
training = labeled_features.where(labeled_features["df_time"] < split_date)
testing = labeled_features.where(labeled_features["df_time"] >= split_date)

display(training)
display(testing)

Here I am using three machine learning algorithms: a decision tree, logistic regression, and a random forest.

In [0]:
# Decision tree classifier trained on the indexed label / indexed feature columns.
model = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxDepth=15,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    impurity="gini",
)

# Chain indexers and model in a Pipeline.
pipeline_cls_mthd = Pipeline(stages=[labelIndexer, featureIndexer, model])

# Train the model. This also runs the indexers.
model_pipeline = pipeline_cls_mthd.fit(training)

# Make predictions. The Pipeline applies the same operations to the test data.
predictions = model_pipeline.transform(testing)

# Confusion matrix for the multiclass prediction results: one row per actual
# label (named in the first column), one column per predicted label.
conf_table = predictions.stat.crosstab('indexedLabel', 'prediction')

# crosstab() returns its rows in no guaranteed order and has no column for a
# class the model never predicted, so positional lookups such as
# confuse['1.0'][1] can read the wrong cell or raise KeyError. Align rows and
# columns to a fixed class order and fill absent combinations with zero.
class_ids = ['0.0', '1.0', '2.0', '3.0', '4.0']
conf_matrix = (
    conf_table.toPandas()
    .set_index('indexedLabel_prediction')
    .reindex(index=class_ids, columns=class_ids, fill_value=0)
)

# Keep `confuse` in its original layout (label column followed by one column
# per predicted class); row i now always corresponds to class i.
confuse = conf_matrix.rename_axis('indexedLabel_prediction').reset_index()

# counts[actual, predicted]
counts = conf_matrix.values
diagonal = np.diag(counts)

# Class 0.0 is treated as "no failure" and classes 1.0-4.0 as failures.
# NOTE(review): this assumes StringIndexer assigned index 0.0 to the
# non-failure label (it orders labels by frequency) - confirm.

# True positives - diagonal failure terms
tp = diagonal[1:].sum()

# False positives - all failure predictions minus the true positives
fp = counts[:, 1:].sum() - tp

# True negatives
tn = diagonal[0]

# False negatives - total of the predicted non-failure column minus TN
fn = counts[:, 0].sum() - tn

# Accuracy is diagonal/total
acc_n = tn + tp
acc_d = counts.sum()
acc = float(acc_n) / acc_d if acc_d else 0.0

# Precision, recall and F1; an undefined ratio (zero denominator) is reported
# as 0 instead of producing NaN.
prec = float(tp) / (tp + fp) if (tp + fp) else 0.0
rec = float(tp) / (tp + fn) if (tp + fn) else 0.0
f1 = 2.0 * prec * rec / (prec + rec) if (prec + rec) else 0.0

# Print the evaluation metrics
display(confuse)
print("Accuracy = %g" % acc)
print("Precision = %g" % prec)
print("Recall = %g" % rec )
print("F1 = %g" % f1)

In [0]:
# Logistic regression trained on the indexed label / indexed feature columns.
model = LogisticRegression(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxIter=10,
)

# Chain indexers and model in a Pipeline.
pipeline_cls_mthd = Pipeline(stages=[labelIndexer, featureIndexer, model])

# Train the model. This also runs the indexers.
model_pipeline = pipeline_cls_mthd.fit(training)

# Make predictions. The Pipeline applies the same operations to the test data.
predictions = model_pipeline.transform(testing)

# Confusion matrix for the multiclass prediction results: one row per actual
# label (named in the first column), one column per predicted label.
conf_table = predictions.stat.crosstab('indexedLabel', 'prediction')

# crosstab() returns its rows in no guaranteed order and has no column for a
# class the model never predicted, so positional lookups such as
# confuse['1.0'][1] can read the wrong cell or raise KeyError. Align rows and
# columns to a fixed class order and fill absent combinations with zero.
class_ids = ['0.0', '1.0', '2.0', '3.0', '4.0']
conf_matrix = (
    conf_table.toPandas()
    .set_index('indexedLabel_prediction')
    .reindex(index=class_ids, columns=class_ids, fill_value=0)
)

# Keep `confuse` in its original layout (label column followed by one column
# per predicted class); row i now always corresponds to class i.
confuse = conf_matrix.rename_axis('indexedLabel_prediction').reset_index()

# counts[actual, predicted]
counts = conf_matrix.values
diagonal = np.diag(counts)

# Class 0.0 is treated as "no failure" and classes 1.0-4.0 as failures.
# NOTE(review): this assumes StringIndexer assigned index 0.0 to the
# non-failure label (it orders labels by frequency) - confirm.

# True positives - diagonal failure terms
tp = diagonal[1:].sum()

# False positives - all failure predictions minus the true positives
fp = counts[:, 1:].sum() - tp

# True negatives
tn = diagonal[0]

# False negatives - total of the predicted non-failure column minus TN
fn = counts[:, 0].sum() - tn

# Accuracy is diagonal/total
acc_n = tn + tp
acc_d = counts.sum()
acc = float(acc_n) / acc_d if acc_d else 0.0

# Precision, recall and F1; an undefined ratio (zero denominator) is reported
# as 0 instead of producing NaN.
prec = float(tp) / (tp + fp) if (tp + fp) else 0.0
rec = float(tp) / (tp + fn) if (tp + fn) else 0.0
f1 = 2.0 * prec * rec / (prec + rec) if (prec + rec) else 0.0

# Print the evaluation metrics
display(confuse)
print("Accuracy = %g" % acc)
print("Precision = %g" % prec)
print("Recall = %g" % rec )
print("F1 = %g" % f1)

In [0]:
# Random forest trained on the indexed label / indexed feature columns.
# A fixed seed makes the row/feature subsampling, and therefore the reported
# metrics, reproducible from run to run.
model = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxDepth=15,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    impurity="gini",
    numTrees=50,
    featureSubsetStrategy="sqrt",
    subsamplingRate=0.632,
    seed=42,
)

# Chain indexers and model in a Pipeline.
pipeline_cls_mthd = Pipeline(stages=[labelIndexer, featureIndexer, model])

# Train the model. This also runs the indexers.
model_pipeline = pipeline_cls_mthd.fit(training)

# Make predictions. The Pipeline applies the same operations to the test data.
predictions = model_pipeline.transform(testing)

# Confusion matrix for the multiclass prediction results: one row per actual
# label (named in the first column), one column per predicted label.
conf_table = predictions.stat.crosstab('indexedLabel', 'prediction')

# crosstab() returns its rows in no guaranteed order and has no column for a
# class the model never predicted, so positional lookups such as
# confuse['1.0'][1] can read the wrong cell or raise KeyError. Align rows and
# columns to a fixed class order and fill absent combinations with zero.
class_ids = ['0.0', '1.0', '2.0', '3.0', '4.0']
conf_matrix = (
    conf_table.toPandas()
    .set_index('indexedLabel_prediction')
    .reindex(index=class_ids, columns=class_ids, fill_value=0)
)

# Keep `confuse` in its original layout (label column followed by one column
# per predicted class); row i now always corresponds to class i.
confuse = conf_matrix.rename_axis('indexedLabel_prediction').reset_index()

# counts[actual, predicted]
counts = conf_matrix.values
diagonal = np.diag(counts)

# Class 0.0 is treated as "no failure" and classes 1.0-4.0 as failures.
# NOTE(review): this assumes StringIndexer assigned index 0.0 to the
# non-failure label (it orders labels by frequency) - confirm.

# True positives - diagonal failure terms
tp = diagonal[1:].sum()

# False positives - all failure predictions minus the true positives
fp = counts[:, 1:].sum() - tp

# True negatives
tn = diagonal[0]

# False negatives - total of the predicted non-failure column minus TN
fn = counts[:, 0].sum() - tn

# Accuracy is diagonal/total
acc_n = tn + tp
acc_d = counts.sum()
acc = float(acc_n) / acc_d if acc_d else 0.0

# Precision, recall and F1; an undefined ratio (zero denominator) is reported
# as 0 instead of producing NaN.
prec = float(tp) / (tp + fp) if (tp + fp) else 0.0
rec = float(tp) / (tp + fn) if (tp + fn) else 0.0
f1 = 2.0 * prec * rec / (prec + rec) if (prec + rec) else 0.0

# Print the evaluation metrics
display(confuse)
print("Accuracy = %g" % acc)
print("Precision = %g" % prec)
print("Recall = %g" % rec )
print("F1 = %g" % f1)