The machines dataset contains the specifics of each machine, such as its model and age.

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

In [0]:
# Read the machines CSV into a Spark DataFrame.
# The first row is treated as the header and column types are inferred from the data.
csv_options = {'header': 'true', 'inferSchema': 'true'}
machines = (
    spark.read
    .format('csv')
    .options(**csv_options)
    .load('/FileStore/tables/machines.csv')
)

display(machines)

machineID,model,age
1,model2,18
2,model4,7
3,model3,8
4,model3,7
5,model2,2
6,model3,7
7,model4,20
8,model3,16
9,model1,7
10,model1,10


In [0]:
# Show only the model and age columns of the machines table
display(machines.select("model", "age"))

model,age
model2,18
model4,7
model3,8
model3,7
model2,2
model3,7
model4,20
model3,16
model1,7
model1,10


The machines dataset has a categorical variable, `model`, which must be encoded as numerical values.
Here we use one-hot encoding to do it.

In [0]:
# one hot encoding of the variable model, basically creates a set of dummy boolean variables
varnames = ['model']  
sIndexers = [StringIndexer(inputCol=x, outputCol=x + '_indexed') for x in varnames]
machines_cat = Pipeline(stages=sIndexers).fit(machines).transform(machines)
print(machines_cat)
display(machines_cat)

machineID,model,age,model_indexed
1,model2,18,2.0
2,model4,7,1.0
3,model3,8,0.0
4,model3,7,0.0
5,model2,2,2.0
6,model3,7,0.0
7,model4,20,1.0
8,model3,16,0.0
9,model1,7,3.0
10,model1,10,3.0


In [0]:
# Step 2: build one OneHotEncoder per categorical variable, reading `<var>_indexed`
# and writing a vector column `<var>_encoded`.
# Note: with the encoder's default settings the last index gets no slot of its own;
# in the output below, index 3.0 (model1) shows up as an all-zero vector of length 3.
ohEncoders = [
    OneHotEncoder(inputCol=f'{var}_indexed', outputCol=f'{var}_encoded')
    for var in varnames
]
print(ohEncoders)

In [0]:
# Fit the one-hot encoders on the indexed frame and append their encoded vector columns.
#
# The result is assigned back to `machines_cat`, so on a second run of this cell the
# frame would already contain the encoded columns and the encoder would be rejected
# because its output column already exists. Dropping any previously produced encoder
# output columns first keeps the cell re-runnable; on the first run nothing is dropped
# (DataFrame.drop ignores column names that are not present).
encoded_cols = [encoder.getOutputCol() for encoder in ohEncoders]
machines_indexed = machines_cat.drop(*encoded_cols)

ohPipelineModel = Pipeline(stages=ohEncoders).fit(machines_indexed)
machines_cat = ohPipelineModel.transform(machines_indexed)

display(machines_cat)

machineID,model,age,model_indexed,model_encoded
1,model2,18,2.0,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))"
2,model4,7,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))"
3,model3,8,0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
4,model3,7,0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
5,model2,2,2.0,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))"
6,model3,7,0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
7,model4,20,1.0,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))"
8,model3,16,0.0,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
9,model1,7,3.0,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())"
10,model1,10,3.0,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())"


In [0]:
# Collect the names of the intermediate index columns (every column whose name
# contains 'indexed'); they are no longer needed once the encoded vectors exist.
drop_list = list(filter(lambda name: 'indexed' in name, machines_cat.columns))
print(drop_list)

In [0]:
# Build the feature table: keep every column except the ones listed in drop_list.
# The encoded column is displayed as a sparse vector, which is typical after one-hot
# encoding because most entries are 0.
kept_columns = [name for name in machines_cat.columns if name not in drop_list]
machines_feat = machines_cat.select(*kept_columns)

display(machines_feat)

machineID,model,age,model_encoded
1,model2,18,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))"
2,model4,7,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))"
3,model3,8,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
4,model3,7,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
5,model2,2,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))"
6,model3,7,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
7,model4,20,"Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))"
8,model3,16,"Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
9,model1,7,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())"
10,model1,10,"Map(vectorType -> sparse, length -> 3, indices -> List(), values -> List())"


In [0]:
# Print the number of rows in the feature table, then preview up to ten rows as a
# pandas DataFrame (limit() is applied before toPandas(), so at most ten rows are collected).
n_rows = machines_feat.count()
print(n_rows)

preview = machines_feat.limit(10).toPandas()
preview.head(10)

Unnamed: 0,machineID,model,age,model_encoded
0,1,model2,18,"(0.0, 0.0, 1.0)"
1,2,model4,7,"(0.0, 1.0, 0.0)"
2,3,model3,8,"(1.0, 0.0, 0.0)"
3,4,model3,7,"(1.0, 0.0, 0.0)"
4,5,model2,2,"(0.0, 0.0, 1.0)"
5,6,model3,7,"(1.0, 0.0, 0.0)"
6,7,model4,20,"(0.0, 1.0, 0.0)"
7,8,model3,16,"(1.0, 0.0, 0.0)"
8,9,model1,7,"(0.0, 0.0, 0.0)"
9,10,model1,10,"(0.0, 0.0, 0.0)"
