In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [0]:
# Load the dog-food CSV: the first row is the header and column types are inferred.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/dog_food.csv')
)

In [0]:
# Show the column names and the types that inferSchema produced for the CSV.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# including the Spoiled label.
data.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [0]:
# Peek at the first 20 rows of the raw data (long cell values are truncated).
data.show(n=20, truncate=True)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [0]:
# Pack the four preservative columns into the single vector column that
# Spark ML estimators expect as input.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [0]:
# Append the assembled 'features' vector column to the raw frame; the original
# columns (A, B, C, D, Spoiled) are kept alongside it.
output = assembler.transform(data)

In [0]:
# Confirm that 'features' now exists as a vector column next to the originals.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [0]:
# 70/30 train/test split. The seed is fixed so that the split, and every
# metric computed from it, is the same on each run of the notebook.
train_data, test_data = output.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Random forest that predicts the 'Spoiled' label from the assembled 'features'
# vector. An explicit seed makes the trained forest, and therefore its feature
# importances, repeatable for a given training set.
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [0]:
# Train the random forest on the training split only.
rfc_model = rfc.fit(train_data)

In [0]:
# Score the held-out test split; this adds the rawPrediction, probability and
# prediction columns to each row.
rfc_preds = rfc_model.transform(test_data)

In [0]:
# Inspect the first 20 scored test rows (long vector cells are truncated).
rfc_preds.show(n=20, truncate=True)

+---+---+----+---+-------+-------------------+--------------------+--------------------+----------+
|  A|  B|   C|  D|Spoiled|           features|       rawPrediction|         probability|prediction|
+---+---+----+---+-------+-------------------+--------------------+--------------------+----------+
|  1|  1|12.0|  2|    1.0| [1.0,1.0,12.0,2.0]|          [0.0,20.0]|           [0.0,1.0]|       1.0|
|  1|  3| 8.0|  3|    0.0|  [1.0,3.0,8.0,3.0]|[19.8178769331190...|[0.99089384665595...|       0.0|
|  1|  3| 9.0|  8|    0.0|  [1.0,3.0,9.0,8.0]|[19.8178769331190...|[0.99089384665595...|       0.0|
|  1|  4| 8.0|  1|    0.0|  [1.0,4.0,8.0,1.0]|[19.7437895888984...|[0.98718947944492...|       0.0|
|  1|  4| 8.0|  7|    0.0|  [1.0,4.0,8.0,7.0]|[19.8178769331190...|[0.99089384665595...|       0.0|
|  1|  4| 9.0|  6|    0.0|  [1.0,4.0,9.0,6.0]|[19.8178769331190...|[0.99089384665595...|       0.0|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|          [1.0,19.0]|         [0.05,0.95]|       1.0|


In [0]:
# Binary evaluator for the Spoiled label. The raw-prediction column and the
# metric are spelled out here; they are the evaluator's defaults.
my_binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Spoiled',
    metricName='areaUnderROC',
)

In [0]:
# Report the binary evaluator's score for the random forest on the test
# predictions: heading on one line, value on the next.
print(
    'Random Forest Evaluator: ',
    my_binary_eval.evaluate(rfc_preds),
    sep='\n',
)

Random Forest Evaluator: 
0.9922182821118991


In [0]:
# Evaluator that compares the 'prediction' column against the 'Spoiled' label
# and reports plain accuracy.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol='Spoiled',
    predictionCol='prediction',
    metricName='accuracy',
)

In [0]:
# Accuracy of the random forest on the scored test split, as a fraction
# (it is multiplied by 100 when reported).
rfc_acc = acc_evaluator.evaluate(rfc_preds)

In [0]:
# Per-feature importance of the fitted forest. Vector indices follow the
# assembler's inputCols order: 0 -> A, 1 -> B, 2 -> C, 3 -> D.
rfc_model.featureImportances

Out[24]: SparseVector(4, {0: 0.0285, 1: 0.026, 2: 0.9224, 3: 0.0232})

In [0]:
# Final report: heading, an 80-character rule, then the test accuracy as a
# percentage with two decimals.
print(
    "Here are the results!",
    '-' * 80,
    'A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc * 100),
    sep='\n',
)

Here are the results!
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 98.71%


Because the feature importances show that chemical C has by far the highest score (about 0.92, versus less than 0.03 each for A, B and D), it can be concluded that C is the preservative causing the spoilage.