In [None]:
appname = "Dog_food_assignment"

# Look into https://spark.apache.org/downloads.html for the latest version
spark_mirror = "https://mirrors.sonic.net/apache/spark"
spark_version = "3.3.1"
hadoop_version = "3"

# Install Java 8 (Spark does not work with newer Java versions)
! apt-get update
! apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download and extract Spark binary distribution
! rm -rf spark-{spark_version}-bin-hadoop{hadoop_version}.tgz spark-{spark_version}-bin-hadoop{hadoop_version}
! wget -q {spark_mirror}/spark-{spark_version}/spark-{spark_version}-bin-hadoop{hadoop_version}.tgz
! tar xzf spark-{spark_version}-bin-hadoop{hadoop_version}.tgz

# The only 2 environment variables needed to set up Java and Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/spark-{spark_version}-bin-hadoop{hadoop_version}"

# Set up the Spark environment based on the environment variable SPARK_HOME
! pip install -q findspark
import findspark
findspark.init()

# Get the Spark session object (basic entry point for every operation)
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName(appname).master("local[*]").getOrCreate()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [1,038 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:13 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:1

In [None]:
from google.colab import drive

# Make Google Drive available on the local filesystem so the dataset can be read
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the dog-food CSV from the mounted Drive; the first row holds the column
# names and the column types are inferred from the data.
df = (
    spark.read
    .format('csv')
    .options(inferSchema=True, header=True)
    .load('/content/drive/MyDrive/Colab Notebooks/dog_food.csv')
)

Printing some basic stats about our data

In [None]:
# Summary statistics for every column of the raw data
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [None]:
# Print the column names and types of the loaded DataFrame
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the four chemical columns into the single vector column that the
# Spark ML classifiers consume.
FEATURE_COLS = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(
    inputCols=FEATURE_COLS,
    outputCol='features',
)

final_data = assembler.transform(df)

In [None]:
# Preview the first 5 rows of final_data
final_data.show(5)

+---+---+----+---+-------+------------------+
|  A|  B|   C|  D|Spoiled|          features|
+---+---+----+---+-------+------------------+
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0|[5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0|[6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0|[4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
+---+---+----+---+-------+------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier

# List the tunable parameters of the random forest together with their defaults
rfc_param_docs = RandomForestClassifier().explainParams()
print(rfc_param_docs)

bootstrap: Whether bootstrap samples are used when building trees. (default: True)
cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the featur

In [None]:
# 75/25 train/test split. The split is seeded (like the classifiers) so that a
# re-run samples the rows with the same seed instead of a fresh random one.
train, test = final_data.randomSplit([0.75, 0.25], seed=42)

In [None]:
# All three estimators predict the same label and share one seed
LABEL_COL = "Spoiled"
MODEL_SEED = -476609685677241813

dtc = DecisionTreeClassifier(
    labelCol=LABEL_COL,
    impurity="entropy",
    seed=MODEL_SEED,
)
rfc = RandomForestClassifier(
    labelCol=LABEL_COL,
    numTrees=100,
    impurity="entropy",
    seed=MODEL_SEED,
)
gbt = GBTClassifier(
    labelCol=LABEL_COL,
    featuresCol="features",
    seed=MODEL_SEED,
)

In [None]:
# Fit each estimator on the training split (decision tree, random forest, GBT)
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
)

In [None]:
# Score the test split with each fitted model
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [None]:
# Preview the first 10 rows of the decision-tree predictions
dtc_preds.show(10)

+---+---+----+---+-------+-------------------+-------------+--------------------+----------+
|  A|  B|   C|  D|Spoiled|           features|rawPrediction|         probability|prediction|
+---+---+----+---+-------+-------------------+-------------+--------------------+----------+
|  1|  1|12.0|  4|    1.0| [1.0,1.0,12.0,4.0]|   [0.0,86.0]|           [0.0,1.0]|       1.0|
|  1|  2| 9.0|  1|    0.0|  [1.0,2.0,9.0,1.0]|  [240.0,1.0]|[0.99585062240663...|       0.0|
|  1|  4| 9.0|  6|    0.0|  [1.0,4.0,9.0,6.0]|  [240.0,1.0]|[0.99585062240663...|       0.0|
|  1|  5|12.0| 10|    1.0|[1.0,5.0,12.0,10.0]|   [0.0,10.0]|           [0.0,1.0]|       1.0|
|  1|  8| 7.0| 10|    0.0| [1.0,8.0,7.0,10.0]|  [240.0,1.0]|[0.99585062240663...|       0.0|
|  1|  8| 8.0|  8|    0.0|  [1.0,8.0,8.0,8.0]|  [240.0,1.0]|[0.99585062240663...|       0.0|
|  1|  9| 7.0|  5|    0.0|  [1.0,9.0,7.0,5.0]|  [240.0,1.0]|[0.99585062240663...|       0.0|
|  1|  9|10.0|  6|    0.0| [1.0,9.0,10.0,6.0]|    [0.0,1.0]|          

Testing the accuracy of our models

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score predictions against the "Spoiled" label using the accuracy metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="Spoiled",
    metricName="accuracy",
)

In [None]:
# Evaluate each model's test-set predictions, then report them side by side
dtc_accuracy = evaluator.evaluate(dtc_preds)
rfc_accuracy = evaluator.evaluate(rfc_preds)
gbt_accuracy = evaluator.evaluate(gbt_preds)

print(f'DTC: {dtc_accuracy}')
print(f'RFC: {rfc_accuracy}')
print(f'GBT: {gbt_accuracy}')

DTC: 0.9596774193548387
RFC: 0.9758064516129032
GBT: 0.967741935483871


These accuracies are very high, which suggests that the models capture the real relationship between the chemicals and spoilage. Therefore, we can examine which variables were the most influential in the predictions.

In [None]:
# Approach found in
# https://stackoverflow.com/questions/28971989/pyspark-mllib-random-forest-feature-importances
# Tip: print(dtc_model.toDebugString) prints the nodes of the decision tree model.
va = assembler

# One row per input column: (name, DTC importance, RFC importance, GBT importance)
feature_names = va.getInputCols()
importances_by_model = (
    dtc_model.featureImportances,
    rfc_model.featureImportances,
    gbt_model.featureImportances,
)
list(zip(feature_names, *importances_by_model))

[('A', 0.014005953378647792, 0.024730775253754205, 0.03711618878251018),
 ('B', 0.0025391503587106233, 0.020541113442031023, 0.052742905096166956),
 ('C', 0.9825814285392451, 0.9271062101149683, 0.8764795573465588),
 ('D', 0.0008734677233964758, 0.027621901189246473, 0.033661348774764024)]

As can be seen, the most influential chemical was C, with an importance of:

*   DecisionTreeClassifier: 98%
*   RandomForestClassifier: 93%
*   GBTClassifier: 88%

Therefore, chemical C is most likely the one causing the food to spoil.


