In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [2]:
# Check the Spark session: evaluating it as the last expression displays it.
spark

In [4]:
# Load the CSV file: the first row is the header and column types are inferred.
df = spark.read.csv('dog_food.csv', header=True, inferSchema=True)

In [5]:
# Print the column names and data types of the loaded DataFrame.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [6]:
# Summary statistics per column. We observe that there are no nulls and that
# all values are numeric.
df.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [7]:
# Column names of the DataFrame.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [8]:
# Pack the four feature columns (A-D) into a single vector column named
# 'features' with a VectorAssembler.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')
output = assembler.transform(df)

# Rename Spoiled to 'label' so the frame matches the input column names the
# tree classifiers expect, keeping only the label and the feature vector.
final_data = output.selectExpr("Spoiled as label", "features as features")

# Preview the final DataFrame.
final_data.show()

+-----+-------------------+
|label|           features|
+-----+-------------------+
|  1.0| [4.0,2.0,12.0,3.0]|
|  1.0| [5.0,6.0,12.0,7.0]|
|  1.0| [6.0,2.0,13.0,6.0]|
|  1.0| [4.0,2.0,12.0,1.0]|
|  1.0| [4.0,2.0,12.0,3.0]|
|  1.0|[10.0,3.0,13.0,9.0]|
|  1.0| [8.0,5.0,14.0,5.0]|
|  1.0| [5.0,8.0,12.0,8.0]|
|  1.0| [6.0,5.0,12.0,9.0]|
|  1.0| [3.0,3.0,12.0,1.0]|
|  1.0| [9.0,8.0,11.0,3.0]|
|  1.0|[1.0,10.0,12.0,3.0]|
|  1.0|[1.0,5.0,13.0,10.0]|
|  1.0|[2.0,10.0,12.0,6.0]|
|  1.0|[1.0,10.0,11.0,4.0]|
|  1.0| [5.0,3.0,12.0,2.0]|
|  1.0| [4.0,9.0,11.0,8.0]|
|  1.0| [5.0,1.0,11.0,1.0]|
|  1.0|[4.0,9.0,12.0,10.0]|
|  1.0| [5.0,8.0,10.0,9.0]|
+-----+-------------------+
only showing top 20 rows



In [9]:
# Import the tree-based classification estimators.
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                       DecisionTreeClassifier)

In [10]:
# Split the data into training (70%) and test (30%) sets.
# A fixed seed keeps the random sampling repeatable, so re-running the notebook
# on the same input yields the same split instead of a different one each time.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [11]:
# Instantiate the three tree-based classifiers to compare.
# The seed is pinned explicitly so the randomized parts of training (e.g. the
# random forest's sampling) are repeatable rather than relying on the
# library default seed.
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=100, seed=42)
gbt = GBTClassifier(seed=42)

In [12]:
# Fit each classifier on the training set, in the order DTC, RFC, GBT.
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
]

In [13]:
# Score the test set with each fitted model to obtain its predictions.
dtc_preds, rfc_preds, gbt_preds = [
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
]

In [14]:
# Show the decision-tree predictions.
dtc_preds.show()

+-----+------------------+-------------+--------------------+----------+
|label|          features|rawPrediction|         probability|prediction|
+-----+------------------+-------------+--------------------+----------+
|  0.0| [1.0,3.0,8.0,3.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [1.0,4.0,8.0,7.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [1.0,4.0,9.0,3.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [1.0,4.0,9.0,6.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [1.0,7.0,7.0,2.0]|   [34.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [1.0,7.0,8.0,4.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|[1.0,8.0,7.0,10.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [2.0,1.0,8.0,9.0]|  [198.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0| [2.0,1.0,9.0,1.0]|    [6.0,0.0]|           [1.0,0.0]|       0.0|
|  0.0|[2.0,1.0,10.0,7.0]|   [11.0,3.0]|[0.78571428571428...|       0.0|
|  0.0| [2.0,2.0,8.0,1.0]|    [6.0,0.0]|           

In [15]:
# Accuracy of the predictions is measured with a multiclass classification
# evaluator configured for the 'accuracy' metric.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
)

In [16]:
# Final result: print the test-set accuracy of each model, one per line.
for tag, scored in (('DTC', dtc_preds), ('RFC', rfc_preds), ('GBT', gbt_preds)):
    print(f'{tag}: {evaluator.evaluate(scored)}')

DTC: 0.9772727272727273
RFC: 0.9924242424242424
GBT: 0.9696969696969697
