In [11]:
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('clasificacion_penguins')
    .getOrCreate()
)

# Load the seaborn penguins dataset as a pandas frame and drop the rows that
# contain nulls before converting it to a Spark DataFrame.
penguins_pd = sns.load_dataset('penguins').dropna()
df = spark.createDataFrame(penguins_pd)

# Preview the first five rows.
df.show(5)

+-------+---------+--------------+-------------+-----------------+-----------+------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|
+-------+---------+--------------+-------------+-----------------+-----------+------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|
| Adelie|Torgersen|          40.3|         18.0|            195.0|     3250.0|Female|
| Adelie|Torgersen|          36.7|         19.3|            193.0|     3450.0|Female|
| Adelie|Torgersen|          39.3|         20.6|            190.0|     3650.0|  Male|
+-------+---------+--------------+-------------+-----------------+-----------+------+
only showing top 5 rows



## Encoding y Assembler

Para poder usar las columnas categóricas tenemos que codificarlas, igual que ocurría en scikit-learn.

La diferencia es que, para poder usar OneHotEncoder, primero tenemos que aplicar StringIndexer, porque el OneHotEncoder de Spark requiere índices numéricos de categoría en lugar de cadenas de texto.

In [12]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

# 1. StringIndexers
# Encode the `species` strings as numeric indices in a new `species_indexed` column.
#
# Drop any `species_indexed` column left over from a previous execution of this
# cell first. DataFrame.drop is a no-op when the column is absent, so the first
# run is unchanged, while a re-run no longer fails with
# "Output column species_indexed already exists".
df = df.drop('species_indexed')
indexer_species = StringIndexer(inputCol='species', outputCol='species_indexed')
df = indexer_species.fit(df).transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+
only showing top 2 rows



In [13]:
# Target column: `island`, encoded as a numeric index in the `label` column.
#
# Drop any `label` column left over from a previous execution of this cell
# first. DataFrame.drop is a no-op when the column is absent, so the first run
# is unchanged, while a re-run no longer fails with
# "Output column label already exists".
df = df.drop('label')
indexer_islands = StringIndexer(inputCol='island', outputCol='label')
df = indexer_islands.fit(df).transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+
only showing top 2 rows



In [14]:
# `sex` column, encoded as a numeric index in the `sex_indexed` column.
#
# Drop any `sex_indexed` column left over from a previous execution of this
# cell first. DataFrame.drop is a no-op when the column is absent, so the first
# run is unchanged, while a re-run no longer fails with
# "Output column sex_indexed already exists".
df = df.drop('sex_indexed')
indexer_sex = StringIndexer(inputCol='sex', outputCol='sex_indexed')
df = indexer_sex.fit(df).transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
only showing top 2 rows



In [15]:
# 2. OneHotEncoder over the categorical inputs that will go into `features`:
#    species_indexed and sex_indexed.
# StringIndexer must have been applied beforehand so the categorical columns are
# numeric indices; otherwise an IllegalArgumentException is raised.

# Drop one-hot columns left over from a previous execution of this cell.
# DataFrame.drop is a no-op for columns that are absent, so the first run is
# unchanged, while a re-run no longer fails because the output columns exist.
df = df.drop('species_onehot', 'sex_onehot')

encoder = OneHotEncoder(
    inputCols=['species_indexed', 'sex_indexed'],
    outputCols=['species_onehot', 'sex_onehot']
)
df = encoder.fit(df).transform(df)

# Each new column is a SparseVector, which saves memory compared with a dense
# vector. With the encoder's default `dropLast` setting the last category is
# represented by an all-zero vector, which is why the two-valued `sex` column
# becomes a length-1 vector.
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|species_onehot|   sex_onehot|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0| (2,[0],[1.0])|(1,[0],[1.0])|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0| (2,[0],[1.0])|    (1,[],[])|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+
only showing top 2 rows



In [17]:
# 3. VectorAssembler
# Concatenate the numeric measurements and the one-hot vectors into a single
# `features` vector column, which is what the estimators read.

# Drop any `features` column left over from a previous execution of this cell.
# DataFrame.drop is a no-op when the column is absent, so the first run is
# unchanged, while a re-run no longer fails with
# "Output column features already exists".
df = df.drop('features')

assembler = VectorAssembler(
    inputCols=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species_onehot', 'sex_onehot'],
    outputCol='features'
)
df = assembler.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+--------------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|species_onehot|   sex_onehot|            features|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+--------------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0| (2,[0],[1.0])|(1,[0],[1.0])|[39.1,18.7,181.0,...|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0| (2,[0],[1.0])|    (1,[],[])|[39.5,17.4,186.0,...|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+--------------------+
only showing top 2 rows

In [19]:
# Keep only the columns needed for modelling: the assembled feature vector
# and the target label.
model_columns = ['features', 'label']
df_to_predict = df.select(*model_columns)
df_to_predict.show(2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[39.1,18.7,181.0,...|  2.0|
|[39.5,17.4,186.0,...|  2.0|
+--------------------+-----+
only showing top 2 rows



In [20]:
# Random train/test split with weights 0.8 / 0.2, seeded with 42.
train_test_parts = df_to_predict.randomSplit(weights=[0.8, 0.2], seed=42)
df_train, df_test = train_test_parts

In [23]:
from pyspark.ml.classification import (
    LogisticRegression, 
    DecisionTreeClassifier, 
    RandomForestClassifier, 
    GBTClassifier, 
    MultilayerPerceptronClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [22]:
# Baseline classifier: logistic regression with default hyperparameters.
# The column names are spelled out; they match the estimator's defaults.
lr = LogisticRegression(featuresCol='features', labelCol='label')

# Fit on the training split, then score the held-out split.
model = lr.fit(df_train)
df_pred = model.transform(df_test)

# Preview the prediction columns added by the model.
df_pred.show(3)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[35.9,19.2,189.0,...|  0.0|[-0.3957289061333...|[0.21635034123350...|       2.0|
|[37.8,18.3,174.0,...|  0.0|[0.83150611497343...|[0.62530484025805...|       0.0|
|[38.6,21.2,191.0,...|  2.0|[-0.1358787517031...|[0.28962512108041...|       1.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 3 rows



In [32]:
# One MulticlassClassificationEvaluator per metric reported in this notebook,
# built in the same order as the names on the left-hand side.
evaluator_accuracy, evaluator_f1, evaluator_precision, evaluator_recall = (
    MulticlassClassificationEvaluator(metricName=metric)
    for metric in ('accuracy', 'f1', 'weightedPrecision', 'weightedRecall')
)
# There is no AUC metric for multiclass: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator.metricName
# evaluator_auc = MulticlassClassificationEvaluator(metricName='areaUnderROC')
# Alternative: derive TP, FP, TN, FN manually

In [33]:
# Report every metric for the current predictions, one "<name> <value>" line each.
for metric_name, evaluator in (
    ('accuracy', evaluator_accuracy),
    ('f1', evaluator_f1),
    ('precision', evaluator_precision),
    ('recall', evaluator_recall),
):
    print(metric_name, evaluator.evaluate(df_pred))

accuracy 0.6612903225806451
f1 0.6618494892809561
precision 0.6808542413381122
recall 0.6612903225806451


In [35]:
# Neural network for multiclass classification (multilayer perceptron)

# Input layer width: size of the assembled feature vector.
first_train_row = df_train.first()
num_features = first_train_row['features'].size
# Output layer width: number of distinct label values.
num_labels = df.select('label').distinct().count()

# layers = [input layer, hidden layers..., output layer]
layer_sizes = [num_features, 32, 32, num_labels]
mlp = MultilayerPerceptronClassifier(layers=layer_sizes, seed=42, maxIter=10)

# Fit on the training split and score the held-out split.
model = mlp.fit(df_train)
df_pred = model.transform(df_test)

# Report the same four metrics used for the other models.
for metric_name, evaluator in (
    ('accuracy', evaluator_accuracy),
    ('f1', evaluator_f1),
    ('precision', evaluator_precision),
    ('recall', evaluator_recall),
):
    print(metric_name, evaluator.evaluate(df_pred))

accuracy 0.532258064516129
f1 0.3697792869269949
precision 0.28329864724245574
recall 0.532258064516129


In [37]:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.GBTClassifier.html
# Multiclass labels are not currently supported.
# For now GBTClassifier only works for binary classification; with more than 2 labels it raises an error.
# The direct attempt is kept below, commented out, for reference:
# gpt = GBTClassifier(seed=42)
# model = gpt.fit(df_train)
# df_pred = model.transform(df_test)
# print('accuracy', evaluator_accuracy.evaluate(df_pred))
# print('f1', evaluator_f1.evaluate(df_pred))
# print('precision', evaluator_precision.evaluate(df_pred))
# print('recall', evaluator_recall.evaluate(df_pred))

In [38]:
# Possible workaround:
# turn the multiclass problem into several binary problems
# using OneVsRest, which trains multiple GBTClassifier models,
# one for each class against the rest.
from pyspark.ml.classification import OneVsRest

# Pin the seed explicitly, as the MLP cell does, so the result does not depend
# on the estimator's default seed.
gbt = GBTClassifier(seed=42)

# Wrap the binary-only classifier so it can be trained on the multiclass label.
ovr = OneVsRest(classifier=gbt)
model = ovr.fit(df_train)
df_pred = model.transform(df_test)

# Report the same four metrics used for the other models.
print('accuracy', evaluator_accuracy.evaluate(df_pred))
print('f1', evaluator_f1.evaluate(df_pred))
print('precision', evaluator_precision.evaluate(df_pred))
print('recall', evaluator_recall.evaluate(df_pred))

accuracy 0.6129032258064516
f1 0.6093997421164548
precision 0.6165703040007823
recall 0.6129032258064516
