

# Spark ML Clasificación

Cargamos un dataset con información sobre señales de procesos. En estas señales se producen partículas supersimétricas y ruido, lo cual está indicado en el dataset.

In [1]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3.6'



### Crear SparkSession

In [2]:
# Respuesta

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()



### Cargar datos y comprobar schema

In [3]:
# Respuesta

particles = spark.read.csv('Data/susy.csv', sep=',', header=True, inferSchema=True)

particles.printSchema()

root
 |-- label: double (nullable = true)
 |-- lepton1_pt: double (nullable = true)
 |-- lepton1_eta: double (nullable = true)
 |-- lepton1_phi: double (nullable = true)
 |-- lepton2_pt: double (nullable = true)
 |-- lepton2_eta: double (nullable = true)
 |-- lepton2_phi: double (nullable = true)
 |-- missing_energy_magnitude: double (nullable = true)
 |-- missing_energy_phi: double (nullable = true)
 |-- met_rel: double (nullable = true)
 |-- axial_met: double (nullable = true)
 |-- m_r: double (nullable = true)
 |-- m_tr?: double (nullable = true)
 |-- r: double (nullable = true)
 |-- mt2: double (nullable = true)
 |-- s_r: double (nullable = true)
 |-- m_delta_r: double (nullable = true)
 |-- dphi_r_b: double (nullable = true)
 |-- cos(theta_r1): double (nullable = true)



In [4]:
# Respuesta

particles.show()

+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+
|label|         lepton1_pt|         lepton1_eta|         lepton1_phi|         lepton2_pt|         lepton2_eta|         lepton2_phi|missing_energy_magnitude|  missing_energy_phi|             met_rel|           axial_met|                m_r|              m_tr?|                  r|                 mt2|                s_r|          m_delta_r|           dphi_r_b|       cos(theta_r1)|
+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+-----------------



* Verificar valores nulos

In [5]:
# Respuesta

from pyspark.sql import functions as F

for column in particles.columns:
    if particles.where(F.col(column).isNull()).count() != 0:
        print("\tBe careful: there are null values in the column '{}'".format(column))
    else:
        print("The column '{}' does not have null values".format(column))

The column 'label' does not have null values
The column 'lepton1_pt' does not have null values
The column 'lepton1_eta' does not have null values
The column 'lepton1_phi' does not have null values
The column 'lepton2_pt' does not have null values
The column 'lepton2_eta' does not have null values
The column 'lepton2_phi' does not have null values
The column 'missing_energy_magnitude' does not have null values
The column 'missing_energy_phi' does not have null values
The column 'met_rel' does not have null values
The column 'axial_met' does not have null values
The column 'm_r' does not have null values
The column 'm_tr?' does not have null values
The column 'r' does not have null values
The column 'mt2' does not have null values
The column 's_r' does not have null values
The column 'm_delta_r' does not have null values
The column 'dphi_r_b' does not have null values
The column 'cos(theta_r1)' does not have null values




Nos disponemos a lanzar un algoritmo de clasificación para categorizar las señales como ruido o partículas supersimétricas



#### Pasos previos

* VectorAssembler con variables deseadas

Se toman todas aquellas que son numéricas menos la objetivo (en este caso es 'label')

In [6]:
# Respuesta

from pyspark.ml.feature import VectorAssembler

variables_vector_assembler = [element for element in particles.columns if element != 'label']

vector_assemmbler = VectorAssembler(inputCols = variables_vector_assembler, outputCol = 'assembled_features')

particles = vector_assemmbler.transform(particles)

particles.show()

+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|label|         lepton1_pt|         lepton1_eta|         lepton1_phi|         lepton2_pt|         lepton2_eta|         lepton2_phi|missing_energy_magnitude|  missing_energy_phi|             met_rel|           axial_met|                m_r|              m_tr?|                  r|                 mt2|                s_r|          m_delta_r|           dphi_r_b|       cos(theta_r1)|  assembled_features|
+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+-----------------



- Partir dataset entre train y test

In [7]:
# Respuesta

particles_train, particles_test = particles.randomSplit([0.8,0.2])



### Regresión Logística

In [12]:
# Respuesta

from pyspark.ml.classification import LogisticRegression

thld_label_1 = 0.45 # Try other values, like:  0.01 , 0.15, 0.30, 0.5 (default)
logistic_regression = LogisticRegression(featuresCol= 'assembled_features', labelCol='label', threshold=thld_label_1 )
print ("Logistic regression threshold for 'label = 1.0' is: ",logistic_regression.getThreshold())

logistic_regression_model = logistic_regression.fit(particles_train)
print("Logistic regression coefficients: " + str(logistic_regression_model.coefficientMatrix))
print("Logistic regression intercept: " + str(logistic_regression_model.interceptVector))

particles_logistic_regression = logistic_regression_model.transform(particles_test)

particles_logistic_regression.show(5, truncate=False)

Logistic regression threshold for 'label = 1.0' is:  0.4
Logistic regression coefficients: DenseMatrix([[ 2.32956856e+00,  3.17468536e-03, -3.60866203e-03,
               4.49716865e-01, -2.34635637e-03, -2.14429354e-03,
               4.69166190e+00, -9.47318997e-04, -4.19164258e-01,
               3.04251699e-01,  2.91315677e-01, -1.59194299e+00,
              -1.71723384e+00,  1.06278579e-01, -2.10461597e+00,
               5.20508298e-01, -6.04997614e-01,  1.08135410e+00]])
Logistic regression intercept: [-1.6740692038683986]
+-----+-------------------+--------------------+---------------------+-------------------+--------------------+-------------------+------------------------+-------------------+------------------+--------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+--------------------+--------------------------------------------------------------------------------------------



Lo primero que debemos entender es que el modelo regresión logística de la librería Spark, busca clasificar variables _target_ 'label = 1.0'. Esta clasificación tiene un umbral de corte de probabilidad y está asociado con el parámetro _threshold_ (se recomienda probar distintos valores).

Otra cosa importante del modelo  regresión logística, es que cuando  se realiza para predicción con el método *transform* se obtienen 3 columnas: *rawPrediction*, *probability*  y *prediction*

A modo de ejemplo, se analiza un registro:

| rawPrediction | probability | prediction |
| :----------: | :----------: | :----------: |
| [3.38, -3.38] | [0.96, 0.03]| 0.0|

En la columna *rawPrediction* existen dos valores, el valor -3.38 sale de aplicar la siguiente formula: $\beta X +\beta_0$ Los valores en la columna *probability* salen de aplicar la función sigmoide. Finalmente la columna de *prediction* aplica el _threshold=0.45_ sobre el segundo valor de probabilidad (es decir 0.03) y determinar que dicho registro debe ser clasificado como 0.0




### Random Forest

In [14]:
# Respuesta

from pyspark.ml.classification import RandomForestClassifier

random_forest = RandomForestClassifier(featuresCol= 'assembled_features', labelCol='label',
                                      maxDepth=8, numTrees=128, impurity="gini")

random_forest_model = random_forest.fit(particles_train)
print('Learned classification random forest model:')
print("\t",random_forest_model.getNumTrees)
print("\t",random_forest_model.featureImportances)

particles_random_forest = random_forest_model.transform(particles_test)

particles_random_forest.show()

+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+-------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|         lepton1_pt|         lepton1_eta|         lepton1_phi|         lepton2_pt|         lepton2_eta|         lepton2_phi|missing_energy_magnitude|  missing_energy_phi|            met_rel|           axial_met|                m_r|              m_tr?|                 r|                mt2|                s_r|          m_delta_r|          dphi_r_b|       cos(theta_r1)|  assembled_features|       rawPrediction|         probability|prediction|
+-----+-------------------+--------------------+--------------------+-------------------+-------



### Gradient Boosting Trees

In [28]:
# Respuesta

from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(featuresCol= 'assembled_features', labelCol='label', maxIter=8, maxDepth=10, seed=1023)

gbt_model = gbt.fit(particles_train)

particles_gbt = gbt_model.transform(particles_test)

particles_gbt.show()

+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+-------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+------------------+--------------------+--------------------+----------+
|label|         lepton1_pt|         lepton1_eta|         lepton1_phi|         lepton2_pt|         lepton2_eta|         lepton2_phi|missing_energy_magnitude|  missing_energy_phi|            met_rel|           axial_met|                m_r|              m_tr?|                 r|                mt2|                s_r|          m_delta_r|          dphi_r_b|       cos(theta_r1)|  assembled_features|prediction|
+-----+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------------------+--------------------+---


# Evaluación de los modelos



Importamos las librerias necesarias para evaluar los modelos

In [30]:
# Respuesta

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics



**Para la AUC**

Una curva ROC (curva de característica operativa del receptor) es un gráfico que muestra el rendimiento de un modelo de clasificación en todos los umbrales (_thresholds_) de clasificación. Cuanto mayor sea esta medida, mejor será nuestro modelo capaz de diferenciar entre classes.

In [31]:
# Respuesta

auc = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderROC')



Imprimir resultados para los distintos modelos

In [12]:
# Respuesta

print("AUC logsitic regression: {}".format(auc.evaluate(particles_logistic_regression)))

print("AUC random forest: {}".format(auc.evaluate(particles_random_forest)))

# Please note that Spark 2.1 has not "rawPrediction" for GBT
# print("AUC gbt: {}".format(auc.evaluate(particles_gbt)))

AUC logsitic regression: 0.85798210090464
AUC random forest: 0.8458421437573672




***Para otras métricas***

- aquí se llamará una función de la librería mllib. Por lo tanto, se deberá trabajar con un RDD y no con un DataFrame



Regresión Logísitca

In [13]:
# Respuesta

metrics = MulticlassMetrics(particles_logistic_regression.select('prediction', 'label').rdd)

recall = metrics.recall(label=1)
precision = metrics.precision(label=1)
f1 = metrics.fMeasure()
confusion_matrix = metrics.confusionMatrix()

print("Recall: {}".format(recall))
print("Precision: {}".format(precision))
print("f1: {}".format(f1))
print("Confusion matrix: {}".format(confusion_matrix))



Recall: 0.6770936966561594
Precision: 0.8300276012581038
f1: 0.7890095791585086
Confusion matrix: DenseMatrix([[100162.,  13240.],
             [ 30834.,  64655.]])




Random Forest

In [14]:
# Respuesta

metrics = MulticlassMetrics(particles_random_forest.select('prediction', 'label').rdd)

recall = metrics.recall(label=1)
precision = metrics.precision(label=1)
f1 = metrics.fMeasure()
confusion_matrix = metrics.confusionMatrix()

print("Recall: {}".format(recall))
print("Precision: {}".format(precision))
print("f1: {}".format(f1))
print("Confusion matrix: {}".format(confusion_matrix))



Recall: 0.6558347034736985
Precision: 0.8179858934169278
f1: 0.7759644982311349
Confusion matrix: DenseMatrix([[99467., 13935.],
             [32864., 62625.]])




GBT

In [18]:
# Respuesta
metrics = MulticlassMetrics(particles_gbt.select('prediction', 'label').rdd)

recall = metrics.recall(label=1)
precision = metrics.precision(label=1)
f1 = metrics.fMeasure()
confusion_matrix = metrics.confusionMatrix()

print("Recall: {}".format(recall))
print("Precision: {}".format(precision))
print("f1: {}".format(f1))
print("Confusion matrix: {}".format(confusion_matrix))




Recall: 0.7066782561342144
Precision: 0.8182865458073122
f1: 0.7941797396728437
Confusion matrix: DenseMatrix([[98417., 14985.],
             [28009., 67480.]])
