In [1]:
!pip install pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

# Get the active Spark session, or create one registered under the app name "LogisticRegression".
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

In [3]:
# Read the training set, stored in LIBSVM format.
data = spark.read.load(
    "../input/logistic-regression/sample_libsvm_data.txt",
    format="libsvm",
)

# No need to name X, Y and Y-hat (features, label and prediction) explicitly here:
# the data already uses those column names, and they are the ones the model
# picks up by default.
log_reg = LogisticRegression()

# Train the classifier on the loaded data.
model = log_reg.fit(data)

# Summary object exposed by the fitted model.
summary = model.summary

In [4]:
# Print the predictions DataFrame held by the fitted model's summary.
summary.predictions.show()

In [5]:
# Evaluate the fitted model on `data` (the same DataFrame it was fitted on);
# the returned object is displayed as the cell output.
model.evaluate(data)

In [6]:
# Keep the evaluation result returned for `data` in `y_hat`.
# NOTE(review): at this point `y_hat` is the evaluation result object, not a
# DataFrame; the following cell rebinds the same name to a DataFrame.
y_hat = model.evaluate(data)

# Print the predictions DataFrame carried by the evaluation result.
y_hat.predictions.show()

In [7]:
# Keep only the true label and the predicted class for the metric computations below.
# The DataFrame is rebuilt from `model` and `data` instead of from the previous value
# of `y_hat`, so this cell can be re-run safely: the self-referential form
# (`y_hat = y_hat.predictions...`) fails on a second run because `y_hat` is by then
# already a DataFrame with no `.predictions` attribute.
y_hat = model.evaluate(data).predictions.select("label", "prediction")
y_hat.show()

`Documentación:` https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.evaluation.MulticlassMetrics.html

In [8]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [10]:
# MulticlassMetrics expects an RDD of (prediction, label) pairs, in that order.
# `y_hat` is laid out as (label, prediction), so the columns are reordered explicitly;
# passing `y_hat.rdd` as-is would swap the two roles (transposed confusion matrix,
# precision and recall exchanged).
metrics = MulticlassMetrics(y_hat.select("prediction", "label").rdd)

# The two classes of this binary problem.
class_labels = (0.0, 1.0)

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

# Per-class precision, recall and F1-score, reported in that order.
for metric_title, metric_fn in (
    ("Precision", metrics.precision),
    ("Recall", metrics.recall),
    ("F1-Score", metrics.fMeasure),
):
    for class_label in class_labels:
        print(f"{metric_title} label {class_label:.0f}")
        print(metric_fn(label=class_label))

# Accuracy
print("Accuracy")
print(metrics.accuracy)

# False positive rate per class
for class_label in class_labels:
    print(f"Falsos positivos label {class_label:.0f}")
    print(metrics.falsePositiveRate(label=class_label))

## `Evaluators`

`Documentación: Binary` https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.BinaryClassificationEvaluator.html

`Documentación: MultiClass` https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
# Binary evaluator that reads its score from the "prediction" column and the truth from "label".
# NOTE(review): "prediction" holds the hard predicted class rather than a continuous score,
# so the resulting curve has very few thresholds -- consider evaluating on the model's
# raw score column instead; confirm against the intended analysis.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction")

# By default the metric is the area under the ROC curve.
evaluator.metricName

In [13]:
# No metric override: the evaluator's configured metric is used.
evaluator.evaluate(y_hat)

In [14]:
# With an explicit metric override passed through the params map.
evaluator.evaluate(y_hat, {evaluator.metricName: "areaUnderROC"})

In [15]:
# Area under the precision-recall curve; this metric can be useful when the classes are imbalanced.
evaluator.evaluate(y_hat, {evaluator.metricName: "areaUnderPR"})

In [16]:
# Multiclass evaluator configured to report accuracy.
evaluator_m = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")

# Score with the multiclass evaluator built above. The binary `evaluator` would return
# an area-under-curve value here, not the accuracy this variable is named after.
accuracy = evaluator_m.evaluate(y_hat)

accuracy

In [None]:
################################################################################################################################