In [2]:
!pip install pyspark

## LogisticRegression

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

In [5]:
# Read the CSV with a header row and let Spark infer the column types,
# then print the resulting schema.
data = spark.read.csv(
    "../input/logistic-regression/titanic.csv",
    header = True,
    inferSchema = True,
)

data.printSchema()

In [6]:
# Keep the label plus the columns used as model inputs, then drop every row
# that has a null in any of those columns.
model_columns = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
data = data.select(model_columns).na.drop()

In [7]:
# Preview the cleaned rows (20 is the default row count of `show`).
data.show(n = 20)

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

In [9]:
# "Sex": string label -> numeric index -> one-hot vector.
gender_indexer = StringIndexer(
    inputCol = "Sex",
    outputCol = "SexIndex",
)
gender_encoder = OneHotEncoder(
    inputCol = "SexIndex",
    outputCol = "SexVec",
)

In [10]:
# Fit each stage and apply it in order: the encoder is fitted on the frame
# that already contains the indexer's output column.
# NOTE: `data` is reassigned here, so this cell is meant to run once per freshly loaded `data`.
sex_index_model = gender_indexer.fit(data)
data = sex_index_model.transform(data)

sex_encoder_model = gender_encoder.fit(data)
data = sex_encoder_model.transform(data)

data.show(n = 5)

In [11]:
# "Embarked": string label -> numeric index -> one-hot vector.
embark_indexer = StringIndexer(
    inputCol = "Embarked",
    outputCol = "EmbarkIndex",
)
embark_encoder = OneHotEncoder(
    inputCol = "EmbarkIndex",
    outputCol = "EmbarkVec",
)

In [12]:
# Fit each stage and apply it in order: the encoder is fitted on the frame
# that already contains the indexer's output column.
# NOTE: `data` is reassigned here, so this cell is meant to run once per freshly loaded `data`.
embark_index_model = embark_indexer.fit(data)
data = embark_index_model.transform(data)

embark_encoder_model = embark_encoder.fit(data)
data = embark_encoder_model.transform(data)

data.show(n = 5)

In [13]:
# Combine the numeric columns and the one-hot vectors into a single
# "features" vector column.
assembler = VectorAssembler(
    inputCols = [
        "Pclass",
        "SexVec",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "EmbarkVec",
    ],
    outputCol = "features",
)

In [14]:
# Add the assembled "features" column and preview the first rows.
output = assembler.transform(dataset = data)
output.show(n = 5)

In [15]:
# 70/30 train/test split with a fixed seed.
splits = output.randomSplit([0.7, 0.3], seed = 42)
train, test = splits

In [16]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled "features" column, predicting "Survived".
log_reg = LogisticRegression(
    labelCol = "Survived",
    featuresCol = "features",
    predictionCol = "prediction",
)

In [17]:
# Train the logistic regression on the training split.
model = log_reg.fit(dataset = train)

In [18]:
# Evaluate the fitted model on the test split and compare the true label
# with the predicted one.
y_hat = model.evaluate(dataset = test)

test_predictions = y_hat.predictions
test_predictions.select(["Survived", "prediction"]).show()

In [19]:
# Area under the ROC curve, read from the evaluation summary held in `y_hat`.
y_hat.areaUnderROC

## `Pipeline`

In [20]:
from pyspark.ml import Pipeline


# Reload the raw CSV so the Pipeline section starts from untouched data.
data = spark.read.csv(path = "../input/logistic-regression/titanic.csv",
                      inferSchema = True, header = True)

# Keep only the label and the model inputs *before* dropping nulls.
# `na.drop()` removes a row when ANY remaining column is null, so dropping on
# the full frame would also discard rows because of nulls in columns the model
# never uses (presumably e.g. `Cabin` -- verify with `printSchema()`), and the
# pipeline would be trained on different data than the step-by-step section.
data = data.select(["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"])
data = data.na.drop()

# Categorical columns: string label -> numeric index -> one-hot vector.
gender_indexer = StringIndexer(inputCol = "Sex", outputCol = "SexIndex")
gender_encoder = OneHotEncoder(inputCol = "SexIndex", outputCol = "SexVec")

embark_indexer = StringIndexer(inputCol = "Embarked", outputCol = "EmbarkIndex")
embark_encoder = OneHotEncoder(inputCol = "EmbarkIndex", outputCol = "EmbarkVec")

# Assemble numeric columns and one-hot vectors into a single "features" column.
assembler = VectorAssembler(inputCols = ["Pclass", "SexVec", "Age", "SibSp", "Parch", "Fare", "EmbarkVec"],
                            outputCol = "features")

# Classifier trained on "features" to predict "Survived".
log_reg = LogisticRegression(featuresCol = "features",
                             labelCol = "Survived",
                             predictionCol = "prediction")

In [21]:
# Stages run in list order: both indexers first, then the encoders that read
# their output, then the assembler, and finally the classifier.
pipeline_stages = [
    gender_indexer,
    embark_indexer,
    gender_encoder,
    embark_encoder,
    assembler,
    log_reg,
]
pipeline = Pipeline(stages = pipeline_stages)

In [22]:
# 70/30 train/test split of the raw (un-transformed) rows, with a fixed seed.
splits = data.randomSplit([0.7, 0.3], seed = 42)
train, test = splits

In [23]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset = train)

In [24]:
# Run the test split through the fitted pipeline to obtain predictions.
y_hat = model.transform(dataset = test)

In [25]:
# Compare the true label with the predicted one.
label_vs_prediction = y_hat.select(["Survived", "prediction"])
label_vs_prediction.show()

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score with the model's continuous `rawPrediction` column, not the hard 0/1
# `prediction` column: areaUnderROC / areaUnderPR are ranking metrics, and with
# already-thresholded labels the curve collapses to a single operating point.
evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = "Survived")

# By default the metric is the area under the ROC curve;
# getMetricName() returns the configured value (the bare `metricName`
# attribute is only the Param descriptor).
evaluator.getMetricName()

In [27]:
# Without metric parameters: the evaluator's configured metric is used
evaluator.evaluate(y_hat)

In [28]:
# With parameters: override the metric for this call only
roc_params = {evaluator.metricName: "areaUnderROC"}
evaluator.evaluate(y_hat, roc_params)

In [29]:
# Area under the precision-recall curve; this metric can be useful when the
# classes are imbalanced
pr_params = {evaluator.metricName: "areaUnderPR"}
evaluator.evaluate(y_hat, pr_params)

In [30]:
################################################################################################################################