In [23]:
!pip install pyspark

In [24]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: reuse the active one if it
# exists, otherwise create it under the application name "LogisticRegression".
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

In [25]:
# Load the customer-churn CSV. The first row is treated as the header and
# column types are inferred from the data.
data = spark.read.csv(
    "/kaggle/input/pyspark-ml-logistic-regression/customer_churn.csv",
    header=True,
    inferSchema=True,
)

# Print a preview of the loaded DataFrame.
data.show()

# Goal: predict the Churn column

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import LogisticRegression

In [27]:
from pyspark.ml import Pipeline

# Reload the raw CSV (header row, inferred types) and drop every row that
# contains a null in any column.
data = spark.read.csv(
    "/kaggle/input/pyspark-ml-logistic-regression/customer_churn.csv",
    header=True,
    inferSchema=True,
).dropna()

# Encode the "Company" column: string -> numeric index -> one-hot vector.
# handleInvalid="keep" tells the indexer to keep values it did not see at fit
# time instead of raising on them.
company_indexer = StringIndexer(
    inputCol="Company",
    outputCol="CompanyIndex",
    handleInvalid="keep",
)
company_encoder = OneHotEncoder(inputCol="CompanyIndex", outputCol="CompanyVec")

# Combine the listed input columns and the encoded company vector into the
# single "features" vector column consumed by the classifier.
assembler = VectorAssembler(
    inputCols=[
        "Age",
        "Total_Purchase",
        "Account_Manager",
        "Years",
        "Num_Sites",
        "CompanyVec",
    ],
    outputCol="features",
)

# Logistic regression that reads "features", learns "Churn", and writes its
# predicted label to "prediction".
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Churn",
    predictionCol="prediction",
)

In [28]:
# Chain the stages in execution order: index Company, one-hot encode it,
# assemble the feature vector, then run the logistic regression.
pipeline = Pipeline(
    stages=[
        company_indexer,
        company_encoder,
        assembler,
        log_reg,
    ]
)

In [29]:
# Random 75% / 25% train-test split; the fixed seed keeps the split repeatable.
train, test = data.randomSplit([0.75, 0.25], seed=42)

In [30]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=train)

In [31]:
# Run the fitted pipeline over the held-out split to obtain its predictions.
y_hat = model.transform(dataset=test)

In [32]:
# Show the true label next to the predicted label for a quick visual check.
(
    y_hat
    .select("Churn", "prediction")
    .show()
)

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Binary-classification evaluator for the "Churn" label.
# The area-under-curve metrics must be computed from the model's continuous
# scores, not from the thresholded 0/1 "prediction" column: hard labels
# collapse the ROC / PR curve to a single operating point and distort the
# metric. "rawPrediction" is the default raw-score column emitted by
# LogisticRegression, whose rawPredictionCol is not overridden in this notebook.
evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction",
                                          labelCol = "Churn")

In [34]:
# Area under the ROC curve; the metric is selected through the params override.
evaluator.evaluate(y_hat, {evaluator.metricName: "areaUnderROC"})

In [35]:
# Area under the precision-recall curve; this metric can be useful when the
# classes are imbalanced.
evaluator.evaluate(y_hat, {evaluator.metricName: "areaUnderPR"})

In [36]:
#############################################################################################################################################################