In [57]:
!pip install pyspark

In [58]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session. The application name matches the task
# performed in this notebook: logistic regression on customer churn.
spark = SparkSession.builder.appName("LogisticRegression").getOrCreate()

In [59]:
# Load the customer-churn CSV; the first row is the header and column types
# are inferred by Spark.
data = spark.read.csv(
    path="../input/logistic-regression/customer_churn.csv",
    header=True,
    inferSchema=True,
)

# Preview the loaded rows.
data.show()

# Goal: predict the Churn column.

In [60]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [61]:
# Print the column names and the types Spark inferred while reading the CSV.
data.printSchema()

In [62]:
# Keep the listed columns, then discard every row that contains a null.
data = data.select(
    "Names",
    "Age",
    "Total_Purchase",
    "Account_Manager",
    "Years",
    "Num_Sites",
    "Onboard_date",
    "Location",
    "Company",
    "Churn",
).na.drop()

In [63]:
# Preview the rows that remain after column selection and null removal.
data.show()

In [64]:
# Stage 1: map each Company string to a numeric index (CompanyIndex).
company_indexer = StringIndexer(
    inputCol="Company",
    outputCol="CompanyIndex",
)

# Stage 2: one-hot encode that index into a vector column (CompanyVec).
company_encoder = OneHotEncoder(
    inputCol="CompanyIndex",
    outputCol="CompanyVec",
)

In [65]:
# Remove outputs left over from a previous run of this cell so it can be
# re-executed safely: without this, the second run fails because the
# CompanyIndex / CompanyVec output columns already exist. drop() is a no-op
# for columns that are not present, so the first run is unaffected.
data = data.drop("CompanyIndex", "CompanyVec")

# Fit and apply the string indexer, then fit and apply the one-hot encoder.
data = company_indexer.fit(data).transform(data)
data = company_encoder.fit(data).transform(data)

# Show the first 5 rows, now including CompanyIndex and CompanyVec.
data.show(5)

In [66]:
# The Names and Location columns are not used; neither are Company and
# CompanyIndex (CompanyVec stands in for them), and Onboard_date is left out
# because it is redundant with Years.

assembler = VectorAssembler(
    inputCols=[
        "Age",
        "Total_Purchase",
        "Account_Manager",
        "Years",
        "Num_Sites",
        "CompanyVec",
    ],
    outputCol="features",
)

In [67]:
# Combine the selected input columns into the single "features" vector column.
output = assembler.transform(data)
# Show the first 5 assembled rows.
output.show(5)

In [68]:
# 70/30 random train/test split of the assembled data; the fixed seed keeps
# the split reproducible.
train, test = output.randomSplit([0.7, 0.3], seed=42)

In [69]:
# Logistic regression that reads "features", learns the "Churn" label and
# writes its predicted class to "prediction".
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Churn",
    predictionCol="prediction",
)

In [70]:
# Fit the logistic regression on the training split.
model = log_reg.fit(train)

In [71]:
# Evaluate the fitted model on the held-out test split; the result is a
# summary object whose `predictions` DataFrame is inspected below.
y_hat = model.evaluate(test)

# Compare the true label with the predicted class side by side.
y_hat.predictions.select("Churn", "prediction").show()

In [72]:
# Area under the ROC curve reported by the test-set evaluation summary.
y_hat.areaUnderROC

In [73]:
# Start over from the raw CSV for the Pipeline-based version of the workflow,
# dropping every row that contains a null.
data = spark.read.csv(
    path="../input/logistic-regression/customer_churn.csv",
    header=True,
    inferSchema=True,
).na.drop()

# Company -> numeric index; handleInvalid="keep" is set on the indexer here
# (the earlier, non-pipeline indexer did not set it).
company_indexer = StringIndexer(
    inputCol="Company",
    outputCol="CompanyIndex",
    handleInvalid="keep",
)

# Numeric index -> one-hot vector.
company_encoder = OneHotEncoder(
    inputCol="CompanyIndex",
    outputCol="CompanyVec",
)

# Gather the model inputs into a single "features" vector column.
assembler = VectorAssembler(
    inputCols=[
        "Age",
        "Total_Purchase",
        "Account_Manager",
        "Years",
        "Num_Sites",
        "CompanyVec",
    ],
    outputCol="features",
)

# Classifier: reads "features", learns "Churn", writes "prediction".
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Churn",
    predictionCol="prediction",
)

In [74]:
# Chain the stages in execution order: index -> encode -> assemble -> classify.
pipeline = Pipeline(
    stages=[
        company_indexer,
        company_encoder,
        assembler,
        log_reg,
    ]
)

In [75]:
# 70/30 random split of the raw (un-transformed) rows, with a fixed seed for
# reproducibility; the pipeline stages are fitted on the training part only.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [76]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(train)

In [77]:
# Run the test split through the fitted pipeline to obtain predictions.
y_hat = model.transform(test)

In [78]:
# Compare the true label with the predicted class side by side.
y_hat.select("Churn", "prediction").show()

In [79]:
# Score the classifier from its raw scores, not from the thresholded class.
# Passing the 0/1 "prediction" column as rawPredictionCol collapses the ROC/PR
# curve to a single operating point and misstates the area; "rawPrediction" is
# the column LogisticRegression writes its per-class scores to by default.
evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = "Churn")

# Display the evaluator's metricName param.
evaluator.metricName

In [80]:
# Evaluate the predictions using the evaluator's currently configured metric.
evaluator.evaluate(dataset = y_hat)

In [81]:
# Evaluate with the metric explicitly set to area under the ROC curve; the
# params map is passed to this call and the evaluator is not reassigned.
evaluator.evaluate(dataset = y_hat, params = {evaluator.metricName: "areaUnderROC"})

In [82]:
# Evaluate with the metric explicitly set to area under the precision-recall
# curve; the params map is passed to this call only.
evaluator.evaluate(dataset = y_hat, params = {evaluator.metricName: "areaUnderPR"})