In [126]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init() #instanciar spark session. Es conveniente tener una unica sesion iniciada y actualizada (si existe la utiliza y si no la crea).
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate() #usamos todos los procesadores locales: local[*]

In [127]:
spark

In [128]:
#Cargar el archivo
data = spark.read.format("csv").load("customer_churn.csv", inferSchema=True, header=True)

In [129]:
#Visualizar info
data.show()
data.columns

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [132]:
#Seleccionamos las columnas que nos interesan. Yo he cogido las m치s representativas (menos valores repetidos tienen y m치s relacion tienen con Churn)
mycols = data.select(['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Churn'])

In [133]:
#Eliminamos los null de las columnas y creamos el set de datos final
final_data = mycols.na.drop()

In [134]:
final_data.show()

+----+--------------+---------------+-----+---------+-----+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|
+----+--------------+---------------+-----+---------+-----+
|42.0|       11066.8|              0| 7.22|      8.0|    1|
|41.0|      11916.22|              0|  6.5|     11.0|    1|
|38.0|      12884.75|              0| 6.67|     12.0|    1|
|42.0|       8010.76|              0| 6.71|     10.0|    1|
|37.0|       9191.58|              0| 5.56|      9.0|    1|
|48.0|      10356.02|              0| 5.12|      8.0|    1|
|44.0|      11331.58|              1| 5.23|     11.0|    1|
|32.0|       9885.12|              1| 6.92|      9.0|    1|
|43.0|       14062.6|              1| 5.46|     11.0|    1|
|40.0|       8066.94|              1| 7.11|     11.0|    1|
|30.0|      11575.37|              1| 5.22|      8.0|    1|
|45.0|       8771.02|              1| 6.64|     11.0|    1|
|45.0|       8988.67|              1| 4.84|     11.0|    1|
|40.0|       8283.32|              1|  5

In [135]:
#Creamos el vectorAsembler con las columnas input (ya sin null) y la columna output que hemos llamado features
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], outputCol='features')

In [136]:
#Dividimos el conjunto de datos en dos, el primero para entrenar (70%) y el segundo para test (30%)
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [137]:
#Mostrar los conjuntos de datos
train_data.describe().show()
test_data.describe().show()

+-------+------------------+------------------+-------------------+------------------+-----------------+-------------------+
|summary|               Age|    Total_Purchase|    Account_Manager|             Years|        Num_Sites|              Churn|
+-------+------------------+------------------+-------------------+------------------+-----------------+-------------------+
|  count|               630|               630|                630|               630|              630|                630|
|   mean|41.768253968253966|10155.915571428563|0.46825396825396826| 5.272158730158733| 8.56031746031746|0.16031746031746033|
| stddev| 6.227045616141639| 2442.134139545334|0.49938766855589345|1.2697427002523671|1.805841051210868| 0.3671917588549127|
|    min|              22.0|             100.0|                  0|              1.62|              3.0|                  0|
|    max|              65.0|          18026.01|                  1|              8.97|             14.0|                  1|


In [138]:
#Creamos la LogisticRegresion con la columna features (conjunto de todas las columnas representativas) y la columna Churn (variable dependiente)
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol='Churn')

In [139]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[assembler, lr])

In [140]:
#Mediante la funci칩n fit se crea el model a partir del dataFrame de entrenamiento
model = pipeline.fit(train_data)

In [141]:
#Hacemos la prediccion y la mostramos
predictions = model.transform(test_data)
predictions.select('Churn', 'prediction').show()

+-----+----------+
|Churn|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       1.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows



In [142]:
#Evaluacion mediante BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')

In [143]:
#acc es el porcentaje de elementos que estar치n por debajo de la curva
acc = evaluator.evaluate(predictions)

In [144]:
#Resultado de la logistic regression
acc

0.7267984116723614