In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# Read the historical churn data; the first line is the header and column
# types are inferred from the values.
df = (
    spark.read
    .format('csv')
    .options(header='True', inferSchema='True')
    .load('/content/drive/MyDrive/Colab Notebooks/Churn/customer_churn.csv')
)

In [None]:
# Print the inferred schema (column name, type, nullability) to check what inferSchema produced.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [None]:
# Summary statistics (count, mean, stddev, ...) for every column of the raw data.
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|       Onboard_date|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|               null|                null|                null|0.16666666666666666|
| stddev| 

In [None]:
# Peek at the first record; head(1) returns a list containing one Row.
df.head(1)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)]

In [None]:
# List the column names to choose the model inputs from.
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the numeric predictor columns into the single vector column 'features'.
assembler = VectorAssembler(
    inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
    outputCol='features',
)

In [None]:
# Apply the assembler: `output` is `df` plus the assembled 'features' vector column.
output = assembler.transform(df)

In [None]:
# Keep only what the model needs: the assembled feature vector and the label.
# Spark resolves column names case-insensitively by default, so 'churn' picks the
# CSV's 'Churn' column; the lower-case spelling is the one later cells refer to.
final_data = output.select(['features','churn'])
# 70/30 train/test split. The fixed seed makes the split, and every metric
# computed from it, reproducible on re-run.
train,test = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Summary statistics of the training split; the mean of 'churn' is the share of churned customers in it.
train.describe().show()

+-------+-------------------+
|summary|              churn|
+-------+-------------------+
|  count|                618|
|   mean|0.18608414239482202|
| stddev| 0.3894898038886487|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled feature vector, with 'churn' as the label.
# 'features' is the estimator's default features column; it is spelled out here for clarity.
lr = LogisticRegression(featuresCol='features', labelCol='churn')

# Fit on the training split only.
model = lr.fit(train)

In [None]:
# Evaluate the fitted model on the held-out test split; `result` exposes the
# predictions and the metrics inspected in the cells below.
result = model.evaluate(test)

In [None]:
# Per-row test output: true label next to raw score, class probabilities and predicted label.
result.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[25.0,9672.03,0.0...|    0|[4.68312517869852...|[0.99083471882980...|       0.0|
|[26.0,8939.61,0.0...|    0|[6.54369505911062...|[0.99856290732991...|       0.0|
|[27.0,8628.8,1.0,...|    0|[5.54032372839861...|[0.99609009147577...|       0.0|
|[29.0,9378.24,0.0...|    0|[4.82315411783388...|[0.99202276438602...|       0.0|
|[29.0,12711.15,0....|    0|[5.51355168789103...|[0.99598442747195...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.49207203615925...|[0.97046135098869...|       0.0|
|[30.0,10183.98,1....|    0|[2.69672000680170...|[0.93683282138731...|       0.0|
|[30.0,10744.14,1....|    1|[1.57176800510430...|[0.82803550532607...|       0.0|
|[30.0,10960.52,1....|    0|[2.20394254803101...|[0.90060299473433...|       0.0|
|[30.0,12788.37,

In [None]:
# Area under the ROC curve on the test split.
result.areaUnderROC

0.8581839213418176

In [None]:
# Compare the distribution of true labels ('churn') with that of predicted labels on the test split.
result.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                282|                282|
|   mean|0.12411347517730496|0.10283687943262411|
| stddev| 0.3302971183129155| 0.3042857016746074|
|    min|                  0|                0.0|
|    max|                  1|                1.0|
+-------+-------------------+-------------------+



In [None]:
# Actual vs. predicted churn, side by side (show() prints the first 20 rows).
result.predictions.select('churn','prediction').show()

+-----+----------+
|churn|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows



In [None]:
# Read the customers to score, using the same CSV settings as the training data
# (first line is the header, column types inferred).
new_customers = (
    spark.read
    .format('csv')
    .options(header='True', inferSchema='True')
    .load('/content/drive/MyDrive/Colab Notebooks/Churn/new_customers.csv')
)

In [None]:
# Preview the customers to be scored.
new_customers.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|  

In [None]:
# Build the 'features' vector for the new customers with the same assembler used
# for training, then score them with the fitted model.
# NOTE: `results` (new-customer predictions) is a different object from `result`
# (the test-set evaluation summary created earlier).
results = model.transform(assembler.transform(new_customers))

In [None]:
# Predicted label for each new customer, shown with name and company.
results.select('Names', 'Company', 'prediction').show()

+--------------+----------------+----------+
|         Names|         Company|prediction|
+--------------+----------------+----------+
| Andrew Mccall|        King Ltd|       0.0|
|Michele Wright|   Cannon-Benson|       1.0|
|  Jeremy Chang|Barron-Robertson|       1.0|
|Megan Ferguson|   Sexton-Golden|       1.0|
|  Taylor Young|        Wood LLC|       0.0|
| Jessica Drake|   Parks-Robbins|       1.0|
+--------------+----------------+----------+

