### A marketing agency has many customers that use their service to produce ads for the client/customer websites. They've noticed that they have quite a bit of churn in clients. They basically randomly assign account managers right now. Thus, predicting which customers will Churn(stop buying the service), which will help the agency to assign them an account manager.

### Initialize and create a spark session

In [1]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("Churn").getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://Varun-CK:4040
SparkContext available as 'sc' (version = 2.3.0, master = local[*], app id = local-1577616043944)
SparkSession available as 'spark'


2019-12-29 16:10:38 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2019-12-29 16:10:59 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2fe267d3


### Initialize Logger

In [2]:
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML for Logistic Regression

In [3]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StringIndexer,VectorAssembler,OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors


### Using Spark to read the customer churn data set

In [4]:
val data = spark.read.options(Map(("header","true"),("inferSchema","true"))).csv("customer_churn.csv")

data: org.apache.spark.sql.DataFrame = [Names: string, Age: double ... 8 more fields]


### Printing the first row of the dataframe

In [5]:
data.head(1)

res1: Array[org.apache.spark.sql.Row] = Array([Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40.0,10265 Elizabeth Mission Barkerburgh, AK 89518,Harvey LLC,1])


### Printing the schema of the dataframe

In [6]:
data.printSchema

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



### Show

In [7]:
data.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

### Count

In [8]:
data.count

res4: Long = 900


### Count by dropping duplicates

In [9]:
data.na.drop().count

res5: Long = 900


### Checking out whether the string columns "Names", "Location" and "Company"are useful or not (to check whether they are categorical columns or not)

In [11]:
data.groupBy("Names").count().count()

res7: Long = 899


In [12]:
data.groupBy("Location").count().count()

res8: Long = 900


In [13]:
data.groupBy("Company").count().count()

res9: Long = 873


### Ignoring the categorical columns since they are not useful and checking out timestamp column "Onboard_date"

In [14]:
data.select("Onboard_date").show(3)

+-------------------+
|       Onboard_date|
+-------------------+
|2013-08-30 07:00:40|
|2013-08-13 00:38:46|
|2016-06-29 06:20:07|
+-------------------+
only showing top 3 rows



### Checking out whether `Year` is useful or not in the column `Onboard_date`

In [16]:
data.groupBy(year($"Onboard_date")).count().count()

res12: Long = 11


### Creating a new column `Onboard_Year`

In [17]:
var filtered_data = data.withColumn("Onboard_Year",year($"Onboard_date"))

filtered_data: org.apache.spark.sql.DataFrame = [Names: string, Age: double ... 9 more fields]


In [18]:
filtered_data.printSchema

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- Onboard_Year: integer (nullable = true)



In [19]:
filtered_data = filtered_data.drop("Names","Location","Company","Onboard_date")

filtered_data: org.apache.spark.sql.DataFrame = [Age: double, Total_Purchase: double ... 5 more fields]


In [20]:
filtered_data.show(5)

+----+--------------+---------------+-----+---------+-----+------------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|Onboard_Year|
+----+--------------+---------------+-----+---------+-----+------------+
|42.0|       11066.8|              0| 7.22|      8.0|    1|        2013|
|41.0|      11916.22|              0|  6.5|     11.0|    1|        2013|
|38.0|      12884.75|              0| 6.67|     12.0|    1|        2016|
|42.0|       8010.76|              0| 6.71|     10.0|    1|        2014|
|37.0|       9191.58|              0| 5.56|      9.0|    1|        2016|
+----+--------------+---------------+-----+---------+-----+------------+
only showing top 5 rows



In [21]:
filtered_data.printSchema

root
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- Onboard_Year: integer (nullable = true)



### Assembling all the features to a single vector column "features"

In [22]:
val assembler = new VectorAssembler().setInputCols(Array("Age","Total_Purchase","Account_Manager","Years","Num_Sites"
                                                         ,"Onboard_Year")).setOutputCol("features")

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_e9e72f5486d1


In [23]:
val output = assembler.transform(filtered_data)

output: org.apache.spark.sql.DataFrame = [Age: double, Total_Purchase: double ... 6 more fields]


In [24]:
val final_data = output.select("Churn","features")

final_data: org.apache.spark.sql.DataFrame = [Churn: int, features: vector]


In [25]:
final_data.show(5,false)

+-----+------------------------------------+
|Churn|features                            |
+-----+------------------------------------+
|1    |[42.0,11066.8,0.0,7.22,8.0,2013.0]  |
|1    |[41.0,11916.22,0.0,6.5,11.0,2013.0] |
|1    |[38.0,12884.75,0.0,6.67,12.0,2016.0]|
|1    |[42.0,8010.76,0.0,6.71,10.0,2014.0] |
|1    |[37.0,9191.58,0.0,5.56,9.0,2016.0]  |
+-----+------------------------------------+
only showing top 5 rows



### Splitting the resultant data into training data and testing data,

<code>
<b>Training data is to train the model</b>
<b>Testing data is to test the builted model</b>
</code>

In [28]:
val Array(train_data,test_data) = final_data.randomSplit(Array(0.7,0.3))

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Churn: int, features: vector]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Churn: int, features: vector]


In [29]:
final_data.describe().show()

+-------+-------------------+
|summary|              Churn|
+-------+-------------------+
|  count|                900|
|   mean|0.16666666666666666|
| stddev| 0.3728852122772358|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [30]:
train_data.describe().show()

+-------+-------------------+
|summary|              Churn|
+-------+-------------------+
|  count|                633|
|   mean|0.16587677725118483|
| stddev| 0.3722641518010831|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [31]:
test_data.describe().show()

+-------+-------------------+
|summary|              Churn|
+-------+-------------------+
|  count|                267|
|   mean|0.16853932584269662|
| stddev| 0.3750475174760624|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



### Creating a logistic regression model object

In [32]:
val lor = new LogisticRegression().setLabelCol("Churn").setFeaturesCol("features")

lor: org.apache.spark.ml.classification.LogisticRegression = logreg_deae5c1b8799


### Creating a logistic regression model and fitting the training data to it

In [34]:
val churnModel = lor.fit(train_data)

churnModel: org.apache.spark.ml.classification.LogisticRegressionModel = logreg_deae5c1b8799


### Getting Results on Test Set

In [35]:
val results = churnModel.transform(test_data)

results: org.apache.spark.sql.DataFrame = [Churn: int, features: vector ... 3 more fields]


In [36]:
results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|Churn|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|[25.0,9672.03,0.0...|[4.85959346527196...|[0.99230602093568...|       0.0|
|    0|[27.0,8628.8,1.0,...|[5.66878959837931...|[0.99655983448822...|       0.0|
|    0|[28.0,11245.38,0....|[4.01415915798478...|[0.98226217969890...|       0.0|
|    0|[29.0,11274.46,1....|[4.55848534916637...|[0.98963073102629...|       0.0|
|    0|[29.0,13240.01,1....|[7.23236132952849...|[0.99927771003092...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## MODEL EVALUATION

### 1) Converting the data to rdd and evaluating using MulticlassMetrics to print the confusion matrix

In [37]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

import org.apache.spark.mllib.evaluation.MulticlassMetrics


In [38]:
val clean_result = results.withColumn("Churn",results("Churn").cast("double"))

clean_result: org.apache.spark.sql.DataFrame = [Churn: double, features: vector ... 3 more fields]


In [40]:
clean_result.select("Churn","prediction").show(5)

+-----+----------+
|Churn|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 5 rows



In [41]:
val predictionAndLabel = clean_result.select("Churn","prediction").as[(Double,Double)].rdd

predictionAndLabel: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[291] at rdd at <console>:43


In [42]:
val metrics = new MulticlassMetrics(predictionAndLabel)

metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@333a1d3d


#### Printing the confusion matrix

In [43]:
println(metrics.confusionMatrix)

211.0  26.0  
11.0   19.0  


#### Printing the Accuracy

In [44]:
println(metrics.accuracy)

0.8614232209737828


#### Recall

In [45]:
println(metrics.recall)

0.8614232209737828


#### precision

In [46]:
println(metrics.precision)

0.8614232209737828


### 2) Evaluating using BinaryClassificationEvaluator

In [47]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator


In [48]:
val bin_eval = new BinaryClassificationEvaluator().setRawPredictionCol("rawPrediction").setLabelCol("Churn")

bin_eval: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_d1e4146087c9


#### Calculating Area Under ROC

In [49]:
val AOC =bin_eval.evaluate(results)

AOC: Double = 0.846346346346347


### 3) Evaluating using MulticlassClassificationEvaluator

In [50]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


In [53]:
val multi_eval = new MulticlassClassificationEvaluator().setPredictionCol("prediction").setLabelCol("Churn")

multi_eval: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_57ee3f857af0


In [54]:
val AOC_2 = multi_eval.evaluate(results)

AOC_2: Double = 0.8498298695258378


#### Printing Area Under ROC

In [55]:
println(AOC_2)

0.8498298695258378


### Stopping the created spark session

In [56]:
spark.stop()

## Thank You!