In [1]:
import findspark
findspark.init('/home/asif/spark-2.1.0-bin-hadoop2.7')
from pyspark.sql import SparkSession # to load dataframe we need this
spark = SparkSession.builder.appName('LogRegChurnDetais').getOrCreate()
from pyspark.ml.classification import LogisticRegression

In [2]:
data = spark.read.csv('ChurnData.txt', inferSchema=True, header=True,sep=',' )

In [3]:
data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account Length: integer (nullable = true)
 |-- Area Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Int'l Plan: string (nullable = true)
 |-- VMail Plan: string (nullable = true)
 |-- VMail Message: integer (nullable = true)
 |-- Day Mins: double (nullable = true)
 |-- Day Calls: integer (nullable = true)
 |-- Day Charge: double (nullable = true)
 |-- Eve Mins: double (nullable = true)
 |-- Eve Calls: integer (nullable = true)
 |-- Eve Charge: double (nullable = true)
 |-- Night Mins: double (nullable = true)
 |-- Night Calls: integer (nullable = true)
 |-- Night Charge: double (nullable = true)
 |-- Intl Mins: double (nullable = true)
 |-- Intl Calls: integer (nullable = true)
 |-- Intl Charge: double (nullable = true)
 |-- CustServ Calls: integer (nullable = true)
 |-- Churn?: string (nullable = true)



In [4]:
print('number of columns = ',len(data.columns))# how many columns
data.columns    # list of names of the column 

number of columns =  21


['State',
 'Account Length',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?']

In [5]:

import re
data = data.toDF(*(re.sub(r'[\.\s]+', '_', c) for c in data.columns))
data.columns

['State',
 'Account_Length',
 'Area_Code',
 'Phone',
 "Int'l_Plan",
 'VMail_Plan',
 'VMail_Message',
 'Day_Mins',
 'Day_Calls',
 'Day_Charge',
 'Eve_Mins',
 'Eve_Calls',
 'Eve_Charge',
 'Night_Mins',
 'Night_Calls',
 'Night_Charge',
 'Intl_Mins',
 'Intl_Calls',
 'Intl_Charge',
 'CustServ_Calls',
 'Churn?']

In [6]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
func =  udf (lambda x: 0 if x=='False.' else 1, IntegerType())
data = data.withColumn('new_Churn', func(col('Churn?')))

data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account_Length: integer (nullable = true)
 |-- Area_Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Int'l_Plan: string (nullable = true)
 |-- VMail_Plan: string (nullable = true)
 |-- VMail_Message: integer (nullable = true)
 |-- Day_Mins: double (nullable = true)
 |-- Day_Calls: integer (nullable = true)
 |-- Day_Charge: double (nullable = true)
 |-- Eve_Mins: double (nullable = true)
 |-- Eve_Calls: integer (nullable = true)
 |-- Eve_Charge: double (nullable = true)
 |-- Night_Mins: double (nullable = true)
 |-- Night_Calls: integer (nullable = true)
 |-- Night_Charge: double (nullable = true)
 |-- Intl_Mins: double (nullable = true)
 |-- Intl_Calls: integer (nullable = true)
 |-- Intl_Charge: double (nullable = true)
 |-- CustServ_Calls: integer (nullable = true)
 |-- Churn?: string (nullable = true)
 |-- new_Churn: integer (nullable = true)



In [7]:
data.head(1)

[Row(State='KS', Account_Length=128, Area_Code=415, Phone='382-4657', Int'l_Plan='no', VMail_Plan='yes', VMail_Message=25, Day_Mins=265.1, Day_Calls=110, Day_Charge=45.07, Eve_Mins=197.4, Eve_Calls=99, Eve_Charge=16.78, Night_Mins=244.7, Night_Calls=91, Night_Charge=11.01, Intl_Mins=10.0, Intl_Calls=3, Intl_Charge=2.7, CustServ_Calls=1, Churn?='False.', new_Churn=0)]

In [8]:
func =  udf (lambda x: 0 if x=='no' else 1, IntegerType())
data = data.withColumn('new_IntlPlan', func(col('Int\'l_Plan')))

data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account_Length: integer (nullable = true)
 |-- Area_Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Int'l_Plan: string (nullable = true)
 |-- VMail_Plan: string (nullable = true)
 |-- VMail_Message: integer (nullable = true)
 |-- Day_Mins: double (nullable = true)
 |-- Day_Calls: integer (nullable = true)
 |-- Day_Charge: double (nullable = true)
 |-- Eve_Mins: double (nullable = true)
 |-- Eve_Calls: integer (nullable = true)
 |-- Eve_Charge: double (nullable = true)
 |-- Night_Mins: double (nullable = true)
 |-- Night_Calls: integer (nullable = true)
 |-- Night_Charge: double (nullable = true)
 |-- Intl_Mins: double (nullable = true)
 |-- Intl_Calls: integer (nullable = true)
 |-- Intl_Charge: double (nullable = true)
 |-- CustServ_Calls: integer (nullable = true)
 |-- Churn?: string (nullable = true)
 |-- new_Churn: integer (nullable = true)
 |-- new_IntlPlan: integer (nullable = true)



In [9]:
data.describe("new_IntlPlan").show()

+-------+------------------+
|summary|      new_IntlPlan|
+-------+------------------+
|  count|              3333|
|   mean|0.0969096909690969|
| stddev|0.2958791454844147|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [10]:
data = data.drop('Churn?','Int\'l_Plan') 

In [11]:
func =  udf (lambda x: 0 if x=='no' else 1, IntegerType())
data = data.withColumn('new_VMailPlan', func(col('VMail_Plan')))
data = data.drop('VMail Plan') 
data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account_Length: integer (nullable = true)
 |-- Area_Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- VMail_Plan: string (nullable = true)
 |-- VMail_Message: integer (nullable = true)
 |-- Day_Mins: double (nullable = true)
 |-- Day_Calls: integer (nullable = true)
 |-- Day_Charge: double (nullable = true)
 |-- Eve_Mins: double (nullable = true)
 |-- Eve_Calls: integer (nullable = true)
 |-- Eve_Charge: double (nullable = true)
 |-- Night_Mins: double (nullable = true)
 |-- Night_Calls: integer (nullable = true)
 |-- Night_Charge: double (nullable = true)
 |-- Intl_Mins: double (nullable = true)
 |-- Intl_Calls: integer (nullable = true)
 |-- Intl_Charge: double (nullable = true)
 |-- CustServ_Calls: integer (nullable = true)
 |-- new_Churn: integer (nullable = true)
 |-- new_IntlPlan: integer (nullable = true)
 |-- new_VMailPlan: integer (nullable = true)



In [12]:
data.describe("new_VMailPlan").show()

+-------+-------------------+
|summary|      new_VMailPlan|
+-------+-------------------+
|  count|               3333|
|   mean|0.27662766276627665|
| stddev| 0.4473978703800645|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [13]:
data.columns

['State',
 'Account_Length',
 'Area_Code',
 'Phone',
 'VMail_Plan',
 'VMail_Message',
 'Day_Mins',
 'Day_Calls',
 'Day_Charge',
 'Eve_Mins',
 'Eve_Calls',
 'Eve_Charge',
 'Night_Mins',
 'Night_Calls',
 'Night_Charge',
 'Intl_Mins',
 'Intl_Calls',
 'Intl_Charge',
 'CustServ_Calls',
 'new_Churn',
 'new_IntlPlan',
 'new_VMailPlan']

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols =['Account_Length','Area_Code','Day_Mins','Day_Calls','Day_Charge','Eve_Mins',
                                      'Eve_Calls','Eve_Charge','Night_Mins','Night_Calls','Night_Charge',
                                      'Intl_Mins','Intl_Calls','Intl_Charge','CustServ_Calls','new_IntlPlan','new_VMailPlan'],
                            outputCol = 'features')
# VectorAssembler convert all the inputCols to a dense vector fro each row and output it as 'features'
# we have to do this to satisfy the format which spark mllib deals with

In [15]:
dataFromAssembler = assembler.transform(data)
dataFromAssembler.head(1)

[Row(State='KS', Account_Length=128, Area_Code=415, Phone='382-4657', VMail_Plan='yes', VMail_Message=25, Day_Mins=265.1, Day_Calls=110, Day_Charge=45.07, Eve_Mins=197.4, Eve_Calls=99, Eve_Charge=16.78, Night_Mins=244.7, Night_Calls=91, Night_Charge=11.01, Intl_Mins=10.0, Intl_Calls=3, Intl_Charge=2.7, CustServ_Calls=1, new_Churn=0, new_IntlPlan=0, new_VMailPlan=1, features=DenseVector([128.0, 415.0, 265.1, 110.0, 45.07, 197.4, 99.0, 16.78, 244.7, 91.0, 11.01, 10.0, 3.0, 2.7, 1.0, 0.0, 1.0]))]

In [16]:
final_data = dataFromAssembler.select('features','new_Churn')
final_data.head(1)

[Row(features=DenseVector([128.0, 415.0, 265.1, 110.0, 45.07, 197.4, 99.0, 16.78, 244.7, 91.0, 11.01, 10.0, 3.0, 2.7, 1.0, 0.0, 1.0]), new_Churn=0)]

In [17]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [18]:
final_data.describe().show()

+-------+-------------------+
|summary|          new_Churn|
+-------+-------------------+
|  count|               3333|
|   mean|0.14491449144914492|
| stddev|  0.352067423624126|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [19]:
log_reg_model = LogisticRegression(labelCol='new_Churn')


In [20]:
trained_model = log_reg_model.fit(train_data)


In [21]:
training_sum = trained_model.summary

In [22]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|          new_Churn|         prediction|
+-------+-------------------+-------------------+
|  count|               2337|               2337|
|   mean|0.14505776636713735|0.05648267008985879|
| stddev| 0.3522344387328226|0.23090082630295647|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [23]:
predict_and_labels = trained_model.evaluate(test_data)

In [24]:
predict_and_labels.predictions.show()

+--------------------+---------+--------------------+--------------------+----------+
|            features|new_Churn|       rawPrediction|         probability|prediction|
+--------------------+---------+--------------------+--------------------+----------+
|[1.0,415.0,144.8,...|        0|[3.20708540079934...|[0.96110004447509...|       0.0|
|[2.0,415.0,132.1,...|        1|[1.01183195591423...|[0.73337851288136...|       0.0|
|[3.0,408.0,139.0,...|        0|[1.24530728838788...|[0.77648647337131...|       0.0|
|[3.0,510.0,161.0,...|        0|[1.09786025059777...|[0.74985896635279...|       0.0|
|[10.0,415.0,222.2...|        0|[1.99523086253162...|[0.88029543907341...|       0.0|
|[11.0,408.0,131.5...|        0|[3.03243841536968...|[0.95401825845598...|       0.0|
|[11.0,415.0,190.6...|        0|[2.56912574398327...|[0.92884793847846...|       0.0|
|[13.0,415.0,143.1...|        0|[2.61447162429984...|[0.93178716138031...|       0.0|
|[13.0,415.0,193.2...|        0|[0.86345833678629...|[

In [25]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator, MulticlassClassificationEvaluator)
bin_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='new_Churn')
accuracy = bin_eval.evaluate(predict_and_labels.predictions)
accuracy

0.5766823161189358