## Importing the libraries

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (reusing an existing one if
# present), registered under the application name 'Churn_Data'.
spark = (
    SparkSession.builder
    .appName('Churn_Data')
    .getOrCreate()
)

## Preparing the data

### Importing the data

In [3]:
# Load the churn CSV: column names come from the header row and column
# types are inferred from the data.
df = spark.read.csv(
    path='customer_churn.csv',
    header=True,
    inferSchema=True,
)
# Display the resulting column names and types.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [4]:
# Print every field of the first record on its own line to get a feel
# for what the raw values look like.
first_row = df.head(1)[0]
for field_value in first_row:
    print(field_value)

Cameron Williams
42.0
11066.8
0
7.22
8.0
2013-08-30 07:00:40
10265 Elizabeth Mission Barkerburgh, AK 89518
Harvey LLC
1


In [5]:
# Count how many rows each company contributes, then display the result.
company_counts = df.groupBy('Company').count()
company_counts.show()

+--------------------+-----+
|             Company|count|
+--------------------+-----+
|Miller, Johnson a...|    1|
|Hunter, Reyes and...|    1|
|          Obrien PLC|    1|
|            Soto PLC|    2|
|            Todd LLC|    1|
|Smith, Marshall a...|    1|
|           Smith PLC|    1|
|          Hall Group|    1|
|Freeman, Lam and ...|    1|
|       Smith-Carroll|    1|
|Hall, Hernandez a...|    1|
|          Cannon Inc|    1|
|        White-Dennis|    1|
|Wilson, Collins a...|    1|
|Jennings, Gates a...|    1|
|     Campbell-Willis|    1|
|    Martinez-Roberts|    1|
|        Robinson PLC|    1|
|          Barton Inc|    1|
|Hernandez, Middle...|    1|
+--------------------+-----+
only showing top 20 rows



In [6]:
# List the column names of the loaded frame; the feature/label columns
# used for modelling are picked from this list.
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

### Selecting Columns

In [7]:
# Numeric predictor columns followed by the label column ('Churn') last.
cols = ['Age', 'Total_Purchase', 'Years', 'Num_Sites', 'Churn']
# Keep only those columns and discard any row containing a null.
data = df.select(*cols).dropna()

In [8]:
from pyspark.ml.feature import VectorAssembler

# Every selected column except the label is a model input.
feature_cols = [c for c in cols if c != 'Churn']

# `assembler` is the transformer itself (not its output), so the name
# matches what it holds.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Build the input from `df` rather than from `data`: this cell overwrites
# `data`, so reading `data` here would make a second run fail (its columns
# would already be ['features', 'Churn'] and the output column would clash).
data = (
    assembler
    .transform(df.select(cols).na.drop())
    .select('features', 'Churn')
)

In [9]:
data.show(5)

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[42.0,11066.8,7.2...|    1|
|[41.0,11916.22,6....|    1|
|[38.0,12884.75,6....|    1|
|[42.0,8010.76,6.7...|    1|
|[37.0,9191.58,5.5...|    1|
+--------------------+-----+
only showing top 5 rows



### Splitting the data

In [10]:
# 70/30 train/test split. A fixed seed is passed so the split (and the
# metric computed from it) does not change from one run to the next.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

## Fitting and predicting with Logistic Regression

In [11]:
from pyspark.ml.classification import LogisticRegression

# Configure the estimator: assembled feature vector in, 'Churn' as label.
log_reg = LogisticRegression(featuresCol='features', labelCol='Churn')
# Fit on the training split only.
model = log_reg.fit(train_data)
# Score the held-out split; `results` carries the evaluation summary.
results = model.evaluate(test_data)

In [12]:
# Report the test-set area under the ROC curve, rounded to three decimals.
area_under_roc = results.areaUnderROC
print('Area Under ROC score: {:.3f}'.format(area_under_roc))

Area Under ROC score: 0.863
