In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 28 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 50.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=615b64e1f877faac1f191fff65618a26e02344c9e2e0d462ebd1088f411f65c9
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
/content


In [170]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('customer').getOrCreate()

In [171]:
df = spark.read.csv('/content/customer_churn.csv', inferSchema=True, header=True)

In [172]:
df.printSchema()
df.show(10)

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      1191

In [173]:
print(f"Total Dataset: {df.count()}")
print("------------------")
print(f"Companies: {df.select('Company').distinct().count()}")
print(f"Locations: {df.select('Location').distinct().count()}")

Total Dataset: 900
------------------
Companies: 873
Locations: 900


In [174]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [175]:
data = df.select(['Age',
  'Total_Purchase',
  'Account_Manager',
  'Years',
  'Num_Sites',
  'Company',
  'Churn'])

In [178]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

companies_indexer = StringIndexer(inputCol='Company', outputCol='CompanyIndex')
indx_data = companies_indexer.fit(data).transform(data)
indx_data.show(5)

+----+--------------+---------------+-----+---------+--------------------+-----+------------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|             Company|Churn|CompanyIndex|
+----+--------------+---------------+-----+---------+--------------------+-----+------------+
|42.0|       11066.8|              0| 7.22|      8.0|          Harvey LLC|    1|       343.0|
|41.0|      11916.22|              0|  6.5|     11.0|          Wilson PLC|    1|         2.0|
|38.0|      12884.75|              0| 6.67|     12.0|Miller, Johnson a...|    1|       515.0|
|42.0|       8010.76|              0| 6.71|     10.0|           Smith Inc|    1|        14.0|
|37.0|       9191.58|              0| 5.56|      9.0|          Love-Jones|    1|       474.0|
+----+--------------+---------------+-----+---------+--------------------+-----+------------+
only showing top 5 rows



In [179]:
assembler = VectorAssembler(inputCols=['Age','Total_Purchase','Account_Manager','Years','Num_Sites','CompanyIndex'],
                            outputCol='features')

In [184]:
output = assembler.transform(indx_data)
final_data = output.select(['features','Churn'])
final_data.show(5)

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
|[38.0,12884.75,0....|    1|
|[42.0,8010.76,0.0...|    1|
|[37.0,9191.58,0.0...|    1|
+--------------------+-----+
only showing top 5 rows



In [185]:
train, test = final_data.randomSplit([0.7,0.3])

In [186]:
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(featuresCol='features', labelCol='Churn')

In [187]:
log_reg_model = log_reg.fit(train)

In [188]:

results = log_reg_model.evaluate(test)
results.predictions.show()



+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[28.0,8670.98,0.0...|    0|[7.33216709493068...|[0.99934627320308...|       0.0|
|[28.0,11245.38,0....|    0|[3.60529544378857...|[0.97353976016561...|       0.0|
|[29.0,13255.05,1....|    0|[4.02061497673755...|[0.98237431115662...|       0.0|
|[30.0,8874.83,0.0...|    0|[2.87079586838110...|[0.94638374593891...|       0.0|
|[30.0,10744.14,1....|    1|[1.40891568031500...|[0.80359486162076...|       0.0|
|[30.0,10960.52,1....|    0|[2.13369537210887...|[0.89413531228753...|       0.0|
|[31.0,8688.21,0.0...|    0|[6.29931351353686...|[0.99816580464647...|       0.0|
|[31.0,8829.83,1.0...|    0|[3.95531066113145...|[0.98120721527866...|       0.0|
|[31.0,10058.87,1....|    0|[4.01059499089660...|[0.98219997368064...|       0.0|
|[31.0,12264.68,

In [194]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')
auc = eval.evaluate(results.predictions) #AUC Curve
print(f"\nAUC: {auc}")




AUC: 0.7265963203463204


# Testing new_customers.csv data

In [195]:
new_data = spark.read.csv('/content/new_customers.csv', inferSchema=True, header=True)

In [197]:
final_lr_model = log_reg.fit(final_data)

In [198]:
new_data.printSchema()
new_data.show(5)

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-

In [201]:
indx_new_data = companies_indexer.fit(new_data).transform(new_data)
test_new = assembler.transform(indx_new_data)
test_new.show(5)

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+------------+--------------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|CompanyIndex|            features|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+------------+--------------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|         2.0|[37.0,9935.53,1.0...|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|         1.0|[23.0,7526.94,1.0...|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|         0.0|[65.0,100.0,1.0,1...|
|Megan Ferguson|32.0|        6487.

In [204]:
final_results = final_lr_model.transform(test_new)
final_results.select(['Company','prediction']).show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

