In [None]:
appname = "Logistic Regression - Titanic"

# Look into https://spark.apache.org/downloads.html for the latest version
spark_mirror = "https://mirrors.sonic.net/apache/spark"
spark_version = "3.3.1"
hadoop_version = "3"

# Install Java 8 (Spark does not work with newer Java versions)
! apt-get update
! apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download and extract Spark binary distribution
! rm -rf spark-{spark_version}-bin-hadoop{hadoop_version}.tgz spark-{spark_version}-bin-hadoop{hadoop_version}
! wget -q {spark_mirror}/spark-{spark_version}/spark-{spark_version}-bin-hadoop{hadoop_version}.tgz
! tar xzf spark-{spark_version}-bin-hadoop{hadoop_version}.tgz

# The only 2 environment variables needed to set up Java and Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/spark-{spark_version}-bin-hadoop{hadoop_version}"

# Set up the Spark environment based on the environment variable SPARK_HOME 
! pip install -q findspark
import findspark
findspark.init()

# Get the Spark session object (basic entry point for every operation)
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName(appname).master("local[*]").getOrCreate()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:14 http://security.ubuntu.com/ubuntu 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark import SparkFiles
spark.sparkContext.addFile('/content/drive/MyDrive/Colab Notebooks/customer_churn.csv')
churn = spark.read.option("header","true").csv('/content/drive/MyDrive/Colab Notebooks/customer_churn.csv')

churn.show(10)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

####Removing the features that are mostly unique to each row because none would inffluence the results

In [29]:
data = churn.drop('Names','Location', 'Company')

####Attribute selection for year within Onboard_date feature

In [None]:
from pyspark.sql.functions import *

data = data.withColumn('Onb_year', year(col('Onboard_date')))

data.select([count(when(isnan(x) | col(x).isNull(), x)).alias(x) for x in data.columns]
   ).show()
  
data.printSchema()

####Removing Onboard_date as it is redundant data now

In [17]:
data = data.drop('Onboard_date')

data.show(3)

+----+--------------+---------------+-----+---------+-----+--------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|Onb_year|
+----+--------------+---------------+-----+---------+-----+--------+
|42.0|       11066.8|              0| 7.22|      8.0|    1|    2013|
|41.0|      11916.22|              0|  6.5|     11.0|    1|    2013|
|38.0|      12884.75|              0| 6.67|     12.0|    1|    2016|
+----+--------------+---------------+-----+---------+-----+--------+
only showing top 3 rows



####Transforming every type of data to float so it can be used for further tasks.

In [18]:
for y in data.columns[0:len(data.columns)]:
  data=data.withColumn(y,col(y).cast("float"))

data.printSchema()
data.show(2)

root
 |-- Age: float (nullable = true)
 |-- Total_Purchase: float (nullable = true)
 |-- Account_Manager: float (nullable = true)
 |-- Years: float (nullable = true)
 |-- Num_Sites: float (nullable = true)
 |-- Churn: float (nullable = true)
 |-- Onb_year: float (nullable = true)

+----+--------------+---------------+-----+---------+-----+--------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|Onb_year|
+----+--------------+---------------+-----+---------+-----+--------+
|42.0|       11066.8|            0.0| 7.22|      8.0|  1.0|  2013.0|
|41.0|      11916.22|            0.0|  6.5|     11.0|  1.0|  2013.0|
+----+--------------+---------------+-----+---------+-----+--------+
only showing top 2 rows



####Dividing data into trains and test

In [19]:
train_data, test_data = data.randomSplit([0.67, 0.33])

####Setting the distinct functions to transform the data in order to make it suitable for the prediction algorithm

Using VectorAssembler to merge all the data into one column
Using 


In [20]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['Age','Total_Purchase','Account_Manager','Years','Num_Sites','Onb_year'],
                            outputCol='features')

Using StandardScaler to remove scale inffluence


In [21]:
from pyspark.ml.feature import StandardScaler, MinMaxScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

Using Logistic Regression to predict Churn as it's more efficient to predict binary values than linear regression.

In [22]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='Churn')

Using Pipeline to merge these steps

In [23]:
from pyspark.ml import Pipeline
#creating a pipeline to use the same steps as before to the train data.
pipeline = Pipeline(stages=[assembler, scaler, lr])

model = pipeline.fit(train_data)

In [24]:
predictions = model.transform(test_data)

predictions.show(10)

+----+--------------+---------------+-----+---------+-----+--------+--------------------+--------------------+--------------------+--------------------+----------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|Onb_year|            features|      scaledFeatures|       rawPrediction|         probability|prediction|
+----+--------------+---------------+-----+---------+-----+--------+--------------------+--------------------+--------------------+--------------------+----------+
|26.0|       8939.61|            0.0| 4.54|      7.0|  0.0|  2007.0|[26.0,8939.610351...|[4.23525664817445...|[5.31880353241330...|[0.99512526816624...|       0.0|
|28.0|       8670.98|            0.0| 3.99|      6.0|  0.0|  2006.0|[28.0,8670.980468...|[4.56104562111094...|[6.56983302511471...|[0.99859993144357...|       0.0|
|29.0|       8688.17|            1.0|  5.7|      9.0|  1.0|  2015.0|[29.0,8688.169921...|[4.72394010757919...|[2.40509852854169...|[0.91721526645361...|       0.0|
|29.0|      1020

####Evaluating the model

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')
area_uroc = evaluator.evaluate(predictions)
area_uroc

0.7734480431848854

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multieval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Churn')
#print(multieval.explainParams())

mod_accuracy = multieval.evaluate(predictions, {multieval.metricName: "accuracy"})
mod_f1 = multieval.evaluate(predictions, {multieval.metricName: "f1"})
mod_posrate = multieval.evaluate(predictions, {multieval.metricName: "weightedPrecision"})
mod_falserate = multieval.evaluate(predictions, {multieval.metricName: "weightedFalsePositiveRate"})

In [35]:

print("Accuracy:", mod_accuracy) #Same as weightedTruePositiveRate
print("WeightedFalsePositiveRate:", mod_falserate)
print("WeightedPrecision:", mod_posrate)
print("f1:", mod_f1)

Accuracy: 0.9016949152542373
WeightedFalsePositiveRate: 0.35479882888446673
WeightedPrecision: 0.8955153813648099
f1: 0.8964046474438072
