## [Multilayer Perceptron Classifier](https://spark.apache.org/docs/2.1.0/ml-classification-regression.html#multilayer-perceptron-classifier)

In [1]:
from pyspark.sql import SparkSession  # entry point for the DataFrame API

# Reuse an existing session if one is running, otherwise create it.
spark = (
    SparkSession.builder
    .appName('MultilayerPerceptronClassifier')
    .getOrCreate()
)

# Read the sample multiclass dataset using the "libsvm" data source.
libsvm_reader = spark.read.format("libsvm")
data = libsvm_reader.load("sample_multiclass_classification_data.txt")


In [2]:
# Display the first row (returned as a one-element list of Row objects).
data.head(1)

[Row(label=1.0, features=SparseVector(4, {0: -0.2222, 1: 0.5, 2: -0.7627, 3: -0.8333}))]

In [3]:
# 60/40 train/test split; the explicit seed keeps the split repeatable.
train, test = data.randomSplit(weights=[0.6, 0.4], seed=1234)


In [4]:
# Number of rows that ended up in the training split.
train.count()

99

In [5]:
# Network topology, listed from input to output.
layers = [
    4,  # input layer: one unit per feature
    5,  # first intermediate layer
    4,  # second intermediate layer
    3,  # output layer: one unit per class
]

In [6]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Configure the perceptron trainer with the topology defined in `layers`.
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

# Fit the network on the training split only.
model = trainer.fit(train)

In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out rows, then keep just the prediction and label columns.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")

# Measure accuracy on those predictions and report it.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = {}".format(test_accuracy))

Test set accuracy = 0.9019607843137255


## Realistic data

In [8]:
# Rebind `data` to the churn dataset: the first line supplies the column
# names and column types are inferred from the file contents.
data = spark.read.csv(
    'ChurnData.txt',
    header=True,
    inferSchema=True,
)

In [9]:
# Report how many columns were loaded, then display their names.
column_names = data.columns
print('number of columns = ', len(column_names))
column_names

number of columns =  21


['State',
 'Account Length',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?']

In [10]:
# Rename "Int'l Plan" to "IntlPlan" (no apostrophe, no space), then show
# the resulting schema.
data = data.withColumnRenamed(existing="Int'l Plan", new="IntlPlan")
data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account Length: integer (nullable = true)
 |-- Area Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- IntlPlan: string (nullable = true)
 |-- VMail Plan: string (nullable = true)
 |-- VMail Message: integer (nullable = true)
 |-- Day Mins: double (nullable = true)
 |-- Day Calls: integer (nullable = true)
 |-- Day Charge: double (nullable = true)
 |-- Eve Mins: double (nullable = true)
 |-- Eve Calls: integer (nullable = true)
 |-- Eve Charge: double (nullable = true)
 |-- Night Mins: double (nullable = true)
 |-- Night Calls: integer (nullable = true)
 |-- Night Charge: double (nullable = true)
 |-- Intl Mins: double (nullable = true)
 |-- Intl Calls: integer (nullable = true)
 |-- Intl Charge: double (nullable = true)
 |-- CustServ Calls: integer (nullable = true)
 |-- Churn?: string (nullable = true)



In [11]:
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import IntegerType

# Build the integer target column: 0 when 'Churn?' is exactly 'False.',
# 1 for every other value (a null compares as "not equal", so it also maps
# to 1, the same result the previous Python lambda produced).
# A native column expression is used instead of a Python UDF so the
# comparison is evaluated by Spark itself rather than by shipping every row
# to a Python worker.
data = data.withColumn(
    'label',
    when(col('Churn?') == 'False.', 0).otherwise(1).cast(IntegerType()),
)

data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account Length: integer (nullable = true)
 |-- Area Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- IntlPlan: string (nullable = true)
 |-- VMail Plan: string (nullable = true)
 |-- VMail Message: integer (nullable = true)
 |-- Day Mins: double (nullable = true)
 |-- Day Calls: integer (nullable = true)
 |-- Day Charge: double (nullable = true)
 |-- Eve Mins: double (nullable = true)
 |-- Eve Calls: integer (nullable = true)
 |-- Eve Charge: double (nullable = true)
 |-- Night Mins: double (nullable = true)
 |-- Night Calls: integer (nullable = true)
 |-- Night Charge: double (nullable = true)
 |-- Intl Mins: double (nullable = true)
 |-- Intl Calls: integer (nullable = true)
 |-- Intl Charge: double (nullable = true)
 |-- CustServ Calls: integer (nullable = true)
 |-- Churn?: string (nullable = true)
 |-- label: integer (nullable = true)



In [12]:
from pyspark.sql.functions import when

# Integer-encode the international-plan flag: 0 when the value is exactly
# 'no', 1 for anything else (including null, as with the previous lambda).
# Expressed as a native column expression rather than a Python UDF so the
# comparison is evaluated by Spark itself, without a per-row Python call.
data = data.withColumn(
    'new_IntlPlan',
    when(col('IntlPlan') == 'no', 0).otherwise(1).cast(IntegerType()),
)

data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account Length: integer (nullable = true)
 |-- Area Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- IntlPlan: string (nullable = true)
 |-- VMail Plan: string (nullable = true)
 |-- VMail Message: integer (nullable = true)
 |-- Day Mins: double (nullable = true)
 |-- Day Calls: integer (nullable = true)
 |-- Day Charge: double (nullable = true)
 |-- Eve Mins: double (nullable = true)
 |-- Eve Calls: integer (nullable = true)
 |-- Eve Charge: double (nullable = true)
 |-- Night Mins: double (nullable = true)
 |-- Night Calls: integer (nullable = true)
 |-- Night Charge: double (nullable = true)
 |-- Intl Mins: double (nullable = true)
 |-- Intl Calls: integer (nullable = true)
 |-- Intl Charge: double (nullable = true)
 |-- CustServ Calls: integer (nullable = true)
 |-- Churn?: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- new_IntlPlan: integer (nullable = true)



In [13]:
# 'Churn?' and 'IntlPlan' now have integer-encoded counterparts
# ('label' and 'new_IntlPlan'), so drop the original string columns.
encoded_source_cols = ('Churn?', 'IntlPlan')
data = data.drop(*encoded_source_cols)

In [14]:
from pyspark.sql.functions import when

# Integer-encode the voicemail-plan flag: 0 when the value is exactly 'no',
# 1 for anything else (including null, as with the previous lambda).
# Expressed as a native column expression rather than a Python UDF so the
# comparison is evaluated by Spark itself, without a per-row Python call.
data = data.withColumn(
    'new_VMailPlan',
    when(col('VMail Plan') == 'no', 0).otherwise(1).cast(IntegerType()),
)
# The original string column is no longer needed once it has been encoded.
data = data.drop('VMail Plan')
data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account Length: integer (nullable = true)
 |-- Area Code: integer (nullable = true)
 |-- Phone: string (nullable = true)
 |-- VMail Message: integer (nullable = true)
 |-- Day Mins: double (nullable = true)
 |-- Day Calls: integer (nullable = true)
 |-- Day Charge: double (nullable = true)
 |-- Eve Mins: double (nullable = true)
 |-- Eve Calls: integer (nullable = true)
 |-- Eve Charge: double (nullable = true)
 |-- Night Mins: double (nullable = true)
 |-- Night Calls: integer (nullable = true)
 |-- Night Charge: double (nullable = true)
 |-- Intl Mins: double (nullable = true)
 |-- Intl Calls: integer (nullable = true)
 |-- Intl Charge: double (nullable = true)
 |-- CustServ Calls: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- new_IntlPlan: integer (nullable = true)
 |-- new_VMailPlan: integer (nullable = true)



In [15]:
import re

# Replace every run of dots and/or whitespace in a column name with a single
# underscore, then rebuild the DataFrame with the cleaned names.
sanitized_names = [re.sub(r'[\.\s]+', '_', name) for name in data.columns]
data = data.toDF(*sanitized_names)
data.columns

['State',
 'Account_Length',
 'Area_Code',
 'Phone',
 'VMail_Message',
 'Day_Mins',
 'Day_Calls',
 'Day_Charge',
 'Eve_Mins',
 'Eve_Calls',
 'Eve_Charge',
 'Night_Mins',
 'Night_Calls',
 'Night_Charge',
 'Intl_Mins',
 'Intl_Calls',
 'Intl_Charge',
 'CustServ_Calls',
 'label',
 'new_IntlPlan',
 'new_VMailPlan']

In [16]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns packed into the model's input vector (17 in total).
# NOTE(review): 'VMail_Message' is an integer column in the schema but is not
# listed here - confirm whether leaving it out is intentional.
feature_cols = [
    'Account_Length', 'Area_Code',
    'Day_Mins', 'Day_Calls', 'Day_Charge',
    'Eve_Mins', 'Eve_Calls', 'Eve_Charge',
    'Night_Mins', 'Night_Calls', 'Night_Charge',
    'Intl_Mins', 'Intl_Calls', 'Intl_Charge',
    'CustServ_Calls',
    'new_IntlPlan', 'new_VMailPlan',
]

# VectorAssembler combines the listed columns into one vector per row and
# writes it to 'features', the single-vector-column layout the Spark ML
# estimators used in this notebook are fitted on.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [17]:
# Append the assembled 'features' vector column to every row of `data`.
dataFromAssembler = assembler.transform(data)
# Display the first row to check the new column alongside the originals.
dataFromAssembler.head(1)

[Row(State='KS', Account_Length=128, Area_Code=415, Phone='382-4657', VMail_Message=25, Day_Mins=265.1, Day_Calls=110, Day_Charge=45.07, Eve_Mins=197.4, Eve_Calls=99, Eve_Charge=16.78, Night_Mins=244.7, Night_Calls=91, Night_Charge=11.01, Intl_Mins=10.0, Intl_Calls=3, Intl_Charge=2.7, CustServ_Calls=1, label=0, new_IntlPlan=0, new_VMailPlan=1, features=DenseVector([128.0, 415.0, 265.1, 110.0, 45.07, 197.4, 99.0, 16.78, 244.7, 91.0, 11.01, 10.0, 3.0, 2.7, 1.0, 0.0, 1.0]))]

In [18]:
# Keep only the two columns used for training: the feature vector and the label.
final_data = dataFromAssembler.select('features','label')
# Display the first row of the reduced DataFrame.
final_data.head(1)

[Row(features=DenseVector([128.0, 415.0, 265.1, 110.0, 45.07, 197.4, 99.0, 16.78, 244.7, 91.0, 11.01, 10.0, 3.0, 2.7, 1.0, 0.0, 1.0]), label=0)]

In [19]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook
# draws the same split, which keeps the reported test accuracy comparable
# from one run to the next.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=1234)

In [23]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Topology: 17 inputs (one per assembled feature), three intermediate layers
# of 9, 5 and 5 units, and 2 outputs (one per label value, 0 or 1).
layers = [17, 9, 5, 5, 2]

# Configure the perceptron trainer, reading the target from 'label'.
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    labelCol='label',
    maxIter=100,
    blockSize=128,
    seed=1234,
)

# Fit the network on the training split only.
model = trainer.fit(train_data)

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Run the trained model over the held-out split; the accuracy itself is
# computed from `result` in the following cell.
result = model.transform(test_data)



In [25]:
# Keep just the prediction and label columns for scoring.
predictionAndLabels = result.select("prediction", "label")

# Measure accuracy on those predictions and report it.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = {}".format(test_accuracy))

Test set accuracy = 0.8554216867469879
