## [Multilayer Perceptron Classifier](https://spark.apache.org/docs/2.1.0/ml-classification-regression.html#multilayer-perceptron-classifier)

In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one machine; fall back to the original local install path
# when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/asif/spark-2.1.0-bin-hadoop2.7')

# pyspark only becomes importable after findspark.init() has run, so this import
# must stay below it.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName('FFonSeedData').getOrCreate()

In [2]:
# Load the tab-separated seed measurements; there is no header row, so Spark
# assigns positional column names and infers each column's type.
SeedData = spark.read.csv(
    'seed_data.txt',
    sep='\t',
    inferSchema=True,
)

In [3]:
# Print the inferred schema (column names and types) of the freshly loaded data.
SeedData.printSchema()
# Display the auto-generated column names as the cell's output.
SeedData.columns

root
 |-- _c0: double (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: integer (nullable = true)



['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7']

In [4]:
# Replace Spark's positional column names with descriptive ones.
# Each pair is (auto-generated name, descriptive name).
column_renames = [
    ("_c0", "area"),
    ("_c1", "perimeter"),
    ("_c2", "compactness"),
    ("_c3", "length_of_kernel"),
    ("_c4", "width_of_kernel"),
    ("_c5", "asymmetry_coefficient"),
    ("_c6", "length_of_kernel_groove"),
    ("_c7", "label"),
]
for old_name, new_name in column_renames:
    SeedData = SeedData.withColumnRenamed(old_name, new_name)

# Confirm the new names took effect.
SeedData.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_kernel_groove: double (nullable = true)
 |-- label: integer (nullable = true)



In [5]:
# Show the first row as a sanity check on the renamed columns and parsed values.
SeedData.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_kernel_groove=5.22, label=1)]

In [6]:
from pyspark.sql.types import IntegerType

# Make sure the label column is stored with an integer type, then verify via the schema.
label_as_int = SeedData["label"].cast(IntegerType())
SeedData = SeedData.withColumn("label", label_as_int)
SeedData.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_kernel_groove: double (nullable = true)
 |-- label: integer (nullable = true)



In [7]:
# Distinct class labels in the data: first how many there are, then which ones.
distinct_labels = SeedData.select('label').distinct()
print(distinct_labels.count())
distinct_labels.show()

3
+-----+
|label|
+-----+
|    1|
|    3|
|    2|
+-----+



In [8]:

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# The raw labels are 1, 2, 3, but the classifier trained below has three output
# nodes and needs class indices 0, 1, 2. Shift the labels so the smallest becomes 0.
#
# The offset is the current minimum label rather than a fixed 1, which makes this
# cell idempotent: re-running it after the labels are already zero-based subtracts 0
# instead of pushing them to -1..1. `or 0` covers an empty/all-null label column,
# where the aggregate returns None.
label_offset = SeedData.agg({'label': 'min'}).first()[0] or 0

# Use a native column expression instead of a Python UDF: it avoids per-row Python
# serialization and leaves null labels as null instead of raising inside the UDF.
SeedData = SeedData.withColumn('label', (col('label') - label_offset).cast(IntegerType()))

# Inspect the result: sample rows, schema, and the distinct labels after the shift.
SeedData.show()
SeedData.printSchema()
shifted_labels = SeedData.select('label').distinct()
print(shifted_labels.count())
shifted_labels.show()

+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+-----+
| area|perimeter|compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_kernel_groove|label|
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+-----+
|15.26|    14.84|      0.871|           5.763|          3.312|                2.221|                   5.22|    0|
|14.88|    14.57|     0.8811|           5.554|          3.333|                1.018|                  4.956|    0|
|14.29|    14.09|      0.905|           5.291|          3.337|                2.699|                  4.825|    0|
|13.84|    13.94|     0.8955|           5.324|          3.379|                2.259|                  4.805|    0|
|16.14|    14.99|     0.9034|           5.658|          3.562|                1.355|                  5.175|    0|
|14.38|    14.21|     0.8951|           5.386|          3.312|                2.

In [9]:
# Show the first rows of the label column after the shift to zero-based labels.
SeedData.select('label').show()

+-----+
|label|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 20 rows



In [10]:
# Summary statistics for the label column (count, mean, stddev, min, max);
# min and max confirm the range of the shifted labels.
SeedData.describe('label').show()

+-------+-----------------+
|summary|            label|
+-------+-----------------+
|  count|              210|
|   mean|              1.0|
| stddev|0.818447591071135|
|    min|                0|
|    max|                2|
+-------+-----------------+



In [11]:
# List the column names; all but 'label' are used as model features below.
SeedData.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_kernel_groove',
 'label']

In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# The seven numeric measurements that are packed into one feature vector per row.
feature_columns = [
    'area',
    'perimeter',
    'compactness',
    'length_of_kernel',
    'width_of_kernel',
    'asymmetry_coefficient',
    'length_of_kernel_groove',
]

# Combines the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [13]:
# Append the assembled 'features' vector column to the data.
dataFromAssembler = assembler.transform(SeedData)
# Show one row to check that the vector holds the seven measurements.
dataFromAssembler.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_kernel_groove=5.22, label=0, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]))]

In [14]:
# Keep only the two columns the classifier needs: the feature vector and the label.
final_data = dataFromAssembler.select('features','label')
# Show one row of the model-ready data.
final_data.head(1)

[Row(features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), label=0)]

In [15]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 70/30 train/test split. The split is seeded (with the same seed as the trainer)
# so that the held-out rows, and therefore the reported accuracy, do not change
# from one run of the notebook to the next.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=1234)

# Network topology: 7 input features, hidden layers of 7, 5 and 3 nodes,
# and 3 output nodes (one per class).
layers = [7, 7, 5, 3, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234, labelCol='label')

# train the model
model = trainer.fit(train_data)

# score the held-out test set; adds a 'prediction' column used for evaluation
result = model.transform(test_data)


In [16]:
# Compare predicted classes against the true labels on the test set.
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Fraction of test rows whose predicted class matches the label.
test_accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(test_accuracy))

Test set accuracy = 0.8333333333333334
