In [1]:
# downloads
!pip install -Uq pyspark py4j

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 4.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for pyspark (setup.py) ... done


In [2]:
# standard imports
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Build the Spark session for this notebook (an already running one is reused).
spark = (
    SparkSession.builder
    .appName('irisClassification')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [4]:
# Read the iris CSV with pandas, hand it to Spark, and mark the result as cached.
irisPdf = pd.read_csv('/content/irisDataset.csv')
df = spark.createDataFrame(data=irisPdf).cache()
print(f'DataFrame cached: {df.is_cached}')
# Preview the first 10 rows.
df.show(10)

DataFrame cached: True
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows



In [5]:
# Report the DataFrame shape as "<row count>, <column count>".
nRows, nCols = df.count(), len(df.columns)
print(f'Shape: {nRows}, {nCols}')

Shape: 150, 5


In [6]:
# Encode the string 'species' column as a numeric 'label' column.
indexer = StringIndexer().setInputCol('species').setOutputCol('label')
indexerModel = indexer.fit(df)
# Append the 'label' column to the working DataFrame and preview it.
df = indexerModel.transform(df)
df.show(n=10)

+------------+-----------+------------+-----------+-------+-----+
|sepal_length|sepal_width|petal_length|petal_width|species|label|
+------------+-----------+------------+-----------+-------+-----+
|         5.1|        3.5|         1.4|        0.2| setosa|  0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|  0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|  0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|  0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|  0.0|
|         5.4|        3.9|         1.7|        0.4| setosa|  0.0|
|         4.6|        3.4|         1.4|        0.3| setosa|  0.0|
|         5.0|        3.4|         1.5|        0.2| setosa|  0.0|
|         4.4|        2.9|         1.4|        0.2| setosa|  0.0|
|         4.9|        3.1|         1.5|        0.1| setosa|  0.0|
+------------+-----------+------------+-----------+-------+-----+
only showing top 10 rows



In [7]:
# Count rows per (species, label) pair to see the class distribution.
(
    df
    .groupBy('species', 'label')
    .count()
    .show()
)

+----------+-----+-----+
|   species|label|count|
+----------+-----+-----+
|    setosa|  0.0|   50|
|versicolor|  1.0|   50|
| virginica|  2.0|   50|
+----------+-----+-----+



In [8]:
# Pack every column except 'species' and 'label' into a single 'features' vector.
featureCols = [name for name in df.columns if name not in ('species', 'label')]
assembler = VectorAssembler(inputCols=featureCols, outputCol='features')
df = assembler.transform(df)
# truncate=False keeps the full vector visible in the preview.
df.show(10, truncate=False)

+------------+-----------+------------+-----------+-------+-----+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|label|features         |
+------------+-----------+------------+-----------+-------+-----+-----------------+
|5.1         |3.5        |1.4         |0.2        |setosa |0.0  |[5.1,3.5,1.4,0.2]|
|4.9         |3.0        |1.4         |0.2        |setosa |0.0  |[4.9,3.0,1.4,0.2]|
|4.7         |3.2        |1.3         |0.2        |setosa |0.0  |[4.7,3.2,1.3,0.2]|
|4.6         |3.1        |1.5         |0.2        |setosa |0.0  |[4.6,3.1,1.5,0.2]|
|5.0         |3.6        |1.4         |0.2        |setosa |0.0  |[5.0,3.6,1.4,0.2]|
|5.4         |3.9        |1.7         |0.4        |setosa |0.0  |[5.4,3.9,1.7,0.4]|
|4.6         |3.4        |1.4         |0.3        |setosa |0.0  |[4.6,3.4,1.4,0.3]|
|5.0         |3.4        |1.5         |0.2        |setosa |0.0  |[5.0,3.4,1.5,0.2]|
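|4.4         |2.9        |1.4         |0.2        |setosa |0.0  |[4.4,2.9,1.4,0.2]|
|4.9         |3.1        |1.5         |0.1        |setosa |0.0  |[4.9,3.1,1.5,0.1]|
+------------+-----------+------------+-----------+-------+-----+-----------------+
only showing top 10 rows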

In [9]:
# Keep only the two columns the classifier consumes: feature vector and label.
modelCols = ['features', 'label']
finalDf = df.select(modelCols)
finalDf.show(10, truncate=False)

+-----------------+-----+
|features         |label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|0.0  |
|[4.9,3.0,1.4,0.2]|0.0  |
|[4.7,3.2,1.3,0.2]|0.0  |
|[4.6,3.1,1.5,0.2]|0.0  |
|[5.0,3.6,1.4,0.2]|0.0  |
|[5.4,3.9,1.7,0.4]|0.0  |
|[4.6,3.4,1.4,0.3]|0.0  |
|[5.0,3.4,1.5,0.2]|0.0  |
|[4.4,2.9,1.4,0.2]|0.0  |
|[4.9,3.1,1.5,0.1]|0.0  |
+-----------------+-----+
only showing top 10 rows



In [10]:
# Random train/test split with a fixed seed.
trainDf, testDf = finalDf.randomSplit(
    [0.8, 0.2],  # train weight, test weight
    seed=42,
)

In [11]:
# Fit a multilayer perceptron on the training split.
# Layer sizes: 4 input features, two hidden layers of 10 units, 3 output classes.
layerSizes = [4, 10, 10, 3]
mlp = MultilayerPerceptronClassifier(seed=42, layers=layerSizes)
mlpModel = mlp.fit(trainDf)

In [12]:
# Score the fitted model on the test split using accuracy.
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
testPredictions = mlpModel.transform(testDf)
testAccuracy = evaluator.evaluate(testPredictions)
print('Accuracy:', testAccuracy)

Accuracy: 1.0


In [13]:
# Stop the Spark session now that the notebook is finished with it;
# cells that use `spark` cannot be re-run after this without recreating the session.
spark.stop()