In [1]:
from os import truncate
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnull, when, count, col
from pyspark.ml.linalg import Vectors

# Obtain the SparkSession for this notebook under the application name "Iris Data".
spark = (
    SparkSession.builder
    .appName("Iris Data")
    .getOrCreate()
)

22/11/24 10:13:12 WARN Utils: Your hostname, razer-arch resolves to a loopback address: 127.0.1.1; using 10.0.0.243 instead (on interface wlo1)
22/11/24 10:13:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Read Iris.csv with the CSV reader, treating the first line as the column header.
dataset = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Iris.csv")
)

# Preview the first 20 rows of the loaded data.
dataset.show(n=20)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [3]:
# The four measurement columns must be numeric (double) before they can be
# assembled into a feature vector; "Species" is the label and is left untouched.
for measurement in ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"):
    # Replace the column in place with its double-typed version.
    dataset = dataset.withColumn(measurement, dataset[measurement].cast(DoubleType()))

# Preview the first 20 rows after the casts.
dataset.show(n=20)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [4]:
# Names of the measurement columns used as model inputs.
required_features = [
    "SepalLengthCm",
    "SepalWidthCm",
    "PetalLengthCm",
    "PetalWidthCm",
]

# The assembler combines the input columns into a single vector column "features".
vectorAssembler = VectorAssembler(outputCol="features", inputCols=required_features)

# Apply the assembler (adds the "features" column) and preview the first 20 rows.
dataset2 = vectorAssembler.transform(dataset)
dataset2.show(n=20)

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|  9|     

In [5]:
# The individual measurements are now carried by the "features" vector, so the
# separate measurement columns are removed from the working frame.
raw_measurement_cols = ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")
df = dataset2.drop(*raw_measurement_cols)

# Preview the first 3 rows of the slimmed-down frame.
df.show(n=3)

+---+-----------+-----------------+
| Id|    Species|         features|
+---+-----------+-----------------+
|  1|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|Iris-setosa|[4.7,3.2,1.3,0.2]|
+---+-----------+-----------------+
only showing top 3 rows



In [6]:
# Encode the text label "Species" as a numeric class index in a new "Group" column
# (one index per distinct species, e.g. 0.0, 1.0, 2.0 -- not a binary 0/1 flag).
# NOTE: despite its name, dataset3 is the StringIndexer stage, not a DataFrame.
dataset3 = StringIndexer(inputCol="Species", outputCol="Group")

# Dropping "Group" first is a no-op on the first run, but lets this cell be re-run:
# without it, a second run fails because the output column already exists in df.
df = df.drop("Group")

# Fit the indexer on the Species values, then append the "Group" column.
df = dataset3.fit(df).transform(df)
df.show(3)

+---+-----------+-----------------+-----+
| Id|    Species|         features|Group|
+---+-----------+-----------------+-----+
|  1|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|  2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|  3|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
+---+-----------+-----------------+-----+
only showing top 3 rows



In [7]:
# Randomly split the rows with weights 0.8 (training) and 0.2 (testing).
# A fixed seed keeps the split, and therefore the metrics computed from it,
# repeatable across kernel restarts instead of changing on every run.
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Configure a decision-tree classifier: inputs come from the "features" vector
# column and the target is the numeric "Group" label column.
dec_tree = DecisionTreeClassifier(featuresCol="features", labelCol="Group")

# Fit the classifier on the training split.
model = dec_tree.fit(trainingData)

In [9]:
# Run the fitted model over the test split to obtain its predictions.
predictions = model.transform(testData)

# Show the predicted class next to the true label ("Group") for the first 5 rows.
predicted_vs_actual = predictions.select("prediction", "Group")
predicted_vs_actual.show(n=5)

+----------+-----+
|prediction|Group|
+----------+-----+
|       2.0|  2.0|
|       2.0|  2.0|
|       0.0|  0.0|
|       2.0|  2.0|
|       2.0|  2.0|
+----------+-----+
only showing top 5 rows



In [10]:
# Evaluator configured for the "accuracy" metric, comparing the "prediction"
# column against the true label column "Group".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Group",
    predictionCol="prediction",
)

# `predictions` was produced from testData, so this is test-set accuracy.
accuracy = evaluator.evaluate(predictions)

# Report the result.
print("Test Accuracy of Decision Tree =", accuracy)

Test Accuracy of Decision Tree = 0.9444444444444444
