
### IRIS Dataset

In [1]:
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import StringIndexer,IndexToString, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Spark-IRIS-Classifier')
    .getOrCreate()
)

24/01/28 14:37:49 WARN Utils: Your hostname, Sais-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.150.141.237 instead (on interface en0)
24/01/28 14:37:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/28 14:37:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Read Data

In [3]:
# Fetch the UCI Iris CSV through Spark's file distribution and load it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
spark.sparkContext.addFile(url)

# The file has no header row: read it with inferred column types and name the
# columns with toDF(), which renames in place instead of round-tripping every
# row through the Python RDD API and re-inferring the schema.
# NOTE(review): the UCI description lists the columns as sepal length, sepal
# width, petal length, petal width, class -- the petal/sepal names below look
# swapped. They are kept as-is because later cells reference them by name.
df = spark.read.csv(SparkFiles.get("iris.data"), header=False, inferSchema=True).toDF(
    "PetalWidth", "PetalLength", "SepalWidth", "SepalLength", "Label"
)

In [4]:
# Preview the loaded rows (20 is show()'s default row count).
df.show(n=20)

+----------+-----------+----------+-----------+-----------+
|PetalWidth|PetalLength|SepalWidth|SepalLength|      Label|
+----------+-----------+----------+-----------+-----------+
|       5.1|        3.5|       1.4|        0.2|Iris-setosa|
|       4.9|        3.0|       1.4|        0.2|Iris-setosa|
|       4.7|        3.2|       1.3|        0.2|Iris-setosa|
|       4.6|        3.1|       1.5|        0.2|Iris-setosa|
|       5.0|        3.6|       1.4|        0.2|Iris-setosa|
|       5.4|        3.9|       1.7|        0.4|Iris-setosa|
|       4.6|        3.4|       1.4|        0.3|Iris-setosa|
|       5.0|        3.4|       1.5|        0.2|Iris-setosa|
|       4.4|        2.9|       1.4|        0.2|Iris-setosa|
|       4.9|        3.1|       1.5|        0.1|Iris-setosa|
|       5.4|        3.7|       1.5|        0.2|Iris-setosa|
|       4.8|        3.4|       1.6|        0.2|Iris-setosa|
|       4.8|        3.0|       1.4|        0.1|Iris-setosa|
|       4.3|        3.0|       1.1|     

### Feature Engineering

In [5]:
# Prepare the data: an indexer for the string class label, and an assembler
# that packs the four numeric measurement columns into a single vector column.
labelIndexer = StringIndexer(inputCol="Label", outputCol="LabelIndex")
vectorAssembler = VectorAssembler(inputCols=["PetalWidth","PetalLength","SepalWidth","SepalLength"],
                                  outputCol="features")

# Drop any "features" column left over from an earlier run of this cell:
# the assembler refuses to overwrite an existing output column, so without
# this the cell fails when re-executed. drop() is a no-op if the column is absent.
df = vectorAssembler.transform(df.drop("features"))

In [6]:
# Check that the assembled "features" vector column was added.
df.show(n=5)

+----------+-----------+----------+-----------+-----------+-----------------+
|PetalWidth|PetalLength|SepalWidth|SepalLength|      Label|         features|
+----------+-----------+----------+-----------+-----------+-----------------+
|       5.1|        3.5|       1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|       4.9|        3.0|       1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|       4.7|        3.2|       1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|       4.6|        3.1|       1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|       5.0|        3.6|       1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+----------+-----------+----------+-----------+-----------+-----------------+
only showing top 5 rows



In [7]:
# Learn the label -> numeric index mapping from the data, then apply it to
# add the "LabelIndex" column used as the training target.
index_model = labelIndexer.fit(df)
data_indexed = index_model.transform(df)

# Preview the first rows with the new index column.
data_indexed.show(n=5)

+----------+-----------+----------+-----------+-----------+-----------------+----------+
|PetalWidth|PetalLength|SepalWidth|SepalLength|      Label|         features|LabelIndex|
+----------+-----------+----------+-----------+-----------+-----------------+----------+
|       5.1|        3.5|       1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|       4.9|        3.0|       1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|       4.7|        3.2|       1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|       4.6|        3.1|       1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|       5.0|        3.6|       1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
+----------+-----------+----------+-----------+-----------+-----------------+----------+
only showing top 5 rows



In [8]:
# Split roughly 70/30 into training and test sets. The seed is passed as an
# integer keyword (randomSplit documents an int seed); the value is unchanged,
# so the split stays reproducible across runs.
train_df, test_df = data_indexed.randomSplit([0.7, 0.3], seed=0)

### Train Model

In [9]:
# Configure a multinomial Naive Bayes classifier (additive smoothing of 1.0)
# over the assembled feature vector and indexed label, then fit it on the
# training split.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="LabelIndex",
    smoothing=1.0,
    modelType="multinomial",
)
model = nb.fit(train_df)

### Prediction

In [10]:
# Score the held-out test split; transform() appends the rawPrediction,
# probability and prediction columns shown in the output.
predictions = model.transform(test_df)
predictions.show(n=5)

+----------+-----------+----------+-----------+-----------+-----------------+----------+--------------------+--------------------+----------+
|PetalWidth|PetalLength|SepalWidth|SepalLength|      Label|         features|LabelIndex|       rawPrediction|         probability|prediction|
+----------+-----------+----------+-----------+-----------+-----------------+----------+--------------------+--------------------+----------+
|       4.3|        3.0|       1.1|        0.1|Iris-setosa|[4.3,3.0,1.1,0.1]|       0.0|[-9.9902097158302...|[0.70655885100260...|       0.0|
|       4.5|        2.3|       1.3|        0.3|Iris-setosa|[4.5,2.3,1.3,0.3]|       0.0|[-10.493440353726...|[0.51837961017414...|       0.0|
|       4.6|        3.6|       1.0|        0.2|Iris-setosa|[4.6,3.6,1.0,0.2]|       0.0|[-11.037834570829...|[0.77208818109349...|       0.0|
|       4.9|        3.1|       1.5|        0.1|Iris-setosa|[4.9,3.1,1.5,0.1]|       0.0|[-11.289649900781...|[0.68248569531348...|       0.0|
|     

24/01/28 14:39:53 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [11]:
# Map the numeric predictions back to species names, using the label order
# learned by the fitted StringIndexer model.
converter = IndexToString(
    inputCol="prediction",
    outputCol="PredictedLabel",
    labels=index_model.labels,
)
converted = converter.transform(predictions)


In [12]:
# Show the actual and predicted species next to each feature vector.
side_by_side = converted.select('features', 'Label', 'PredictedLabel')
side_by_side.show(n=5)

+-----------------+-----------+--------------+
|         features|      Label|PredictedLabel|
+-----------------+-----------+--------------+
|[4.3,3.0,1.1,0.1]|Iris-setosa|   Iris-setosa|
|[4.5,2.3,1.3,0.3]|Iris-setosa|   Iris-setosa|
|[4.6,3.6,1.0,0.2]|Iris-setosa|   Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|   Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|   Iris-setosa|
+-----------------+-----------+--------------+
only showing top 5 rows



### Model Evaluation

#### Using MulticlassClassificationEvaluator() for evaluating the model

In [14]:
# One evaluator is reused for every metric; the metric (and, for per-label
# metrics, the label) is overridden per call through the param map.
e = MulticlassClassificationEvaluator(labelCol="LabelIndex", predictionCol="prediction")

# Metrics computed over all classes, printed in this order.
overall_metrics = [
    "accuracy",
    "f1",
    "weightedPrecision",
    "weightedRecall",
    "weightedTruePositiveRate",
    "weightedFalsePositiveRate",
]
for metric_name in overall_metrics:
    print(e.evaluate(predictions, {e.metricName: metric_name}))

# Metrics computed for one specific label index each: (metric, label index).
per_label_metrics = [
    ("truePositiveRateByLabel", 0.0),
    ("precisionByLabel", 1.0),
    ("recallByLabel", 2.0),
]
for metric_name, label_index in per_label_metrics:
    print(e.evaluate(predictions, {e.metricName: metric_name, e.metricLabel: label_index}))

0.926829268292683
0.9264475079533403
0.9425087108013936
0.9268292682926829
0.9268292682926829
0.02682926829268293
1.0
0.7857142857142857
0.7692307692307693


The evaluation metrics for the above Naive Bayes classification model on the IRIS dataset indicate good overall performance on the held-out test split. With an accuracy of 92.68% and an F1 score of 92.64%, the model shows a high level of correctness and a good balance between precision and recall. The weighted precision and recall, which average the per-class scores weighted by each class's share of the test rows, are 94.25% and 92.68%, respectively. The model achieves perfect recall for the Iris Setosa class (label 0.0), while precision for the Iris Versicolor class (label 1.0) is lower at 78.57% and recall for the Iris Virginica class (label 2.0) is 76.92%, which suggests that the remaining errors are mostly confusions between these two species. The weighted false positive rate is low at 2.68%. Note that the Iris dataset is balanced (50 samples per species), so these results do not demonstrate any particular ability to handle class imbalance; overall, the metrics show that the model classifies Iris species accurately, with Setosa being the easiest class to separate.