In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext as sc
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [2]:
spark = SparkSession.builder.master("local").appName("mylogreg").getOrCreate()

In [3]:
data = spark.read.option('header','true').csv("C:/Users/user/Projects/Datasets/iris.csv", inferSchema= True)

In [4]:
data.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, variety='Setosa')]

In [5]:
data.show(5)


+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [8]:
#Creating a feature vector using vector assembler
vectorassembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", 
                                             "petal_length", "petal_width"], outputCol="features")

In [9]:
vector_data = vectorassembler.transform(data)

In [10]:
vector_data.show(1)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----------------+
only showing top 1 row



In [11]:
indexer = StringIndexer(inputCol="variety", outputCol="label")

In [12]:
indexed_vectorized_data = indexer.fit(vector_data).transform(vector_data)

In [14]:
indexed_vectorized_data.show(5)

+------------+-----------+------------+-----------+-------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|label|
+------------+-----------+------------+-----------+-------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|  2.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|[4.9,3.0,1.4,0.2]|  2.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|[4.7,3.2,1.3,0.2]|  2.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|[4.6,3.1,1.5,0.2]|  2.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|[5.0,3.6,1.4,0.2]|  2.0|
+------------+-----------+------------+-----------+-------+-----------------+-----+
only showing top 5 rows



In [15]:
#Spliting the data into train and test set
train_test_split = indexed_vectorized_data.randomSplit([0.7,0.3])

In [16]:
train_df = train_test_split[0]
test_df = train_test_split[1]

In [20]:
print(train_df.count())
print(test_df.count())

108
42


In [25]:
#Model building and evaluation
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
dt.evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                                 predictionCol="prediction", metricName='accuracy')

In [30]:
dt.accuracy = dt.evaluator.evaluate(dt_predictions)
dt.accuracy

0.9285714285714286

The decision tree classifier is able to classify the flower type with an accuracy of 92.86%