In [1]:
!pip install pyspark
from pyspark.sql import SparkSession



In [5]:
spark = SparkSession.builder.appName('IrisDataset').getOrCreate()

dataset = spark.read.csv('/content/bezdekIris.data',inferSchema=True, header =True)\
.toDF("sep_len", "sep_wid", "pet_len", "pet_wid", "label")

dataset.select('label').distinct().show(10)
dataset.count()

+---------------+
|          label|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



149

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
vector_assembler = VectorAssembler(\
inputCols=["sep_len", "sep_wid", "pet_len", "pet_wid"],\
outputCol="features")
df_temp = vector_assembler.transform(dataset)
df_temp.show(3)

+-------+-------+-------+-------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
+-------+-------+-------+-------+-----------+-----------------+
only showing top 3 rows



In [7]:
#Let’s remove unnecessary columns:
df = df_temp.drop('sep_len', 'sep_wid', 'pet_len', 'pet_wid')
df.show(3)

+-----------+-----------------+
|      label|         features|
+-----------+-----------------+
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
|Iris-setosa|[4.6,3.1,1.5,0.2]|
+-----------+-----------------+
only showing top 3 rows



In [8]:
from pyspark.ml.feature import StringIndexer
l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
df = l_indexer.fit(df).transform(df)

df.select('label','labelIndex').distinct().show(3)

+---------------+----------+
|          label|labelIndex|
+---------------+----------+
|Iris-versicolor|       0.0|
| Iris-virginica|       1.0|
|    Iris-setosa|       2.0|
+---------------+----------+



In [9]:
(trainingData, testData) = df.randomSplit([0.7, 0.3])

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
dt = DecisionTreeClassifier(labelCol="labelIndex", featuresCol="features",impurity='entropy', maxDepth=4,seed=1234)
model = dt.fit(trainingData)
predictions = model.transform(testData)

In [12]:
evaluator = MulticlassClassificationEvaluator(\
labelCol="labelIndex", predictionCol="prediction",\
metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test accuracy =  " , accuracy)
print(model.toDebugString)

Test accuracy =   0.9183673469387755
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6243357f3867, depth=4, numNodes=9, numClasses=3, numFeatures=4
  If (feature 2 <= 2.35)
   Predict: 2.0
  Else (feature 2 > 2.35)
   If (feature 3 <= 1.65)
    If (feature 2 <= 4.95)
     Predict: 0.0
    Else (feature 2 > 4.95)
     If (feature 1 <= 2.6500000000000004)
      Predict: 1.0
     Else (feature 1 > 2.6500000000000004)
      Predict: 0.0
   Else (feature 3 > 1.65)
    Predict: 1.0



In [14]:


# this is code for multiple classification using logistic Regression
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LogisticRegression
train, test = df.randomSplit([0.7, 0.3], seed = 2018)
lr = LogisticRegression(maxIter=100, \

                        featuresCol="features", \

                        labelCol='labelIndex')

In [18]:
#from pyspark.ml import Pipeline
#pipeline_ovr = Pipeline(stages=[vecAssembler, stdScaler, ovr])
#pipelineModel_ovr = pipeline_ovr.fit(trainDF)




In [22]:

print("Test accuracy =  " , accuracy)

Test accuracy =   0.9361702127659575
