In [1]:
import sys
import os

from pyspark.sql.types import *
import pyspark.sql.functions as func

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Bucketizer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 

import pyspark
from pyspark.sql import SQLContext

if ('sc' not in locals() or 'sc' not in globals()):
    os.environ['PYSPARK_PYTHON'] = '/usr/bin/python2'
    sc = pyspark.SparkContext('local[*]')

sqlContext = SQLContext(sc)

In [2]:
df = sqlContext.read.csv("bezdekIris.data", inferSchema=True)\
.toDF("sep_len", "sep_wid", "pet_len", "pet_wid", "label")

In [3]:
df.show(5)

+-------+-------+-------+-------+-----------+
|sep_len|sep_wid|pet_len|pet_wid|      label|
+-------+-------+-------+-------+-----------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
+-------+-------+-------+-------+-----------+
only showing top 5 rows



In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [5]:
vector_assembler = VectorAssembler(\
inputCols=["sep_len", "sep_wid", "pet_len", "pet_wid"],\
outputCol="features")
df_temp = vector_assembler.transform(df)
df_temp.show(3)

+-------+-------+-------+-------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-------+-------+-------+-------+-----------+-----------------+
only showing top 3 rows



In [6]:
df = df_temp.drop("sep_len").drop("sep_wid").drop("pet_len").drop("pet_wid")
df.show(3)

+-----------+-----------------+
|      label|         features|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-----------+-----------------+
only showing top 3 rows



In [7]:
from pyspark.ml.feature import StringIndexer
l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
df = l_indexer.fit(df).transform(df)
df.show(3)

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
+-----------+-----------------+----------+
only showing top 3 rows



In [8]:
(trainingData, testData) = df.randomSplit([0.7, 0.3])

### Naive Bayes classifier

In [9]:
splits = df.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [11]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol="labelIndex",\
featuresCol="features", smoothing=1.0,\
modelType="multinomial")
model = nb.fit(train)

In [12]:
predictions = model.transform(test)
predictions.select("label", "labelIndex", 
"probability", "prediction").show(150)

+---------------+----------+--------------------+----------+
|          label|labelIndex|         probability|prediction|
+---------------+----------+--------------------+----------+
|    Iris-setosa|       0.0|[0.72723788653438...|       0.0|
|    Iris-setosa|       0.0|[0.64170595827692...|       0.0|
|    Iris-setosa|       0.0|[0.67184222484015...|       0.0|
|    Iris-setosa|       0.0|[0.68647236934182...|       0.0|
|    Iris-setosa|       0.0|[0.79151826954673...|       0.0|
|    Iris-setosa|       0.0|[0.66189579367600...|       0.0|
|    Iris-setosa|       0.0|[0.65307352257988...|       0.0|
|    Iris-setosa|       0.0|[0.73045962362363...|       0.0|
|    Iris-setosa|       0.0|[0.59100133493054...|       0.0|
|    Iris-setosa|       0.0|[0.75334864217418...|       0.0|
|    Iris-setosa|       0.0|[0.71961717211598...|       0.0|
|    Iris-setosa|       0.0|[0.70085626812960...|       0.0|
|    Iris-setosa|       0.0|[0.75135089290995...|       0.0|
|    Iris-setosa|       

In [13]:
evaluator =\
MulticlassClassificationEvaluator(labelCol="labelIndex",\
predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.8235294117647058


In [14]:
print(model)

NaiveBayes_4b2a950a114493b3672a


In [15]:
model.save('./model.nbc')

In [22]:
from pyspark.ml.classification import NaiveBayesModel
new_model = NaiveBayesModel.load('./model.nbc')

In [23]:
predictions = new_model.transform(testData)

In [24]:
predictions.select("prediction", "labelIndex").show(150)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       1.0|
|       2.0|       1.0|
|       1.0|       1.0|
|       2.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       1.0|
|       2.0|       1.0|
|       2.0|       1.0|
|       2.0|       1.0|
|       2.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|    