In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession for this notebook (getOrCreate reuses one if it
# already exists), registered under the application name 'naive_bayes'.
spark = (
    SparkSession.builder
    .appName('naive_bayes')
    .getOrCreate()
)

In [3]:
# Load the iris data from a local CSV file. The first row is used as the
# header and Spark infers each column's type from the data.
# (An earlier draft selected the columns from an `iris` table via spark.sql.)
irisdf = spark.read.csv(
    'iris_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Show the dataset shape as a (rows, columns) tuple.
print((
    irisdf.count(),       # number of rows
    len(irisdf.columns),  # number of columns
))

(150, 5)


In [5]:
# Summary statistics (count, mean, stddev, min, max) restricted to the four
# numeric measurement columns; the string `species` column is left out.
(
    irisdf
    .describe()
    .select(
        'summary',
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    )
    .show()
)

+-------+------------------+-------------------+------------------+------------------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|
+-------+------------------+-------------------+------------------+------------------+
|  count|               150|                150|               150|               150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|
|    min|               4.3|                2.0|               1.0|               0.1|
|    max|               7.9|                4.4|               6.9|               2.5|
+-------+------------------+-------------------+------------------+------------------+



In [6]:
# Print the column names and inferred types of the loaded DataFrame.
irisdf.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [7]:
from pyspark.ml.feature import StringIndexer

# Indexer stage: reads the string column `species` and writes a numeric
# `label` column for the classifier to train on.
labelIndexer = StringIndexer(
    inputCol="species",
    outputCol="label",
)

In [8]:
from pyspark.ml.feature import VectorAssembler

# Assembler stage: packs the four measurement columns into a single vector
# column named `features`.
vecAssembler = VectorAssembler(
    inputCols=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ],
    outputCol="features",
)

In [9]:
# Randomly split the data with weights 0.7 / 0.3 into train and test sets;
# the fixed seed keeps the split repeatable across runs.
train, test = irisdf.randomSplit(weights=[0.7, 0.3], seed=100)

In [10]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

In [11]:
# Naive Bayes estimator: multinomial model type with a smoothing parameter of 1.0.
nb = NaiveBayes(
    modelType="multinomial",
    smoothing=1.0,
)

In [12]:
# Build the pipeline: the two feature-transformation stages followed by the
# Naive Bayes estimator.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        vecAssembler,
        nb,
    ]
)

In [14]:
model = pipeline.fit(train)   # A pipeline is a sequence of stages, each either an estimator or a transformer; calling fit runs the stages in order.

In [15]:
# Apply the fitted pipeline to the test split. The resulting DataFrame gains the
# label, features, rawPrediction, probability and prediction columns.
predictions = model.transform(test)

In [16]:
# Print the schema of the predictions DataFrame to see the columns added by the pipeline.
predictions.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [17]:
from pyspark.sql.functions import rand, randn

# Show 10 shuffled predictions next to their true labels and class
# probabilities. The shuffle is seeded (same seed as the train/test split) so
# that re-running the notebook displays the same sample instead of a different
# one on every execution.
(
    predictions
    .select("label", "prediction", "probability")
    .orderBy(rand(seed=100))
    .show(10, truncate=False)
)

+-----+----------+-------------------------------------------------------------+
|label|prediction|probability                                                  |
+-----+----------+-------------------------------------------------------------+
|0.0  |0.0       |[0.5112170900860498,0.42378835062808623,0.06499455928586405] |
|0.0  |0.0       |[0.5025414372542717,0.41477046961196545,0.08268809313376278] |
|2.0  |2.0       |[0.15134048617914386,0.0781776035394255,0.7704819102814305]  |
|1.0  |1.0       |[0.4861766353137803,0.5021723246946296,0.011651039991590116] |
|1.0  |1.0       |[0.4728547962801507,0.5179831760092096,0.009162027710639722] |
|1.0  |1.0       |[0.4633454865070346,0.530831347337991,0.005823166154974355]  |
|2.0  |2.0       |[0.24774327022067563,0.1381935673888351,0.6140631623904893]  |
|0.0  |0.0       |[0.5200371667676991,0.4304739552641891,0.04948887796811193]  |
|0.0  |0.0       |[0.5117917806583538,0.4527847963979916,0.03542342294365454]  |
|1.0  |1.0       |[0.4732219

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator    # To evaluate the model we use the evaluator for multiclass classification.

In [19]:
# Evaluator that compares the `prediction` column against the `label` column
# and reports the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName='accuracy',
)

In [20]:
# Compute the evaluator's configured metric over the test-set predictions.
accuracy = evaluator.evaluate(predictions)

In [25]:
# Print the accuracy; the message text is Chinese for "Prediction accuracy: <value>".
print (u'预测的准确性：{}'.format(accuracy))

预测的准确性：0.936170212766


In [24]:
# Display the documentation string for the evaluator's metricName parameter,
# including the supported metric names, the default and the current value.
evaluator.explainParam("metricName")

'metricName: metric name in evaluation (f1|weightedPrecision|weightedRecall|accuracy) (default: f1, current: accuracy)'