In [1]:
import sys
import os

from pyspark.sql.types import *
import pyspark.sql.functions as func

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Bucketizer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 

import pyspark
from pyspark.sql import SQLContext

# Create a local SparkContext unless one is already bound to `sc` (for example
# by a pyspark shell, or by an earlier run of this cell). Cell code runs at
# module scope, so a single globals() lookup is sufficient.
if 'sc' not in globals():
    # Run Spark's Python workers on the same interpreter as this kernel rather
    # than a hard-coded /usr/bin/python2, which may be missing or may not match
    # the driver's Python version. An interpreter already selected through
    # PYSPARK_PYTHON is left untouched.
    os.environ.setdefault('PYSPARK_PYTHON', sys.executable)
    sc = pyspark.SparkContext('local[*]')

# DataFrame entry point used by the cells below.
sqlContext = SQLContext(sc)

In [2]:
# Load the headerless iris CSV, let Spark infer the column types, and give the
# five positional columns readable names.
df = (
    sqlContext.read
    .csv("bezdekIris.data", inferSchema=True)
    .toDF("sep_len", "sep_wid", "pet_len", "pet_wid", "label")
)

In [3]:
# Preview the first five rows to confirm the columns were parsed and named as intended.
df.show(5)

+-------+-------+-------+-------+-----------+
|sep_len|sep_wid|pet_len|pet_wid|      label|
+-------+-------+-------+-------+-----------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
+-------+-------+-------+-------+-----------+
only showing top 5 rows



In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [5]:
# Combine the four measurement columns into one vector column named
# "features", then preview the result.
vector_assembler = VectorAssembler(
    inputCols=["sep_len", "sep_wid", "pet_len", "pet_wid"],
    outputCol="features")
df_temp = vector_assembler.transform(df)
df_temp.show(3)

+-------+-------+-------+-------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-------+-------+-------+-------+-----------+-----------------+
only showing top 3 rows



In [6]:
# The raw measurement columns now live inside "features", so remove all four
# in one call and keep only the label and the assembled vector.
df = df_temp.drop("sep_len", "sep_wid", "pet_len", "pet_wid")
df.show(3)

+-----------+-----------------+
|      label|         features|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-----------+-----------------+
only showing top 3 rows



In [7]:
from pyspark.ml.feature import StringIndexer

# Encode the string species label as a numeric index column ("labelIndex").
# The indexer is fitted first, then applied to the same frame.
l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
l_indexer_model = l_indexer.fit(df)
df = l_indexer_model.transform(df)
df.show(3)

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
+-----------+-----------------+----------+
only showing top 3 rows



In [8]:
# Hold out roughly 30% of the rows for testing. Without a seed the split is
# drawn differently on every run, so the recorded predictions and error rate
# could not be reproduced; a fixed seed pins it (for the same input frame).
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=42)

### Random forest classifier

In [14]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [10]:
# Configure a 10-tree random forest on the assembled features and the indexed
# label, then fit it on the training split.
rf = RandomForestClassifier(
    labelCol="labelIndex",
    featuresCol="features",
    numTrees=10)
model = rf.fit(trainingData)

In [11]:
# Score the held-out split with the fitted forest; the result includes the
# "prediction" column that the following cells select and evaluate.
predictions = model.transform(testData)

In [12]:
# Show predicted vs. true class index side by side (at most 150 rows).
predictions.select(["prediction", "labelIndex"]).show(n=150)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       1.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       1.0|    

In [15]:
# Measure accuracy of "prediction" against "labelIndex" on the scored test
# split and report its complement as the test error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndex",
    predictionCol="prediction",
    metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.047619


In [16]:
# Print the fitted model's one-line description (uid and number of trees in the recorded output).
print(model)

RandomForestClassificationModel (uid=rfc_ccd0df0683e0) with 10 trees


In [17]:
# Persist the fitted forest to disk. Going through write().overwrite() keeps
# this cell re-runnable: a plain save() raises once './model.rfc' already
# exists from a previous run.
model.write().overwrite().save('./model.rfc')

In [26]:
from pyspark.ml.classification import RandomForestClassificationModel

# Read the persisted forest back from disk into a fresh model object.
new_model = RandomForestClassificationModel.read().load('./model.rfc')

In [27]:
# Re-score the held-out split with the reloaded model. This rebinds
# `predictions`, replacing the frame produced by the in-memory model.
predictions = new_model.transform(testData)

In [28]:
# Show the reloaded model's predicted vs. true class index (at most 150 rows)
# for comparison with the output of the in-memory model.
predictions.select(["prediction", "labelIndex"]).show(n=150)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       1.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       1.0|    