# Random Forest

## Module 11. Scalable Data Processing: Machine Learning and Streaming

## Pablo Yañez Martin

In [1]:
import os

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook, running on a
# local[*] master.
spark_session = (
    SparkSession.builder
    .appName("SparkML Random Forest")
    .master("local[*]")
    .getOrCreate()
)

In [5]:
# Location of the LIBSVM-formatted breast-cancer dataset. The default is the
# original author's local path; set the BREAST_CANCER_DATA_PATH environment
# variable to point the notebook at a copy of the file on another machine
# without editing the code.
DATA_PATH = os.environ.get(
    "BREAST_CANCER_DATA_PATH",
    "D:/Dropbox/Pablo/Master/Modulo 11. Scalable Data processing. Machine Learning and Streaming/Practicas/Datos/breast-cancer_scale",
)

# Read the file into a DataFrame with `label` and `features` columns.
data = (
    spark_session.read
    .format("libsvm")
    .load(DATA_PATH)
)

In [6]:
# Sanity-check the load: column types first, then the first 20 rows.
data.printSchema()
data.show(n=20)

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  4.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  4.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  4.0|(10,[0,1,2,3,4,5,...|
|  4.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
|  4.0|(10,[0,1,2,3,4,5,...|
|  2.0|(10,[0,1,2,3,4,5,...|
+-----+--------------------+
only showing top 20 rows



In [7]:
# Index the raw `label` column into `indexedLabel`. The indexer is fitted on
# the full dataset (before the train/test split) so it sees every label value.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(data)

In [8]:
# Index the feature vector into `indexedFeatures`. With maxCategories=2 only
# features with at most two distinct values are treated as categorical; the
# rest are left as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=2,
).fit(data)

In [9]:
# Hold out roughly 25% of the rows for testing. The fixed seed makes the split
# repeatable, so the metrics below do not drift between Restart & Run All runs.
trainingData, testData = data.randomSplit([0.75, 0.25], seed=42)

In [10]:
# Random forest of 10 trees trained on the indexed label/feature columns.
# The seed pins the forest's internal randomness so training is reproducible.
random_forest = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
    seed=42,
)

In [11]:
# Translate the numeric `prediction` column back into the original label
# values, using the label list learned by the fitted label indexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [12]:
# Chain the stages in execution order; the classifier sits at index 2.
pipeline = Pipeline(
    stages=[
        labelIndexer,    # label -> indexedLabel
        featureIndexer,  # features -> indexedFeatures
        random_forest,   # classifier producing `prediction`
        labelConverter,  # prediction -> predictedLabel
    ]
)

In [13]:
# Fit the whole pipeline on the training split only.
model = pipeline.fit(dataset=trainingData)

In [14]:
# Run the fitted pipeline over the held-out test split.
predictions = model.transform(dataset=testData)

In [15]:
# Show predicted vs. true labels for the first five test rows.
(
    predictions
    .select("predictedLabel", "label", "features")
    .show(n=5)
)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           2.0|  2.0|(10,[0,1,2,3,4,5,...|
|           4.0|  2.0|(10,[0,1,2,3,4,5,...|
|           2.0|  2.0|(10,[0,1,2,3,4,5,...|
|           2.0|  2.0|(10,[0,1,2,3,4,5,...|
|           2.0|  2.0|(10,[0,1,2,3,4,5,...|
+--------------+-----+--------------------+
only showing top 5 rows



In [16]:
# Measure accuracy of `prediction` against `indexedLabel` on the test
# predictions, then report it as an error rate (1 - accuracy).
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0232558


In [17]:
# Pull the fitted random forest out of the pipeline model. Index 2 matches the
# position of `random_forest` in the stage list passed to Pipeline
# (labelIndexer, featureIndexer, random_forest, labelConverter).
rfModel = model.stages[2]
# Print the model summary line (uid, number of trees, classes and features).
print(rfModel)

RandomForestClassificationModel: uid=RandomForestClassifier_a902e947c9ba, numTrees=10, numClasses=2, numFeatures=10
