# Random Forest Example

This is just a quick walkthrough of the Documentation's Example of Random Forest:

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier\
                                        ,GBTClassifier, DecisionTreeClassifier

# Trees for regression
#from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('rf').getOrCreate()

In [3]:
# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("sample_libsvm_data.txt")

In [None]:
data.show()

In [None]:
data.head(1)

In [4]:
# Split the data into training and test sets (30% held out for testing)
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [5]:
trainingData.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
trainingData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[95,96,97,12...|
|  0.0|(692,[122,123,124...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
+-----+--------------------+
only showing top 5 rows



In [7]:
testData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[100,101,102...|
|  0.0|(692,[121,122,123...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[126,127,128...|
+-----+--------------------+
only showing top 5 rows



In [8]:
# dtc classifier
dtc = DecisionTreeClassifier(featuresCol='features',labelCol='label')

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=200)

In [9]:
dtc_model = dtc.fit(trainingData)

# Train model.  This also runs the indexers.
rfc_model = rf.fit(trainingData)

In [13]:
# Make predictions. # decision tree
dtc_predictions = dtc_model.transform(testData)

# Make predictions. # random forest
rfc_predictions = rfc_model.transform(testData)

In [14]:
dtc_predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [15]:
#rfc_predictions.printSchema()
#gbt_predictions.printSchema()

In [16]:
# Select example rows to display.
dtc_predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows



In [17]:
rfc_predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows



In [18]:
#gbt_predictions.select("prediction", "label", "features").show(5)

In [19]:
# Select (prediction, true label) and compute test error
# MulticlassClassificationEvaluator can be used as binary classification eveluator
# We can directly find out recall, precision other than accuracy etc
evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")

In [20]:
accuracy = evaluator.evaluate(dtc_predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.04


In [21]:
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Error = {}".format(1.0 - accuracy))

Test Error = 0.0


In [22]:
# Not a very good example to show this!
# We will see this in next project in Tree models - Consulting project
dtc_model.featureImportances

SparseVector(692, {99: 0.0525, 434: 0.9475})

## Gradient Boosted Trees

Gradient-boosted trees (GBTs) are a popular classification and regression method using ensembles of decision trees. More information about the spark.ml implementation can be found further in the section on [GBTs.](http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-trees-gbts). For more information on the algorithm itself, please see the [spark.mllib documentation on GBTs.](http://spark.apache.org/docs/latest/mllib-ensembles.html#gradient-boosted-trees-gbts)

Luckily Spark makes very easy to use, basically just an import switch:

In [23]:
from pyspark.ml.classification import GBTClassifier

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("sample_libsvm_data.txt")


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=200)

# Train model.  This also runs the indexers.
model = gbt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
+----------+-----+--------------------+
only showing top 5 rows



In [24]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0


So this data isn't really realistic enough to really judge to effectiveness of GBT models, this data makes it seem like they are perfection, instead of just an improvement on normal Random Forests.