# Tree Methods

This notebook walks through three different types of tree methods:

- A single decision tree
- A random forest
- A gradient boosted tree classifier

### Initialize and create a spark session

In [1]:
import org.apache.spark.sql.SparkSession
// The notebook kernel has already started a SparkContext (see the "Using an existing
// SparkContext" warning in the output), so getOrCreate() reuses it rather than starting a second one.
val spark = SparkSession.builder().appName("tree_methods").getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://Varun-CK:4040
SparkContext available as 'sc' (version = 2.3.0, master = local[*], app id = local-1577735186953)
SparkSession available as 'spark'


2019-12-31 01:16:36 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@64054634


### Initializing Logger

In [2]:
import org.apache.log4j._
// Set the "org" logger to ERROR so that lower-severity (INFO/WARN) messages logged
// under it do not clutter the cell output.
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML

In [3]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


### Reading data file

In [4]:
// Load the LIBSVM-formatted sample file into a DataFrame.
val data = spark.read
  .format("libsvm")
  .load("sample_libsvm_data.txt")

data: org.apache.spark.sql.DataFrame = [label: double, features: vector]


### Count

In [5]:
data.count

res1: Long = 100


### Schema

In [6]:
data.printSchema

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



### Show

In [7]:
data.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 3 rows



### Split the data into training and test sets (30% held out for testing)

In [8]:
// Hold out roughly 30% of the rows for testing. The weights are proportions, so the
// actual row counts are approximate. A fixed seed makes the split repeatable, so the
// accuracies of the three models below can be compared from one run to the next.
val Array(train_data, test_data) = data.randomSplit(Array(0.7, 0.3), 42L)

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]


In [9]:
train_data.count

res4: Long = 73


In [10]:
test_data.count

res5: Long = 27


## 1. Decision Tree Classifier Example

In [11]:
import org.apache.spark.ml.classification.DecisionTreeClassifier

import org.apache.spark.ml.classification.DecisionTreeClassifier


### Create a DecisionTreeClassifier

In [12]:
// Configure a decision-tree classifier that reads the "label" and "features" columns.
val dtc = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")

dtc: org.apache.spark.ml.classification.DecisionTreeClassifier = dtc_fd96a3646632


### Train model

In [13]:
// Fit on the training split only; test_data is kept aside for the evaluation below.
val dtc_model = dtc.fit(train_data)

dtc_model: org.apache.spark.ml.classification.DecisionTreeClassificationModel = DecisionTreeClassificationModel (uid=dtc_fd96a3646632) of depth 1 with 3 nodes


### Make predictions

In [15]:
// Score the held-out rows. As the schema printed below shows, transform() keeps
// label/features and appends rawPrediction, probability and prediction columns.
val dtc_predictions = dtc_model.transform(test_data)

dtc_predictions: org.apache.spark.sql.DataFrame = [label: double, features: vector ... 3 more fields]


In [16]:
dtc_predictions.printSchema

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### Select example rows to display

In [17]:
dtc_predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



### Select (prediction, true label) and compute test error

In [18]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


In [19]:
// Evaluator that compares the "prediction" column against "label" using the accuracy metric.
val dtc_evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

dtc_evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_02d7ded17a49


In [20]:
val dtc_accuracy = dtc_evaluator.evaluate(dtc_predictions)

dtc_accuracy: Double = 0.9259259259259259


In [26]:
println(f"Decision Tree Model Accuracy: ${dtc_accuracy}%1.2f")

Decision Tree Model Accuracy: 0.93


In [28]:
println(f"Test Error: ${1-dtc_accuracy}%1.2f")

Test Error: 0.07


#### featureImportances

In [29]:
// Importance of each of the 692 features. The fitted tree has depth 1 (a single split),
// so in the output below all of the importance falls on one feature index.
dtc_model.featureImportances

res14: org.apache.spark.ml.linalg.Vector = (692,[406],[1.0])


## 2. Random Forest Example

In [30]:
import org.apache.spark.ml.classification.RandomForestClassifier

import org.apache.spark.ml.classification.RandomForestClassifier


### Create a RandomForestClassifier

In [31]:
// Configure a random-forest classifier of 20 trees over the "label" and "features" columns.
val rfc = new RandomForestClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setNumTrees(20)

rfc: org.apache.spark.ml.classification.RandomForestClassifier = rfc_bee2362f1689


### Train model (no indexers are needed here: the data already has `label` and `features` columns)

In [32]:
val rfc_model = rfc.fit(train_data)

rfc_model: org.apache.spark.ml.classification.RandomForestClassificationModel = RandomForestClassificationModel (uid=rfc_bee2362f1689) with 20 trees


### Make predictions

In [33]:
val rfc_predictions = rfc_model.transform(test_data)

rfc_predictions: org.apache.spark.sql.DataFrame = [label: double, features: vector ... 3 more fields]


In [34]:
rfc_predictions.printSchema

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### Select example rows to display

In [35]:
rfc_predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



### Select (prediction, true label) and compute test error

In [36]:
// Accuracy evaluator for the random-forest predictions ("prediction" vs. "label").
val rfc_evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

rfc_evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_1688e4988cd8


In [37]:
val rfc_accuracy = rfc_evaluator.evaluate(rfc_predictions)

rfc_accuracy: Double = 0.8888888888888888


In [38]:
println(f"Random Forest Classifier Model Accuracy: ${rfc_accuracy}%1.2f")

Random Forest Classifier Model Accuracy: 0.89


In [39]:
println(f"Test Error: ${1-rfc_accuracy}%1.2f")

Test Error: 0.11


#### featureImportances

In [40]:
rfc_model.featureImportances

res19: org.apache.spark.ml.linalg.Vector = (692,[99,202,208,263,264,266,272,296,317,319,341,350,377,385,386,397,405,407,427,429,433,441,461,462,463,483,489,512,517,518,521,522,524,525,544,552,553,578,627],[0.0032380952380952374,0.006385369287416075,0.005989159891598911,0.00579322638146167,0.005214940098661027,0.0024801587301587287,0.002592592592592594,0.0023035230352303512,0.04420677361853832,0.002289094650205758,0.005624483043837878,0.04740740740740739,0.04026233353936055,0.04132553606237815,0.034033613445378134,0.002325581395348839,0.04478505990133896,0.04480056980056978,0.04733382030679327,0.030965391621129316,0.015862068965517232,0.04333333333333333,0.01596638655462185,0.09999999999999996,0.03731617647058822,0.01903460837887066,0.01268382352941176,0.08636507936507935,0.0499999999999...

## 3. Gradient Boosted Trees Example

Gradient-boosted trees (GBTs) are a popular classification and regression method using ensembles of decision trees. More information about the spark.ml implementation can be found in the section on [GBTs](http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-trees-gbts). For more information on the algorithm itself, please see the [spark.mllib documentation on GBTs](http://spark.apache.org/docs/latest/mllib-ensembles.html#gradient-boosted-trees-gbts).

Luckily, Spark makes GBTs very easy to use; it is basically just an import switch:

In [41]:
import org.apache.spark.ml.classification.GBTClassifier

import org.apache.spark.ml.classification.GBTClassifier


### Create a GBTClassifier

In [43]:
// Configure a gradient-boosted-tree classifier limited to 10 boosting iterations.
val gbt = new GBTClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setMaxIter(10)

gbt: org.apache.spark.ml.classification.GBTClassifier = gbtc_8e098ff08890


### Train model (no indexers are needed here: the data already has `label` and `features` columns)

In [44]:
val gbt_model = gbt.fit(train_data)

gbt_model: org.apache.spark.ml.classification.GBTClassificationModel = GBTClassificationModel (uid=gbtc_8e098ff08890) with 10 trees


### Make predictions

In [45]:
val gbt_predictions = gbt_model.transform(test_data)

gbt_predictions: org.apache.spark.sql.DataFrame = [label: double, features: vector ... 3 more fields]


In [46]:
gbt_predictions.printSchema

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



### Select example rows to display

In [47]:
gbt_predictions.select("prediction", "label", "features").show(5)

2019-12-31 01:51:42 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-31 01:51:42 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



### Select (prediction, true label) and compute test error

In [48]:
// Accuracy evaluator for the GBT predictions ("prediction" vs. "label").
val gbt_evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

gbt_evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_6a64abeabadc


In [49]:
val gbt_accuracy = gbt_evaluator.evaluate(gbt_predictions)

gbt_accuracy: Double = 0.9259259259259259


In [51]:
println(f"Gradient Boosted Trees Model Accuracy: ${gbt_accuracy}%1.2f")

Gradient Boosted Trees Model Accuracy: 0.93


In [52]:
println(f"Test Error: ${1-gbt_accuracy}%1.2f")

Test Error: 0.07


#### featureImportances

In [53]:
gbt_model.featureImportances

res25: org.apache.spark.ml.linalg.Vector = (692,[155,156,181,183,207,211,215,235,237,262,322,323,351,379,404,406,407,433,462,489,490,626],[4.591074018676939E-17,5.017128242383316E-17,4.983016687194528E-19,8.022656866383189E-17,5.646638104071831E-18,4.114535775480902E-17,1.206348603825323E-16,1.231084505730946E-16,9.295672536354343E-17,8.469957156107742E-17,2.1605054205538538E-17,7.150517335657186E-18,6.072469937675837E-17,1.3217862974229966E-16,1.8702435343095012E-17,0.5999999999999994,2.8233190520359137E-18,0.09999999999999998,0.09999999999999995,0.0999999999999999,0.10000000000000002,2.390686472958985E-18])


Note that this data set isn't realistic enough to really judge the effectiveness of GBT models: with only 100 rows, it can make them seem close to perfect, rather than just an improvement on ordinary random forests.

### Closing Spark Session

In [54]:
// Shut down the Spark session now that the notebook is finished; stop() is the last call made on it.
spark.stop()

## Thank You!