In [1]:
# Boilerplate: make the local Spark installation importable, then start a session.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to
# one machine's directory layout; fall back to the original install path
# when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/michael/spark-2.1.0-bin-hadoop2.7/'))

# pyspark is imported after findspark.init(), matching the original cell order.
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('mytree').getOrCreate()

In [2]:
from pyspark.ml import Pipeline

In [8]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [5]:
# Read the LIBSVM-formatted sample file into a DataFrame.
data = spark.read.load('sample_libsvm_data.txt', format='libsvm')

In [6]:
# Preview the first 20 rows (long cell values are truncated for display).
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [7]:
# 70/30 train/test split. The fixed seed makes the split repeatable across
# kernel restarts; without it each run samples different rows, so the
# accuracies reported further down change from run to run.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [9]:
# One estimator per tree-based algorithm; only the random forest overrides a
# default (100 trees).
dtc, rfc, gbt = (
    DecisionTreeClassifier(),
    RandomForestClassifier(numTrees=100),
    GBTClassifier(),
)

In [10]:
# Fit every estimator on the same training split, in the order
# decision tree -> random forest -> gradient-boosted trees.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [11]:
# Run each fitted model over the held-out test split.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [12]:
# Preview the decision tree's predictions (first 20 rows, truncated cells).
dtc_preds.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [0.0,42.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[181,182,183...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[123,124,125...|   [0.0,42.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|   [0.0,42.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

Using an evaluator

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
# Evaluator configured to report accuracy.
acc_eval = MulticlassClassificationEvaluator().setMetricName('accuracy')

In [15]:
# Score the decision tree's test-set predictions; the bare last expression
# is rendered as the cell's output.
print("DTC ACCURACY:")
acc_eval.evaluate(dataset=dtc_preds)

DTC ACCURACY:


0.9583333333333334

In [16]:
# Score the random forest's test-set predictions; the bare last expression
# is rendered as the cell's output.
print("RFC ACCURACY:")
acc_eval.evaluate(dataset=rfc_preds)

RFC ACCURACY:


1.0

Realistically, accuracy won't be this high on real-world data; these scores come from a small sample dataset.

In [17]:
# Per-feature importance scores from the fitted random forest: the higher the
# value, the more important the feature at that index.
rfc_model.featureImportances

SparseVector(692, {98: 0.0004, 147: 0.0005, 151: 0.0012, 152: 0.0004, 154: 0.0004, 156: 0.0026, 177: 0.001, 184: 0.0022, 186: 0.0005, 188: 0.0002, 210: 0.0001, 211: 0.0005, 215: 0.0007, 216: 0.0072, 234: 0.002, 241: 0.0005, 243: 0.0015, 244: 0.001, 263: 0.0006, 268: 0.0009, 270: 0.0003, 271: 0.0077, 272: 0.0354, 274: 0.0061, 288: 0.0063, 291: 0.0075, 293: 0.0005, 300: 0.0216, 301: 0.0141, 316: 0.0005, 317: 0.0147, 323: 0.0006, 325: 0.0017, 328: 0.0023, 329: 0.0084, 330: 0.0074, 342: 0.0034, 343: 0.0006, 345: 0.0085, 348: 0.0003, 350: 0.0082, 351: 0.0023, 352: 0.0083, 354: 0.0031, 357: 0.0036, 358: 0.0057, 359: 0.0033, 360: 0.0036, 370: 0.0022, 373: 0.0067, 374: 0.0005, 376: 0.0007, 377: 0.0071, 378: 0.01, 379: 0.0426, 380: 0.0031, 385: 0.0092, 386: 0.0009, 387: 0.0013, 397: 0.0023, 399: 0.0091, 400: 0.0066, 401: 0.0074, 403: 0.0006, 404: 0.0004, 406: 0.0408, 407: 0.0324, 408: 0.0005, 411: 0.0006, 412: 0.0064, 428: 0.0077, 429: 0.0123, 433: 0.0375, 434: 0.0472, 435: 0.0152, 437: 0.0012,

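This same workflow (create the estimator, `fit` it on the training data, `transform` the test data, then score the predictions with an evaluator) works for pretty much any model in MLlib.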