In [1]:
import glob
import os
import sys

# SPARK_HOME must point at the root of a Spark installation; if the variable
# is not set, os.environ[...] raises KeyError right here.
spark_home = os.environ['SPARK_HOME']
spark_python = os.path.join(spark_home, 'python')

# Locate the bundled py4j source archive by pattern instead of hard-coding a
# single version number (previously py4j-0.8.2.1), so the cell keeps working
# when SPARK_HOME points at a release that bundles a different py4j.
py4j_pattern = os.path.join(spark_python, 'lib', 'py4j-*-src.zip')
py4j_archives = sorted(glob.glob(py4j_pattern))
if not py4j_archives:
    raise ImportError("no py4j source archive matches %s" % py4j_pattern)

# Same precedence as before: py4j is inserted last, so it ends up first on
# sys.path, followed by Spark's own python directory.
sys.path.insert(0, spark_python)
sys.path.insert(0, py4j_archives[-1])

# Initialize PySpark to predefine the SparkContext variable 'sc'
# (execfile is Python 2 only, matching the kernel this notebook was run on).
execfile(os.path.join(spark_python, 'pyspark', 'shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/

Using Python version 2.7.10 (default, May 28 2015 17:04:42)
SparkContext available as sc, HiveContext available as sqlContext.


In [2]:
import csv
import StringIO
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

def loadRecord(line):
    """Parse one CSV line of numeric fields into a LabeledPoint.

    The last field becomes the label; all preceding fields become the
    feature vector.

    Args:
        line: a single CSV-formatted text line (a trailing newline is fine).

    Returns:
        LabeledPoint(label, features) built from the parsed floats.

    Raises:
        ValueError: if the line contains no fields, or if any field cannot
            be converted to float.
    """
    # csv.reader accepts any iterable of lines, so a one-element list replaces
    # the StringIO wrapper (and avoids shadowing the builtin `input`).
    fields = next(csv.reader([line]), None)
    if not fields:
        # Fail loudly: the previous reader.next() raised StopIteration on an
        # empty line, which an enclosing iterator can mistake for a normal
        # end-of-data signal and silently stop early.
        raise ValueError("cannot parse empty CSV record: %r" % (line,))
    row = [float(value) for value in fields]
    return LabeledPoint(row[-1], row[:-1])


In [3]:
# Open the housing CSV and pull off its first line as the column header.
# The handle is intentionally left open: the next cell reads the remaining
# (data) lines from `chf`.
chf = open('data/CAhousing.csv', 'r')
header_line = next(chf)
header = header_line.rstrip("\n").split(",")

# Show each column name next to its positional index.
for i, j in enumerate(header):
    print("%d: %s" % (i, j))

0: longitude
1: latitude
2: housingMedianAge
3: totalRooms
4: totalBedrooms
5: population
6: households
7: medianIncome
8: medianHouseValue


In [4]:
# Materialise the remaining lines, then release the file handle that was left
# open after the header was read. list(chf) continues from the iterator's
# current position, so the already-consumed header line is not included.
data_lines = list(chf)
chf.close()

# Parse every line into a LabeledPoint and cache the RDD, since it is reused
# by the train/test split and the models trained from it.
chrdd = sc.parallelize(data_lines).map(loadRecord)
chrdd.persist()
chrdd.first()

LabeledPoint(452600.0, [-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252])

In [5]:
# 70/30 train/test split. A fixed seed keeps the split repeatable when the
# notebook is re-run from a fresh kernel on the same data.
(trainingData, testData) = chrdd.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Fit a regression tree on the training split.
#  An empty categoricalFeaturesInfo dict marks every feature as continuous.
model = DecisionTree.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},
    impurity='variance',
    minInstancesPerNode=2500,
)


def squaredError(pair):
    """Return (label - prediction) squared for one (label, prediction) pair."""
    label, prediction = pair
    residual = label - prediction
    return residual * residual


# Score the held-out split: pair each true label with the model's prediction,
# sum the squared differences, and divide by the number of test rows.
testFeatures = testData.map(lambda lp: lp.features)
testLabels = testData.map(lambda lp: lp.label)
predictions = model.predict(testFeatures)
labelsAndPredictions = testLabels.zip(predictions)
squaredErrorSum = labelsAndPredictions.map(squaredError).sum()
testMSE = squaredErrorSum / float(testData.count())

print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

Test Mean Squared Error = 8316003149.58
Learned regression tree model:
DecisionTreeModel regressor of depth 3 with 7 nodes
  If (feature 7 <= 4.5352)
   If (feature 7 <= 2.9083)
    Predict: 132271.54344714782
   Else (feature 7 > 2.9083)
    If (feature 2 <= 30.0)
     Predict: 181481.92617449665
    Else (feature 2 > 30.0)
     Predict: 216676.41906958862
  Else (feature 7 > 4.5352)
   Predict: 308237.43839960964



Each tree will be slightly different from run to run because, for continuous inputs, Spark picks its candidate split thresholds from 32 quantiles computed on a random subsample of the data.

In [16]:
# List every attribute and method name available on the fitted tree model.
dir(model)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_java_loader_class',
 '_java_model',
 '_load_java',
 '_sc',
 'call',
 'depth',
 'load',
 'numNodes',
 'predict',
 'save',
 'toDebugString']

In [20]:
# NOTE(review): there are no call parentheses here, so this evaluates to the
# bound method object itself; model.toDebugString() is the form that returns
# the textual tree description.
model.toDebugString

<bound method DecisionTreeModel.toDebugString of DecisionTreeModel regressor of depth 3 with 7 nodes>

In [29]:
from pyspark.mllib.tree import RandomForest, RandomForestModel

# Train a 20-tree regression forest with each tree limited to depth 3.
# featureSubsetStrategy="all" lets every split consider all features.
# A fixed seed keeps the forest's random choices repeatable for the same input.
forest = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                     numTrees=20, featureSubsetStrategy="all",
                                     impurity='variance', maxDepth=3, seed=42)


In [32]:
# Pull the individual trees out of the forest through its private
# `_java_model` handle. NOTE(review): this is a non-public attribute, so it
# may not be stable across Spark versions — confirm before upgrading.
trees = forest._java_model.trees()

In [42]:
# Print the textual description of the tree at index 1 (the second tree).
print(trees[1].toDebugString())

DecisionTreeModel regressor of depth 3 with 15 nodes
  If (feature 7 <= 4.5478)
   If (feature 7 <= 3.1389)
    If (feature 1 <= 34.43)
     Predict: 157483.16444764304
    Else (feature 1 > 34.43)
     Predict: 116794.82935841451
   Else (feature 7 > 3.1389)
    If (feature 2 <= 37.0)
     Predict: 188058.36222732492
    Else (feature 2 > 37.0)
     Predict: 249606.74832535884
  Else (feature 7 > 4.5478)
   If (feature 7 <= 6.2083)
    If (feature 2 <= 26.0)
     Predict: 241303.35923653716
    Else (feature 2 > 26.0)
     Predict: 295064.62611516623
   Else (feature 7 > 6.2083)
    If (feature 2 <= 25.0)
     Predict: 357900.286908078
    Else (feature 2 > 25.0)
     Predict: 439401.8522895126



In [41]:
# Write each tree's textual description to data/tree<i>.txt. Looping over
# len(trees) instead of a literal 20 keeps the loop in step with however many
# trees the forest actually holds.
for i in range(len(trees)):
    with open("data/tree%d.txt" % i, "w") as f:
        f.write(trees[i].toDebugString())