In [2]:
from pyspark.sql import SQLContext

# Wrap the SparkContext `sc` in a SQLContext so DataFrames can be read below.
# NOTE(review): `sc` is not defined in this notebook; presumably it is
# supplied by the pyspark launcher that started the kernel - confirm.
sqlContext = SQLContext(sc)

In [3]:
# Data set source:
#   https://archive.ics.uci.edu/ml/dataset/adult
#
# The CSV data source used below ("com.databricks.spark.csv") is an external
# package, so the notebook was launched with it on the command line:
#   IPYTHON_OPTS='notebook --no-browser' /Users/Tianqi/spark-1.6.1-bin-hadoop2.6/bin/pyspark --packages com.databricks:spark-csv_2.10:1.4.0

# Load adult.csv: the first line is treated as the header and column types
# are inferred by the reader instead of defaulting to strings.
df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true")
    .load("adult.csv")
)

In [4]:
# List the column names of the loaded DataFrame (sanity check on the header).
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'maritial-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'category']

In [5]:
# Print a tabular preview of the first rows to eyeball the parsed values.
df.show()

+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+--------+
|age|       workclass|fnlwgt|   education|education-num|     maritial-status|       occupation| relationship|              race|   sex|capital-gain|capital-loss|hours-per-week|native-country|category|
+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+--------+
| 39|       State-gov| 77516|   Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|        2174|           0|            40| United-States|   <=50K|
| 50|Self-emp-not-inc| 83311|   Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|           0|           0|            13| United-States|   <=

In [6]:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import Bucketizer, VectorAssembler, StringIndexer, IndexToString
from pyspark.ml import Pipeline

In [7]:
# Feature step: pack the numeric columns "age" and "education-num" into a
# single vector column named "features".
assembler = VectorAssembler(
    inputCols=["age", "education-num"],
    outputCol="features",
)

# Label step: map the string column "category" to a numeric "category-index"
# column. The indexer is fitted right here on the full DataFrame `df`, and the
# resulting fitted model is what goes into the pipeline.
indexer = StringIndexer(
    inputCol="category",
    outputCol="category-index",
).fit(df)

# Model step: decision tree that learns "category-index" from "features".
dt = DecisionTreeClassifier(
    labelCol="category-index",
    featuresCol="features",
)

# Chain the three steps. The order matters: later cells pick stages out of
# the fitted model by position (stages[1] = indexer, stages[2] = tree).
pipeline = Pipeline(stages=[assembler, indexer, dt])

In [8]:
# Split the rows roughly 70/30 into a training set and a test set.
# A fixed seed keeps the split - and therefore every metric and table computed
# from it below - reproducible across kernel restarts and re-runs.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Fit the whole pipeline (assembler -> indexer -> decision tree) on the
# training split only.
model = pipeline.fit(trainingData)

# Apply the fitted pipeline back to the training split; used further down to
# inspect how well the model fits the data it was trained on.
training = model.transform(trainingData)

# Apply the fitted pipeline to the held-out test split; this is the DataFrame
# the evaluator scores.
predictions = model.transform(testData)

In [10]:
# Show the schema of the raw training split (the bare DataFrame repr lists
# column names and types); it has only the original input columns.
trainingData

DataFrame[age: int, workclass: string, fnlwgt: int, education: string, education-num: int, maritial-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, category: string]

In [12]:
# Show the schema after the pipeline ran on the training split: the original
# columns plus features, category-index, rawPrediction, probability and
# prediction.
training

DataFrame[age: int, workclass: string, fnlwgt: int, education: string, education-num: int, maritial-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, category: string, features: vector, category-index: double, rawPrediction: vector, probability: vector, prediction: double]

In [13]:
# Show the schema of the transformed test split; it carries the same added
# columns as `training`.
predictions

DataFrame[age: int, workclass: string, fnlwgt: int, education: string, education-num: int, maritial-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, category: string, features: vector, category-index: double, rawPrediction: vector, probability: vector, prediction: double]

In [19]:
# Preview 20 test rows side by side: the feature vector, the original label
# string, its numeric index, and the model's predicted index.
predictions.select('features', 'category', 'category-index', 'prediction').show(20)

+----------+--------+--------------+----------+
|  features|category|category-index|prediction|
+----------+--------+--------------+----------+
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,8.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|    

In [21]:
# Label strings held by the fitted StringIndexer (second pipeline stage).
# These are what the numeric category-index / prediction values stand for.
model.stages[1].labels

[u'<=50K', u'>50K']

In [22]:
# Map numeric predictions back to the original label strings. The label list
# is taken from the fitted StringIndexer stage (stage 1 of the pipeline).
labels = list(model.stages[1].labels)
inverter = IndexToString(inputCol="prediction", outputCol="prediction-label", labels=labels)

# Reuse `predictions` (already model.transform(testData) from an earlier cell)
# instead of transforming the test split a second time, so this preview is
# guaranteed to describe the same DataFrame the evaluator scores.
inverter.transform(predictions).select("prediction-label", "category").take(5)

[Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K')]

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test-set predictions: compare the predicted index with the true
# label index. The metric requested from the evaluator is "precision"; the
# result is stored as `accuracy` and the test error is reported as 1 - value.
evaluator = MulticlassClassificationEvaluator(
    labelCol="category-index",
    predictionCol="prediction",
    metricName="precision",
)
accuracy = evaluator.evaluate(predictions)

# Parenthesised single-argument print: same output under Python 2, and no
# longer a SyntaxError if the notebook is run under Python 3.
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.212084


In [30]:
# Display the raw metric value returned by the evaluator (full precision,
# unlike the %g-formatted test error printed above).
accuracy

0.7879155299585733

In [31]:
# Pull the fitted decision tree out of the pipeline model (third stage) and
# print its summary line (depth and node count).
treeModel = model.stages[2]

# Parenthesised print works identically on Python 2 and is valid on Python 3.
print(treeModel)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4211bf6b546b32a954fa) of depth 5 with 61 nodes


In [33]:
# Add a boolean "match" column to the training-set predictions: true where
# the predicted index equals the true label index.
pf = training.withColumn(
    'match',
    training['category-index'] == training['prediction'],
)

In [26]:
# Count training rows for every (true label index, predicted index) pair,
# i.e. the cells of the confusion matrix on the training split.
(
    training
    .groupBy(training['category-index'], training['prediction'])
    .count()
    .collect()
)

[Row(category-index=1.0, prediction=1.0, count=2099),
 Row(category-index=0.0, prediction=0.0, count=16005),
 Row(category-index=0.0, prediction=1.0, count=1349),
 Row(category-index=1.0, prediction=0.0, count=3404)]