In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("SparkMLLib_DecisionTree").getOrCreate()

24/11/15 13:28:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
from __future__ import print_function

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load the data stored in LIBSVM format as a DataFrame.
data = spark.read.format("libsvm").load("./sample_libsvm_data.txt")

data.show()

data.count()

24/11/15 13:34:59 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



100

In [3]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

                                                                                

### StringIndexer de chuyen doi cac gia tri chuoi trong cot "label" thanh cac gia tri so. "indexedLabel" la ten cot sau khi chuyen doi. Phuong thuc fit se huan luyen StringIndexer dua tren DataFrame "data"
### VectorIndexer la cong cu giup xu ly cac features vector khi trong vector do co gia tri phan loai. Vi du tren cot "features" se duoc xu ly va cac gia tri se duoc chuyen doi sang cot moi la "indexedFeatures". maxCategories la tham so gioi han gia tri phan loai ma VectorIndexer se coi la phan loai. Phuong thuc fit se huan luyen VectorIndexer dua tren DataFrame "data"

In [None]:
data.show()

data.count()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



100

In [4]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

                                                                                

### Chia data mau thanh 2 mau train va test voi ti le 70:30
### Bien dt de tao 1 bien cua Cay quyet dinh voi cot label la "IndexedLabel" va cot features la "indexedFeatures"
### pipeline giup to chuc quy trinh huan luyen mo hinh theo 1 cach tuan tu. Cac buoc trong pipeline: ma hoa label -> ma hoa features -> huan luyen theo mo hinh Cay quyet dinh

In [5]:
# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show()

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))


+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         1.0|(692,[100,101,102...|
|       1.0|         1.0|(692,[123,124,125...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[127,128,129...|
|       1.0|         1.0|(692,[127,128,129...|
|       1.0|         1.0|(692,[129,130,131...|
|       1.0|         1.0|(692,[152,153,154...|
|       0.0|         1.0|(692,[152,153,154...|
|       1.0|         1.0|(692,[153,154,155...|
|       0.0|         1.0|(692,[154,155,156...|
|       1.0|         1.0|(692,[181,182,183...|
|       1.0|         1.0|(692,[234,235,237...|
|       0.0|         0.0|(692,[119,120,121...|
|       0.0|         0.0|(692,[123,124,125...|
|       0.0|         0.0|(692,[123,124,125...|
|       0.0| 

                                                                                

### predictions la de dua ra du doan cua mo hinh dua tren test data, sau do show ra cac cot du doan, label va feature
### MulticlassClassificationEvaluator la cong cu dung de danh gia mo hinh phan loai da lop trong Spark. Phuong thuc evaluate() tinh toan chi so duoc chi dinh - o day la accuracy la do chinh xac. Va o duoi la sai so cua bo test