In [14]:
from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
from pyspark.sql import SparkSession

In [15]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeClassificationExample")\
        .getOrCreate()

In [16]:
    # $example on$
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.load("Social_Network_Ads.csv",format="csv", sep=",", inferSchema="true", header="true")
    data.show()

+--------+------+---+---------------+---------+
| User ID|Gender|Age|EstimatedSalary|Purchased|
+--------+------+---+---------------+---------+
|15624510|  Male| 19|          19000|        0|
|15810944|  Male| 35|          20000|        0|
|15668575|Female| 26|          43000|        0|
|15603246|Female| 27|          57000|        0|
|15804002|  Male| 19|          76000|        0|
|15728773|  Male| 27|          58000|        0|
|15598044|Female| 27|          84000|        0|
|15694829|Female| 32|         150000|        1|
|15600575|  Male| 25|          33000|        0|
|15727311|Female| 35|          65000|        0|
|15570769|Female| 26|          80000|        0|
|15606274|Female| 26|          52000|        0|
|15746139|  Male| 20|          86000|        0|
|15704987|  Male| 32|          18000|        0|
|15628972|  Male| 18|          82000|        0|
|15697686|  Male| 29|          80000|        0|
|15733883|  Male| 47|          25000|        1|
|15617482|  Male| 45|          26000|   

In [17]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex")
data = indexer.fit(data).transform(data)
data = data.select("Age","EstimatedSalary","GenderIndex","Purchased")
data.show()

+---+---------------+-----------+---------+
|Age|EstimatedSalary|GenderIndex|Purchased|
+---+---------------+-----------+---------+
| 19|          19000|        1.0|        0|
| 35|          20000|        1.0|        0|
| 26|          43000|        0.0|        0|
| 27|          57000|        0.0|        0|
| 19|          76000|        1.0|        0|
| 27|          58000|        1.0|        0|
| 27|          84000|        0.0|        0|
| 32|         150000|        0.0|        1|
| 25|          33000|        1.0|        0|
| 35|          65000|        0.0|        0|
| 26|          80000|        0.0|        0|
| 26|          52000|        0.0|        0|
| 20|          86000|        1.0|        0|
| 32|          18000|        1.0|        0|
| 18|          82000|        1.0|        0|
| 29|          80000|        1.0|        0|
| 47|          25000|        1.0|        1|
| 45|          26000|        1.0|        1|
| 46|          28000|        1.0|        1|
| 48|          29000|        0.0

In [18]:
from pyspark.ml.feature import OneHotEncoderEstimator
encoder = OneHotEncoderEstimator(inputCols=["GenderIndex"],
                                 outputCols=["Gender"])
model = encoder.fit(data)
data = model.transform(data)
data = data.select("Age","EstimatedSalary","Gender","Purchased")

data.show()

+---+---------------+-------------+---------+
|Age|EstimatedSalary|       Gender|Purchased|
+---+---------------+-------------+---------+
| 19|          19000|    (1,[],[])|        0|
| 35|          20000|    (1,[],[])|        0|
| 26|          43000|(1,[0],[1.0])|        0|
| 27|          57000|(1,[0],[1.0])|        0|
| 19|          76000|    (1,[],[])|        0|
| 27|          58000|    (1,[],[])|        0|
| 27|          84000|(1,[0],[1.0])|        0|
| 32|         150000|(1,[0],[1.0])|        1|
| 25|          33000|    (1,[],[])|        0|
| 35|          65000|(1,[0],[1.0])|        0|
| 26|          80000|(1,[0],[1.0])|        0|
| 26|          52000|(1,[0],[1.0])|        0|
| 20|          86000|    (1,[],[])|        0|
| 32|          18000|    (1,[],[])|        0|
| 18|          82000|    (1,[],[])|        0|
| 29|          80000|    (1,[],[])|        0|
| 47|          25000|    (1,[],[])|        1|
| 45|          26000|    (1,[],[])|        1|
| 46|          28000|    (1,[],[])

In [19]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[ "Age", "EstimatedSalary","Gender"],
    outputCol="features")

data = assembler.transform(data)
data=data.select("features", "Purchased")


In [20]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
scaler = MinMaxScaler(inputCol="features", outputCol="featuresScaled")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(data)

# rescale each feature to range [min, max].
data = scalerModel.transform(data)
data = data.selectExpr("featuresScaled as features", "Purchased as label")
data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.02380952380952...|    0|
|[0.40476190476190...|    0|
|[0.19047619047619...|    0|
|[0.21428571428571...|    0|
|[0.02380952380952...|    0|
|[0.21428571428571...|    0|
|[0.21428571428571...|    0|
|[0.33333333333333...|    1|
|[0.16666666666666...|    0|
|[0.40476190476190...|    0|
|[0.19047619047619...|    0|
|[0.19047619047619...|    0|
|[0.04761904761904...|    0|
|[0.33333333333333...|    0|
|[0.0,0.4962962962...|    0|
|[0.26190476190476...|    0|
|[0.69047619047619...|    1|
|[0.64285714285714...|    1|
|[0.66666666666666...|    1|
|[0.71428571428571...|    1|
+--------------------+-----+
only showing top 20 rows



In [21]:
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [22]:
    # Train model.  This also runs the indexers.
    model = dt.fit(trainingData)

In [23]:
    # Make predictions.
    predictions = model.transform(testData)
    

In [24]:
    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)
    
    

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|[0.0,0.4962962962...|
|       0.0|    0|[0.02380952380952...|
|       0.0|    0|[0.02380952380952...|
|       0.0|    0|[0.04761904761904...|
|       0.0|    0|[0.04761904761904...|
+----------+-----+--------------------+
only showing top 5 rows



In [25]:
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.0888889 


In [13]:
    treeModel = model.stages[2]
    # summary only
    print(treeModel)
    # $example off$

    spark.stop()

AttributeError: 'DecisionTreeClassificationModel' object has no attribute 'stages'