In [46]:
from __future__ import print_function

# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
from pyspark.sql import SparkSession

In [47]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeClassificationExample")\
        .getOrCreate()

In [48]:
    # $example on$
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.load("Social_Network_Ads.csv",format="csv", sep=",", inferSchema="true", header="true")
    data.show()

+--------+------+---+---------------+-----+
| User ID|Gender|Age|EstimatedSalary|label|
+--------+------+---+---------------+-----+
|15624510|  Male| 19|          19000|    0|
|15810944|  Male| 35|          20000|    0|
|15668575|Female| 26|          43000|    0|
|15603246|Female| 27|          57000|    0|
|15804002|  Male| 19|          76000|    0|
|15728773|  Male| 27|          58000|    0|
|15598044|Female| 27|          84000|    0|
|15694829|Female| 32|         150000|    1|
|15600575|  Male| 25|          33000|    0|
|15727311|Female| 35|          65000|    0|
|15570769|Female| 26|          80000|    0|
|15606274|Female| 26|          52000|    0|
|15746139|  Male| 20|          86000|    0|
|15704987|  Male| 32|          18000|    0|
|15628972|  Male| 18|          82000|    0|
|15697686|  Male| 29|          80000|    0|
|15733883|  Male| 47|          25000|    1|
|15617482|  Male| 45|          26000|    1|
|15704583|  Male| 46|          28000|    1|
|15621083|Female| 48|          2

In [49]:
from pyspark.ml.feature import StringIndexer



indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex")
data = indexer.fit(data).transform(data)
data.show()

+--------+------+---+---------------+-----+-----------+
| User ID|Gender|Age|EstimatedSalary|label|GenderIndex|
+--------+------+---+---------------+-----+-----------+
|15624510|  Male| 19|          19000|    0|        1.0|
|15810944|  Male| 35|          20000|    0|        1.0|
|15668575|Female| 26|          43000|    0|        0.0|
|15603246|Female| 27|          57000|    0|        0.0|
|15804002|  Male| 19|          76000|    0|        1.0|
|15728773|  Male| 27|          58000|    0|        1.0|
|15598044|Female| 27|          84000|    0|        0.0|
|15694829|Female| 32|         150000|    1|        0.0|
|15600575|  Male| 25|          33000|    0|        1.0|
|15727311|Female| 35|          65000|    0|        0.0|
|15570769|Female| 26|          80000|    0|        0.0|
|15606274|Female| 26|          52000|    0|        0.0|
|15746139|  Male| 20|          86000|    0|        1.0|
|15704987|  Male| 32|          18000|    0|        1.0|
|15628972|  Male| 18|          82000|    0|     

In [50]:
from pyspark.ml.feature import OneHotEncoderEstimator
encoder = OneHotEncoderEstimator(inputCols=["GenderIndex"],
                                 outputCols=["GenderHC"])
model = encoder.fit(data)
data = model.transform(data)
data.show()

+--------+------+---+---------------+-----+-----------+-------------+
| User ID|Gender|Age|EstimatedSalary|label|GenderIndex|     GenderHC|
+--------+------+---+---------------+-----+-----------+-------------+
|15624510|  Male| 19|          19000|    0|        1.0|    (1,[],[])|
|15810944|  Male| 35|          20000|    0|        1.0|    (1,[],[])|
|15668575|Female| 26|          43000|    0|        0.0|(1,[0],[1.0])|
|15603246|Female| 27|          57000|    0|        0.0|(1,[0],[1.0])|
|15804002|  Male| 19|          76000|    0|        1.0|    (1,[],[])|
|15728773|  Male| 27|          58000|    0|        1.0|    (1,[],[])|
|15598044|Female| 27|          84000|    0|        0.0|(1,[0],[1.0])|
|15694829|Female| 32|         150000|    1|        0.0|(1,[0],[1.0])|
|15600575|  Male| 25|          33000|    0|        1.0|    (1,[],[])|
|15727311|Female| 35|          65000|    0|        0.0|(1,[0],[1.0])|
|15570769|Female| 26|          80000|    0|        0.0|(1,[0],[1.0])|
|15606274|Female| 26

In [51]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler



assembler = VectorAssembler(
    inputCols=["GenderHC", "Age", "EstimatedSalary"],
    outputCol="features")

data = assembler.transform(data)

data.select("features", "label").show(truncate=False)

+-------------------+-----+
|features           |label|
+-------------------+-----+
|[0.0,19.0,19000.0] |0    |
|[0.0,35.0,20000.0] |0    |
|[1.0,26.0,43000.0] |0    |
|[1.0,27.0,57000.0] |0    |
|[0.0,19.0,76000.0] |0    |
|[0.0,27.0,58000.0] |0    |
|[1.0,27.0,84000.0] |0    |
|[1.0,32.0,150000.0]|1    |
|[0.0,25.0,33000.0] |0    |
|[1.0,35.0,65000.0] |0    |
|[1.0,26.0,80000.0] |0    |
|[1.0,26.0,52000.0] |0    |
|[0.0,20.0,86000.0] |0    |
|[0.0,32.0,18000.0] |0    |
|[0.0,18.0,82000.0] |0    |
|[0.0,29.0,80000.0] |0    |
|[0.0,47.0,25000.0] |1    |
|[0.0,45.0,26000.0] |1    |
|[0.0,46.0,28000.0] |1    |
|[1.0,48.0,29000.0] |1    |
+-------------------+-----+
only showing top 20 rows



In [52]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
scaler = MinMaxScaler(inputCol="features", outputCol="featuresScaled")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(data)

# rescale each feature to range [min, max].
data = scalerModel.transform(data)
data.select( "featuresScaled","label").show()

+--------------------+-----+
|      featuresScaled|label|
+--------------------+-----+
|[0.0,0.0238095238...|    0|
|[0.0,0.4047619047...|    0|
|[1.0,0.1904761904...|    0|
|[1.0,0.2142857142...|    0|
|[0.0,0.0238095238...|    0|
|[0.0,0.2142857142...|    0|
|[1.0,0.2142857142...|    0|
|[1.0,0.3333333333...|    1|
|[0.0,0.1666666666...|    0|
|[1.0,0.4047619047...|    0|
|[1.0,0.1904761904...|    0|
|[1.0,0.1904761904...|    0|
|[0.0,0.0476190476...|    0|
|[0.0,0.3333333333...|    0|
|[0.0,0.0,0.496296...|    0|
|[0.0,0.2619047619...|    0|
|[0.0,0.6904761904...|    1|
|[0.0,0.6428571428...|    1|
|[0.0,0.6666666666...|    1|
|[1.0,0.7142857142...|    1|
+--------------------+-----+
only showing top 20 rows



In [16]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=["Gendere", "Age", "EstimatedSalary"],
    outputCol="features")
output = assembler.transform(data)
output.show()

IllegalArgumentException: 'Data type string of column Gender is not supported.'

In [13]:
    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: User ID, Gender, Age, EstimatedSalary, label'

In [None]:
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [None]:
    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

In [None]:
    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

In [2]:
    treeModel = model.stages[2]
    # summary only
    print(treeModel)
    # $example off$

    spark.stop()

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_12d2ffb98c1d) of depth 2 with 5 nodes
