* Split data into train and test

In [1]:
inputDF=spark.createDataFrame([
    ("Positive",[0.0,1.1,0.1]), ("Negative",[0.1,1.2,0.2]), ("Positive",[0.0,1.3,0.3]), ("Negative",[0.3,1.4,0.4]), ("Positive",[0.0,1.4,0.5])
],["categoricalLabel","features"])
print(inputDF.count())

trainValidation,test = inputDF.randomSplit([0.75,0.25])
print(trainValidation.count())
print(test.count())
print(trainValidation.count()+test.count())

5
3
2
5


* **Classification**

All the classification algorithms available in Spark work only on numerical attributes

--Decision tree

In [6]:
data= spark.createDataFrame([
    (15000,1100,"Low"),
    (0,5000,"High"),
    (20000,800,"High"),
    (6000,1300,"Low"),
    (50000,2500,"Low"),
    (2000,1100,"Low"),
    (700,1500,"High"),
    (75000,0,"High"),
    (4000,500,"High")
],["Savings","Income","Risk"])
data.show()
testData= spark.createDataFrame([
    (100000,10000,"Low"),
    (100,100,"High"),
    (3100,900,"High"),
    (2000,1500,"High"),
    (3500,1200,"Low")
],["Savings","Income","Risk"])
testData.show()

+-------+------+----+
|Savings|Income|Risk|
+-------+------+----+
|  15000|  1100| Low|
|      0|  5000|High|
|  20000|   800|High|
|   6000|  1300| Low|
|  50000|  2500| Low|
|   2000|  1100| Low|
|    700|  1500|High|
|  75000|     0|High|
|   4000|   500|High|
+-------+------+----+

+-------+------+----+
|Savings|Income|Risk|
+-------+------+----+
| 100000| 10000| Low|
|    100|   100|High|
|   3100|   900|High|
|   2000|  1500|High|
|   3500|  1200| Low|
+-------+------+----+



In [4]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
indexer = StringIndexer(inputCol="Risk", outputCol="RiskIndex",
                        handleInvalid="keep")
indexerModel = indexer.fit(data)
indexedDF=indexerModel.transform(data)

va=VectorAssembler(inputCols=["Savings","Income"],
                    outputCol="features")
processedDF=va.transform(indexedDF)
processedDF.show()

+-------+------+----+---------+----------------+
|Savings|Income|Risk|RiskIndex|        features|
+-------+------+----+---------+----------------+
|  15000|  1100| Low|      1.0|[15000.0,1100.0]|
|      0|  5000|High|      0.0|    [0.0,5000.0]|
|  20000|   800|High|      0.0| [20000.0,800.0]|
|   6000|  1300| Low|      1.0| [6000.0,1300.0]|
|  50000|  2500| Low|      1.0|[50000.0,2500.0]|
|   2000|  1100| Low|      1.0| [2000.0,1100.0]|
|    700|  1500|High|      0.0|  [700.0,1500.0]|
|  75000|     0|High|      0.0|   [75000.0,0.0]|
|   4000|   500|High|      0.0|  [4000.0,500.0]|
+-------+------+----+---------+----------------+



In [5]:
from pyspark.ml.classification import DecisionTreeClassifier
# Train a DecisionTree model
dt = DecisionTreeClassifier(labelCol="RiskIndex",
                            featuresCol="features")

dtModel=dt.fit(processedDF)
finalDF=dtModel.transform(processedDF)
finalDF.show()

+-------+------+----+---------+----------------+-------------+-------------+----------+
|Savings|Income|Risk|RiskIndex|        features|rawPrediction|  probability|prediction|
+-------+------+----+---------+----------------+-------------+-------------+----------+
|  15000|  1100| Low|      1.0|[15000.0,1100.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|      0|  5000|High|      0.0|    [0.0,5000.0]|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|  20000|   800|High|      0.0| [20000.0,800.0]|[3.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|   6000|  1300| Low|      1.0| [6000.0,1300.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|  50000|  2500| Low|      1.0|[50000.0,2500.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|   2000|  1100| Low|      1.0| [2000.0,1100.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    700|  1500|High|      0.0|  [700.0,1500.0]|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|  75000|     0|High|      0.0|   [75000.0,0.0]|[3.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|   4000|   500|High|      0.0| 

In [16]:
processedTestDF=va.transform(testData)
finalTestDF=dtModel.transform(processedTestDF)
finalTestDF.show()

+-------+------+----+------------------+-------------+-------------+----------+
|Savings|Income|Risk|          features|rawPrediction|  probability|prediction|
+-------+------+----+------------------+-------------+-------------+----------+
| 100000| 10000| Low|[100000.0,10000.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    100|   100|High|     [100.0,100.0]|[3.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|   3100|   900|High|    [3100.0,900.0]|[3.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|   2000|  1500|High|   [2000.0,1500.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|   3500|  1200| Low|   [3500.0,1200.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
+-------+------+----+------------------+-------------+-------------+----------+



--Random Forest

In [10]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="RiskIndex",
                            featuresCol="features",numTrees=20)
rfModel=rf.fit(processedDF)
finalDF=rfModel.transform(processedDF)
finalDF.show()

+-------+------+----+---------+----------------+--------------+---------------+----------+
|Savings|Income|Risk|RiskIndex|        features| rawPrediction|    probability|prediction|
+-------+------+----+---------+----------------+--------------+---------------+----------+
|  15000|  1100| Low|      1.0|[15000.0,1100.0]|[2.0,18.0,0.0]|  [0.1,0.9,0.0]|       1.0|
|      0|  5000|High|      0.0|    [0.0,5000.0]|[18.0,2.0,0.0]|  [0.9,0.1,0.0]|       0.0|
|  20000|   800|High|      0.0| [20000.0,800.0]|[14.0,6.0,0.0]|  [0.7,0.3,0.0]|       0.0|
|   6000|  1300| Low|      1.0| [6000.0,1300.0]|[1.0,19.0,0.0]|[0.05,0.95,0.0]|       1.0|
|  50000|  2500| Low|      1.0|[50000.0,2500.0]|[2.0,18.0,0.0]|  [0.1,0.9,0.0]|       1.0|
|   2000|  1100| Low|      1.0| [2000.0,1100.0]|[2.0,18.0,0.0]|  [0.1,0.9,0.0]|       1.0|
|    700|  1500|High|      0.0|  [700.0,1500.0]|[13.0,7.0,0.0]|[0.65,0.35,0.0]|       0.0|
|  75000|     0|High|      0.0|   [75000.0,0.0]|[17.0,3.0,0.0]|[0.85,0.15,0.0]|       0.0|

In [11]:
processedTestDF=va.transform(testData)
finalTestDF=rfModel.transform(processedTestDF)
finalTestDF.show()

+-------+------+----+------------------+--------------+---------------+----------+
|Savings|Income|Risk|          features| rawPrediction|    probability|prediction|
+-------+------+----+------------------+--------------+---------------+----------+
| 100000| 10000| Low|[100000.0,10000.0]|[5.0,15.0,0.0]|[0.25,0.75,0.0]|       1.0|
|    100|   100|High|     [100.0,100.0]|[19.0,1.0,0.0]|[0.95,0.05,0.0]|       0.0|
|   3100|   900|High|    [3100.0,900.0]|[15.0,5.0,0.0]|[0.75,0.25,0.0]|       0.0|
|   2000|  1500|High|   [2000.0,1500.0]|[2.0,18.0,0.0]|  [0.1,0.9,0.0]|       1.0|
|   3500|  1200| Low|   [3500.0,1200.0]|[4.0,16.0,0.0]|  [0.2,0.8,0.0]|       1.0|
+-------+------+----+------------------+--------------+---------------+----------+



-- Neural Network

In [12]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
# specify layers for the neural network:
# input layer of size 2 (features), one intermediate of size 4
# and output of size 3 (classes -> Low, High, Other)
layers = [2, 4, 3]
# create the trainer and set its parameters
nn = MultilayerPerceptronClassifier(labelCol="RiskIndex",
    featuresCol="features",maxIter=200, layers=layers, blockSize=128,seed=1234)
# train the model
nnModel = nn.fit(processedDF)
resultDF = nnModel.transform(processedDF)
resultDF.show()


+-------+------+----+---------+----------------+--------------------+--------------------+----------+
|Savings|Income|Risk|RiskIndex|        features|       rawPrediction|         probability|prediction|
+-------+------+----+---------+----------------+--------------------+--------------------+----------+
|  15000|  1100| Low|      1.0|[15000.0,1100.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
|      0|  5000|High|      0.0|    [0.0,5000.0]|[20.3035949416111...|[0.99999999999142...|       0.0|
|  20000|   800|High|      0.0| [20000.0,800.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
|   6000|  1300| Low|      1.0| [6000.0,1300.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
|  50000|  2500| Low|      1.0|[50000.0,2500.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
|   2000|  1100| Low|      1.0| [2000.0,1100.0]|[3.80761564877737...|[8.57550325185138...|       1.0|
|    700|  1500|High|      0.0|  [700.0,1500.0]|[25.7779746236880...|[0.9999999999

In [19]:
testDataIndexed=indexerModel.transform(testData)
processedTestDF=va.transform(testDataIndexed)
finalTestDF=nnModel.transform(processedTestDF)
finalTestDF.show()

+-------+------+----+---------+------------------+--------------------+--------------------+----------+
|Savings|Income|Risk|RiskIndex|          features|       rawPrediction|         probability|prediction|
+-------+------+----+---------+------------------+--------------------+--------------------+----------+
| 100000| 10000| Low|      1.0|[100000.0,10000.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
|    100|   100|High|      0.0|     [100.0,100.0]|[3.80761565900288...|[8.57550343928538...|       1.0|
|   3100|   900|High|      0.0|    [3100.0,900.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
|   2000|  1500|High|      0.0|   [2000.0,1500.0]|[3.80761564877737...|[8.57550325185138...|       1.0|
|   3500|  1200| Low|      1.0|   [3500.0,1200.0]|[11.6679902396902...|[0.49999998466724...|       1.0|
+-------+------+----+---------+------------------+--------------------+--------------------+----------+



* Performance evalutaion

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
myEvaluator1 =MulticlassClassificationEvaluator(labelCol="RiskIndex",predictionCol="prediction",metricName='accuracy')
myEvaluator2 =MulticlassClassificationEvaluator(labelCol="RiskIndex",predictionCol="prediction",metricName='weightedPrecision')
print("Accuracy on training is ", myEvaluator1.evaluate(finalDF))
print("Weighted precision on training is ", myEvaluator2.evaluate(finalDF))
print("Accuracy on test is ", myEvaluator1.evaluate(finalTestDF))
print("Weighted precision on test is ", myEvaluator2.evaluate(finalTestDF))

Accuracy on training is  1.0
Weighted precision on training is  1.0
Accuracy on test is  0.4
Weighted precision on test is  0.16


In [21]:
#with RDD apis
from pyspark.mllib.evaluation import MulticlassMetrics
outRDD=finalTestDF.select("prediction","RiskIndex").rdd.map(lambda x: (float(x[0]),float(x[1])))
metrics=MulticlassMetrics(outRDD)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)
# Statistics by class
labels = outRDD.map(lambda lp: lp[1]).distinct().collect()
for label in sorted(labels):
 print("Class %s precision = %s" % (label, metrics.precision(label)))
 print("Class %s recall = %s" % (label, metrics.recall(label)))
 print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
# Weighted stats
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Summary Stats
Precision = 0.4
Recall = 0.4
F1 Score = 0.4
Class 0.0 precision = 0.0
Class 0.0 recall = 0.0
Class 0.0 F1 Measure = 0.0
Class 1.0 precision = 0.4
Class 1.0 recall = 1.0
Class 1.0 F1 Measure = 0.5714285714285715
Weighted recall = 0.4
Weighted precision = 0.16
Weighted F(1) Score = 0.2285714285714286
Weighted F(0.5) Score = 0.18181818181818182
Weighted false positive rate = 0.4


* Parameter Tuning

In [24]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="RiskIndex", featuresCol="features")
paramGrid = ParamGridBuilder()\
 .addGrid(dt.maxDepth, [1,2,10]) \
 .addGrid(dt.impurity, ["Gini","Entropy"])\
 .build()
myEvaluator =MulticlassClassificationEvaluator(labelCol="RiskIndex",predictionCol="prediction",metricName="accuracy")
cv=CrossValidator(estimator=dt,evaluator=myEvaluator,estimatorParamMaps=paramGrid, numFolds=3)
cvModel=cv.fit(processedDF)
finalDF=cvModel.transform(processedDF)
finalDF.show()

+-------+------+----+---------+----------------+-------------+-------------+----------+
|Savings|Income|Risk|RiskIndex|        features|rawPrediction|  probability|prediction|
+-------+------+----+---------+----------------+-------------+-------------+----------+
|  15000|  1100| Low|      1.0|[15000.0,1100.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|      0|  5000|High|      0.0|    [0.0,5000.0]|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|  20000|   800|High|      0.0| [20000.0,800.0]|[3.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|   6000|  1300| Low|      1.0| [6000.0,1300.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|  50000|  2500| Low|      1.0|[50000.0,2500.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|   2000|  1100| Low|      1.0| [2000.0,1100.0]|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
|    700|  1500|High|      0.0|  [700.0,1500.0]|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|  75000|     0|High|      0.0|   [75000.0,0.0]|[3.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|   4000|   500|High|      0.0| 

In [30]:
import numpy
cvModel.getEstimatorParamMaps()[numpy.argmax(cvModel.avgMetrics)]

{Param(parent='DecisionTreeClassifier_5f0f20325631', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
 Param(parent='DecisionTreeClassifier_5f0f20325631', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'Gini'}