In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 41.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=dac7f16200e1e6bb75b15b0693e1683efc358cabcf7bbab385d752a3cbbe400c
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
from pyspark.sql import SparkSession

In [53]:
# Start (or reuse) a Spark session named "multiclass" on the local master.
session = (
    SparkSession.builder
    .appName("multiclass")
    .master("local")
    .getOrCreate()
)

# Read the CSV with its first row as the header, letting Spark infer the
# column types.
data = session.read.csv(
    "letter_recognition.csv",
    header=True,
    inferSchema=True,
)

In [54]:
# Print a tabular preview of the loaded DataFrame as a sanity check.
data.show()

+------+-----+-----+-----+----+-----+-----+-----+------+------+-----+-----+-----+-----+-----+-----+-----+
|letter|x_box|y_box|width|high|onpix|x_bar|y_bar|x2_bar|y2_bar|xybar|x2ybr|xy2br|x_ege|xegvy|y_ege|yegvx|
+------+-----+-----+-----+----+-----+-----+-----+------+------+-----+-----+-----+-----+-----+-----+-----+
|     T|    2|    8|    3|   5|    1|    8|   13|     0|     6|    6|   10|    8|    0|    8|    0|    8|
|     I|    5|   12|    3|   7|    2|   10|    5|     5|     4|   13|    3|    9|    2|    8|    4|   10|
|     D|    4|   11|    6|   8|    6|   10|    6|     2|     6|   10|    3|    7|    3|    7|    3|    9|
|     N|    7|   11|    6|   6|    3|    5|    9|     4|     6|    4|    4|   10|    6|   10|    2|    8|
|     G|    2|    1|    3|   1|    1|    8|    6|     6|     6|    6|    5|    9|    1|    7|    5|   10|
|     S|    4|   11|    5|   8|    3|    8|    8|     6|     9|    5|    6|    6|    0|    8|    9|    7|
|     B|    4|    2|    5|   4|    4|    8|   

In [55]:
# List the column names: the "letter" target plus the numeric feature columns.
data.columns

['letter',
 'x_box',
 'y_box',
 'width',
 'high',
 'onpix',
 'x_bar',
 'y_bar',
 'x2_bar',
 'y2_bar',
 'xybar',
 'x2ybr',
 'xy2br',
 'x_ege',
 'xegvy',
 'y_ege',
 'yegvx']

In [56]:
# Show the (column, type) pairs produced by schema inference.
data.dtypes

[('letter', 'string'),
 ('x_box', 'int'),
 ('y_box', 'int'),
 ('width', 'int'),
 ('high', 'int'),
 ('onpix', 'int'),
 ('x_bar', 'int'),
 ('y_bar', 'int'),
 ('x2_bar', 'int'),
 ('y2_bar', 'int'),
 ('xybar', 'int'),
 ('x2ybr', 'int'),
 ('xy2br', 'int'),
 ('x_ege', 'int'),
 ('xegvy', 'int'),
 ('y_ege', 'int'),
 ('yegvx', 'int')]

In [57]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Index the string target column "letter" into the column "newletter",
# which the classifiers below use as their label column.
str_obj = StringIndexer(
    inputCol="letter",
    outputCol="newletter",
)

In [58]:
# Feature columns to assemble: every column except the "letter" target.
feature_columns = [
    "x_box", "y_box", "width", "high",
    "onpix", "x_bar", "y_bar", "x2_bar",
    "y2_bar", "xybar", "x2ybr", "xy2br",
    "x_ege", "xegvy", "y_ege", "yegvx",
]

# Combine the feature columns into the single vector column "allfeatures".
vec_ass = VectorAssembler(
    inputCols=feature_columns,
    outputCol="allfeatures",
)


# Decision Tree

In [59]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision-tree estimator reading the assembled feature vector and the
# indexed label column.
tree = DecisionTreeClassifier(
    featuresCol="allfeatures",
    labelCol="newletter",
)

In [60]:
from pyspark.ml import Pipeline

# Chain label indexing -> feature assembly -> decision tree into one pipeline.
decision_tree_stages = [str_obj, vec_ass, tree]
mypipeline = Pipeline(stages=decision_tree_stages)

In [61]:
# Split the rows 70/30 into training and test sets. A fixed seed keeps the
# split, and so the metrics reported below, stable from run to run.
training, test = data.randomSplit([0.7, 0.3], seed=42)

In [62]:
# Fit the decision-tree pipeline (indexer, assembler, classifier) on the
# training split.
treemodel= mypipeline.fit(training)

In [69]:
# Apply the fitted pipeline to the test split, then display the first five
# rows without truncating the wide vector columns.
results = treemodel.transform(test)
results.show(n=5, truncate=False)

+------+-----+-----+-----+----+-----+-----+-----+------+------+-----+-----+-----+-----+-----+-----+-----+---------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|letter|x_box|y_box|width|high|onpix|x_bar|y_bar|x2_bar|y2_bar|xybar|x2ybr|xy2br|x_ege|xegvy|y_ege|yegvx|newletter|allfeatures                                                      |rawPrediction                                                                                              |probability                                                                                                                                               |prediction|
+------+-----+-----+-----+----+-----+-----+-----+------+------+-----+-----+-----+-----+-

In [79]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator for weighted precision over the "prediction" / "newletter" columns.
# NOTE: the name `eval` shadows Python's builtin; it is kept here because the
# following cell calls `eval.evaluate(results)`.
eval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="newletter",
    metricName="weightedPrecision",
)

In [80]:
# Compute the evaluator's configured metric on the decision-tree predictions.
eval.evaluate(results)

0.3933476055666778

In [82]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Weighted recall of the predictions held in `results`.
# Bound to `recall_evaluator` rather than `eval` so this cell does not rebind
# Python's builtin `eval`.
recall_evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="newletter",
    metricName="weightedRecall",
)
recall_evaluator.evaluate(results)

0.36976047904191617

In [72]:
# IPython help lookup (trailing `?`), a development aid.
# TODO(review): remove before sharing the finished notebook.
MulticlassClassificationEvaluator?

# Random Forest

In [99]:
from pyspark.ml.classification import RandomForestClassifier

# Random-forest estimator reading the assembled feature vector and the indexed
# label column. The forest is built with randomness, so a fixed seed is set to
# make the fitted model and its metrics repeatable.
Rtree = RandomForestClassifier(
    featuresCol="allfeatures",
    labelCol="newletter",
    seed=42,
)

In [100]:
from pyspark.ml import Pipeline

# Chain label indexing -> feature assembly -> random forest into one pipeline.
random_forest_stages = [str_obj, vec_ass, Rtree]
mypipeline1 = Pipeline(stages=random_forest_stages)

In [101]:
# Split the rows 70/30 into training and test sets for the random forest.
# A fixed seed keeps the split, and so the reported metrics, stable from run
# to run.
training, test = data.randomSplit([0.7, 0.3], seed=42)

In [102]:
# Fit the random-forest pipeline on the training split.
# NOTE: this rebinds `treemodel`, which earlier held the fitted decision-tree
# pipeline; from here on the name refers to the random-forest model.
treemodel= mypipeline1.fit(training)

In [103]:
# Apply the fitted random-forest pipeline to the test split, then display the
# first five rows without truncating the wide vector columns.
results1 = treemodel.transform(test)
results1.show(n=5, truncate=False)

+------+-----+-----+-----+----+-----+-----+-----+------+------+-----+-----+-----+-----+-----+-----+-----+---------+-----------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [104]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# F1 score of the random-forest predictions. The metric is spelled out rather
# than left to the evaluator's default, and the evaluator is bound to
# `f1_evaluator` so the cell does not rebind Python's builtin `eval`.
f1_evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="newletter",
    metricName="f1",
)
f1_evaluator.evaluate(results1)

0.6139712441938769

In [105]:
# Imported once up front instead of on every loop iteration.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Every metric name to report for the random-forest predictions.
evaluation = [
    "f1",
    "accuracy",
    "weightedPrecision",
    "weightedRecall",
    "weightedTruePositiveRate",
    "weightedFalsePositiveRate",
    "weightedFMeasure",
    "truePositiveRateByLabel",
    "falsePositiveRateByLabel",
    "precisionByLabel",
    "recallByLabel",
    "fMeasureByLabel",
    "logLoss",
    "hammingLoss",
]

# Print "<metric> : <value>" for each metric. The evaluator is bound to
# `metric_evaluator` so the loop does not rebind Python's builtin `eval`.
for metric in evaluation:
    metric_evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction",
        labelCol="newletter",
        metricName=metric,
    )
    print(metric, ":", metric_evaluator.evaluate(results1))

f1 : 0.6139712441938769
accuracy : 0.6282333051563821
weightedPrecision : 0.6643579612053594
weightedRecall : 0.628233305156382
weightedTruePositiveRate : 0.628233305156382
weightedFalsePositiveRate : 0.01491465054918591
weightedFMeasure : 0.6139712441938769
truePositiveRateByLabel : 0.8181818181818182
falsePositiveRateByLabel : 0.016999649491763056
precisionByLabel : 0.6380597014925373
recallByLabel : 0.8181818181818182
fMeasureByLabel : 0.7169811320754718
logLoss : 1.84494864011004
hammingLoss : 0.3717666948436179


In [94]:
# Evaluation metrics accepted by MulticlassClassificationEvaluator (metricName):
# (f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|logLoss|hammingLoss)