In [45]:
# (1) Import the required Python dependencies
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [46]:
# (2) Instantiate a Spark Context
# SparkContext.getOrCreate() is used instead of calling the constructor directly so
# that this cell can be re-executed in a live kernel: constructing a second
# SparkContext raises "Cannot run multiple SparkContexts at once". On a fresh kernel
# a new context is created from `conf`; if a context is already active it is reused
# as-is (in that case the master / app name in `conf` are not re-applied).
conf = SparkConf().setMaster("spark://192.168.56.10:7077").setAppName("Multilayer Perceptron - OCR")
sc = SparkContext.getOrCreate(conf=conf)
# SQLContext wraps `sc` and is the entry point used below for reading DataFrames
sqlContext = SQLContext(sc)

In [47]:
# (3) Load the Letter Recognition Dataset (a CSV file with a header row; column types are inferred by Spark)
# (3.1) Create Feature Vectors from the 16 features
# (3.2) Rename the 'lettr' column to 'label' which is a number representing one of the 26 characters in the English alphabet

# Location of the dataset, held in one variable so it can be changed without editing the loading code
letter_recognition_csv_path = '/data/workspaces/jillur.quddus/jupyter/notebooks/Machine-Learning-with-Apache-Spark-QuickStart-Guide/chapter07/data/ocr-data/letter-recognition.csv'

# Use Spark's built-in CSV reader rather than the legacy 'com.databricks.spark.csv'
# format name, which dates from the external spark-csv package used with Spark 1.x
letter_recognition_df = sqlContext.read.csv(letter_recognition_csv_path, header=True, inferSchema=True)

# The 16 input columns that are assembled into a single 'features' vector column
feature_columns = ['x-box','y-box','width','high','onpix','x-bar','y-bar','x2bar','y2bar','xybar','x2ybr','xy2br','x-ege','xegvy','y-ege','yegvx']
vector_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'features')

# Keep only the two columns the classifier needs: the numeric label and the feature vector
vectorised_df = vector_assembler.transform(letter_recognition_df).withColumnRenamed('lettr', 'label').select('label', 'features')
vectorised_df.show(10, False)

+-----+----------------------------------------------------------------------+
|label|features                                                              |
+-----+----------------------------------------------------------------------+
|19   |[2.0,8.0,3.0,5.0,1.0,8.0,13.0,0.0,6.0,6.0,10.0,8.0,0.0,8.0,0.0,8.0]   |
|8    |[5.0,12.0,3.0,7.0,2.0,10.0,5.0,5.0,4.0,13.0,3.0,9.0,2.0,8.0,4.0,10.0] |
|3    |[4.0,11.0,6.0,8.0,6.0,10.0,6.0,2.0,6.0,10.0,3.0,7.0,3.0,7.0,3.0,9.0]  |
|13   |[7.0,11.0,6.0,6.0,3.0,5.0,9.0,4.0,6.0,4.0,4.0,10.0,6.0,10.0,2.0,8.0]  |
|6    |[2.0,1.0,3.0,1.0,1.0,8.0,6.0,6.0,6.0,6.0,5.0,9.0,1.0,7.0,5.0,10.0]    |
|18   |[4.0,11.0,5.0,8.0,3.0,8.0,8.0,6.0,9.0,5.0,6.0,6.0,0.0,8.0,9.0,7.0]    |
|1    |[4.0,2.0,5.0,4.0,4.0,8.0,7.0,6.0,6.0,7.0,6.0,6.0,2.0,8.0,7.0,10.0]    |
|0    |[1.0,1.0,3.0,2.0,1.0,8.0,2.0,2.0,2.0,8.0,2.0,8.0,1.0,6.0,2.0,7.0]     |
|9    |[2.0,2.0,4.0,4.0,2.0,10.0,6.0,2.0,6.0,12.0,4.0,8.0,1.0,6.0,1.0,7.0]   |
|12   |[11.0,15.0,13.0,9.0,7.0,13.0,2.0,6.0,2.0,12.0

In [48]:
# (4) Randomly split the vectorised DataFrame into a Training DataFrame and a Test DataFrame
# using weights of 0.75 and 0.25 respectively, with a fixed seed for the random sampling
split_weights = [0.75, 0.25]
train_df, test_df = vectorised_df.randomSplit(split_weights, seed=12345)
# Display the number of rows that ended up in each split
(train_df.count(), test_df.count())

(14927, 5073)

In [49]:
# (5) Describe the topology of the Neural Network as a list of layer sizes,
# ordered from the Input Layer through to the Output Layer
layers = [
    16,  # (5.1) Input Layer: one node per feature (we have 16 features)
    8,   # (5.2) first intermediate Hidden Layer
    4,   # (5.2) second intermediate Hidden Layer
    26,  # (5.3) Output Layer: one node per class (we have 26 classes)
]

In [50]:
# (6) Configure a Multilayer Perceptron Classifier with the layer sizes defined above
# (input layer first, output layer last) and fit it to the Training DataFrame
multilayer_perceptron_classifier = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)
multilayer_perceptron_classifier_model = multilayer_perceptron_classifier.fit(train_df)

In [51]:
# (7) Run the Test DataFrame through the Trained Multilayer Perceptron Classifier Model
# and display the predictions next to the actual labels
test_predictions_df = multilayer_perceptron_classifier_model.transform(test_df)
print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
columns_to_show = ["label", "features", "probability", "prediction"]
test_predictions_df.select(*columns_to_show).show()

TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: 
+-----+--------------------+--------------------+----------+
|label|            features|         probability|prediction|
+-----+--------------------+--------------------+----------+
|    0|[1.0,0.0,2.0,0.0,...|[0.62605849526384...|       0.0|
|    0|[1.0,0.0,2.0,0.0,...|[0.62875656935176...|       0.0|
|    0|[1.0,0.0,2.0,0.0,...|[0.62875656935176...|       0.0|
|    0|[1.0,0.0,2.0,0.0,...|[0.62836652229708...|       0.0|
|    0|[1.0,0.0,2.0,0.0,...|[0.62875739589563...|       0.0|
|    0|[1.0,0.0,2.0,0.0,...|[0.62875739589563...|       0.0|
|    0|[1.0,1.0,2.0,2.0,...|[0.61675544434183...|       0.0|
|    0|[1.0,3.0,2.0,1.0,...|[0.62709338540423...|       0.0|
|    0|[1.0,3.0,2.0,1.0,...|[0.62873649217115...|       0.0|
|    0|[1.0,3.0,2.0,2.0,...|[0.62874868432571...|       0.0|
|    0|[1.0,3.0,3.0,2.0,...|[0.62144842233237...|       0.0|
|    0|[2.0,0.0,3.0,1.0,...|[0.62875662456358...|       0.0|
|    0|[2.0,1.0,3.0,2.0,...|[0.626443

In [52]:
# (8) Score the Trained Multilayer Perceptron Classifier Model on the Test DataFrame
# using three metrics: accuracy, weighted precision and weighted recall
prediction_and_labels = test_predictions_df.select("prediction", "label")

# One evaluator per metric
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")

# Evaluate and report each metric in turn
for message, evaluator in [
    ("Accuracy on Test Dataset = %g", accuracy_evaluator),
    ("Precision on Test Dataset = %g", precision_evaluator),
    ("Recall on Test Dataset = %g", recall_evaluator),
]:
    print(message % evaluator.evaluate(prediction_and_labels))

Accuracy on Test Dataset = 0.339641
Precision on Test Dataset = 0.313333
Recall on Test Dataset = 0.339641


In [53]:
# (9) To improve the accuracy of our model, train a second classifier with larger
# Hidden Layers (16 and 12 nodes instead of 8 and 4). Note that maxIter is also
# raised from 100 to 400 here, so this run differs from the first model in both
# the layer sizes and the iteration limit.
new_layers = [16, 16, 12, 26]
new_multilayer_perceptron_classifier = MultilayerPerceptronClassifier(
    layers=new_layers,
    maxIter=400,
    blockSize=128,
    seed=1234,
)
new_multilayer_perceptron_classifier_model = new_multilayer_perceptron_classifier.fit(train_df)
new_test_predictions_df = new_multilayer_perceptron_classifier_model.transform(test_df)

# Reuse the accuracy evaluator from step (8) on the new predictions
new_accuracy = accuracy_evaluator.evaluate(new_test_predictions_df.select("prediction", "label"))
print("New Accuracy on Test Dataset = %g" % new_accuracy)

New Accuracy on Test Dataset = 0.71575


In [54]:
# (10) Stop the Spark Context
# This shuts down `sc`, the context created in step (2). It is done last because
# sqlContext was built on top of `sc` and the DataFrames above were read through it.
sc.stop()