In [1]:
spark = SparkSession.builder \
   .master("local") \
   .appName("ImageClassification") \
   .config("spark.executor.memory", "6gb") \
   .getOrCreate()

In [2]:
import pyspark.sql.functions as f
import sparkdl as dl

Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
dfMessi = dl.readImages('football/messi/').withColumn('label', f.lit(0))
dfRonaldo = dl.readImages('football/ronaldo/').withColumn('label', f.lit(1))

In [4]:
dfMessi.show(n=10,truncate=False)

+---------------------------------------------------------------------+---------------------------+-----+
|filePath                                                             |image                      |label|
+---------------------------------------------------------------------+---------------------------+-----+
|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi3.jpeg |[RGB,173,292,3,[B@43647d0f]|0    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi14.jpeg|[RGB,187,270,3,[B@28fe803] |0    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi29.jpeg|[RGB,194,259,3,[B@669635ee]|0    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi18.jpeg|[RGB,194,259,3,[B@6e004f55]|0    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi8.jpeg |[RGB,168,300,3,[B@eecdb9f] |0    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi22.jpeg|[RGB,194,259,3,[B@73def5b1]|0    |
|file:/home/asherif844/sparkNotebooks/Ch14/foo

In [5]:
dfRonaldo.show(n=10,truncate=False)

+-------------------------------------------------------------------------+---------------------------+-----+
|filePath                                                                 |image                      |label|
+-------------------------------------------------------------------------+---------------------------+-----+
|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo24.jpg |[RGB,350,590,3,[B@7b3b3c6] |1    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo2.jpeg |[RGB,225,225,3,[B@61826869]|1    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo21.jpeg|[RGB,193,261,3,[B@1d739c7f]|1    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo17.jpeg|[RGB,183,275,3,[B@59b36a5b]|1    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo30.jpeg|[RGB,184,273,3,[B@4304cf28]|1    |
|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo14.jpeg|[RGB,154,328,3,[B@31b73601]|1    |
|file:/hom

In [6]:
trainDFmessi, testDFmessi = dfMessi.randomSplit([66.7, 33.3], seed =12)
trainDFronaldo, testDFronaldo = dfRonaldo.randomSplit([66.7, 33.3], seed=12)

In [7]:
print('The number of images in trainDFmessi is {}'.format(trainDFmessi.toPandas().shape[0]))
print('The number of images in testDFmessi is {}'.format(testDFmessi.toPandas().shape[0]))
print('The number of images in trainDFronaldo is {}'.format(trainDFronaldo.toPandas().shape[0]))
print('The number of images in testDFronaldo is {}'.format(testDFronaldo.toPandas().shape[0]))

The number of images in trainDFmessi is 18
The number of images in testDFmessi is 12
The number of images in trainDFronaldo is 18
The number of images in testDFronaldo is 12


In [8]:
trainDF = trainDFmessi.unionAll(trainDFronaldo)
testDF = testDFmessi.unionAll(testDFronaldo)

In [9]:
print('The number of images in the training data is {}' .format(trainDF.toPandas().shape[0]))
print('The number of images in the testing  data is {}' .format(testDF.toPandas().shape[0]))

The number of images in the training data is 36
The number of images in the testing  data is 24


In [10]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')
logreg = LogisticRegression(maxIter=30,labelCol = "label", featuresCol="features")
pipeline = Pipeline(stages=[vectorizer, logreg])

pipeline_model = pipeline.fit(trainDF)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.


In [11]:
predictDF = pipeline_model.transform(testDF)
predictDF.select('label', 'prediction').show(n = testDF.toPandas().shape[0], truncate=False)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.
+-----+----------+
|label|prediction|
+-----+----------+
|0    |1.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |1.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |0.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
+-----+----------+



In [12]:
predictDF.crosstab('prediction', 'label').show()

+----------------+---+---+
|prediction_label|  0|  1|
+----------------+---+---+
|             1.0|  2| 11|
|             0.0| 10|  1|
+----------------+---+---+



In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
scoring = predictDF.select("prediction", "label")
accuracy_score = MulticlassClassificationEvaluator(metricName="accuracy")
rate = accuracy_score.evaluate(scoring)*100
print("accuracy: {}%" .format(round(rate,2)))

accuracy: 87.5%


In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

binaryevaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
binary_rate = binaryevaluator.evaluate(predictDF)*100
print("accuracy: {}%" .format(round(binary_rate,2)))

accuracy: 87.5%


In [15]:
logregFT = LogisticRegression(
    regParam=0.05, 
    elasticNetParam=0.3,
    maxIter=15,labelCol = "label", featuresCol="features")
pipelineFT = Pipeline(stages=[vectorizer, logregFT])

pipeline_model_FT = pipelineFT.fit(trainDF)

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.


In [16]:
predictDF_FT = pipeline_model_FT.transform(testDF)
predictDF_FT.crosstab('prediction', 'label').show()

INFO:tensorflow:Froze 376 variables.
Converted 376 variables to const ops.
INFO:tensorflow:Froze 0 variables.
Converted 0 variables to const ops.
+----------------+---+---+
|prediction_label|  0|  1|
+----------------+---+---+
|             1.0|  0| 11|
|             0.0| 12|  1|
+----------------+---+---+



In [17]:
binary_rate_FT = binaryevaluator.evaluate(predictDF_FT)*100
print("accuracy: {}%" .format(round(binary_rate_FT,2)))

accuracy: 95.83%
