In [1]:
import io

import numpy as np
import pandas as pd
from PIL import Image

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import split
from pyspark.sql.functions import col
from pyspark.sql.functions import element_at
from pyspark.sql.functions import udf
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from pyspark.sql.types import IntegerType

import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array

In [2]:
# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName('P8_OCR_VLE')
    .getOrCreate()
)

In [3]:
# # Tune a few configuration parameters (currently disabled)
# spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
# spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1024')

In [4]:
# Display the session object to check that it was created
spark

In [5]:
# Build the dataframe: read every file found recursively under the image
# directory with the 'binaryFile' data source
images_dir = 'P8_data_sample/Data'
df_pyspark = (
    spark.read
    .format('binaryFile')
    .option('recursiveFileLookup', 'true')
    .load(images_dir)
)

In [6]:
# Look at the columns and their types
df_pyspark

DataFrame[path: string, modificationTime: timestamp, length: bigint, content: binary]

In [7]:
# Schema of the dataframe
df_pyspark.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)



In [8]:
# First rows of the dataframe
df_pyspark.show(5)

+--------------------+--------------------+------+--------------------+
|                path|    modificationTime|length|             content|
+--------------------+--------------------+------+--------------------+
|file:/C:/Users/vi...|2022-07-21 15:02:...| 38050|[FF D8 FF E0 00 1...|
|file:/C:/Users/vi...|2022-07-21 15:02:...| 37757|[FF D8 FF E0 00 1...|
|file:/C:/Users/vi...|2022-07-21 15:03:...| 33241|[FF D8 FF E0 00 1...|
|file:/C:/Users/vi...|2022-07-21 15:03:...| 33175|[FF D8 FF E0 00 1...|
|file:/C:/Users/vi...|2022-07-21 15:01:...| 16421|[FF D8 FF E0 00 1...|
+--------------------+--------------------+------+--------------------+
only showing top 5 rows



In [9]:
# Add a label column and an encoded-label column.
# The label is the name of the directory containing the image, i.e. the
# second-to-last segment of the path. Addressing it from the end of the path
# (rather than at a fixed position) keeps the extraction valid wherever the
# data directory is located.
df_pyspark = df_pyspark.withColumn('label', element_at(split(col('path'), '/'), -2))

# Fit a StringIndexer on the labels
stringIndexer = StringIndexer(inputCol='label', outputCol='label_encoded')
sI = stringIndexer.fit(df_pyspark)

# Encode the labels and cast the indices to integers for readability
image_df = sI.transform(df_pyspark)
image_df = image_df.withColumn('label_encoded', col('label_encoded').cast(IntegerType()))

# Keep only the columns used afterwards
image_df = image_df.select('path', 'content', 'label', 'label_encoded')
image_df.show()

+--------------------+--------------------+------------------+-------------+
|                path|             content|             label|label_encoded|
+--------------------+--------------------+------------------+-------------+
|file:/C:/Users/vi...|[FF D8 FF E0 00 1...| apple_pink_lady_1|            1|
|file:/C:/Users/vi...|[FF D8 FF E0 00 1...| apple_pink_lady_1|            1|
|file:/C:/Users/vi...|[FF D8 FF E0 00 1...|apple_red_yellow_1|            2|
|file:/C:/Users/vi...|[FF D8 FF E0 00 1...|apple_red_yellow_1|            2|
|file:/C:/Users/vi...|[FF D8 FF E0 00 1...|           apple_6|            0|
|file:/C:/Users/vi...|[FF D8 FF E0 00 1...|           apple_6|            0|
+--------------------+--------------------+------------------+-------------+



## ResNet50

In [10]:
# ResNet50 with ImageNet weights and without its top layer; its weights are
# what gets broadcast to rebuild the model elsewhere
model = ResNet50(weights='imagenet', include_top=False)

In [11]:
# Broadcast the model weights through the Spark context; model_fn() reads
# them back from this broadcast variable
bc_model_weights = spark.sparkContext.broadcast(model.get_weights()) 

In [12]:
def model_fn():
    """
    Build a ResNet50 without its top layer and load the broadcast weights into it.

    :return: the ResNet50 model carrying the weights held by ``bc_model_weights``
    """
    # Start from an uninitialised architecture: the weights come from the broadcast
    network = ResNet50(include_top=False, weights=None)
    network.set_weights(bc_model_weights.value)
    return network

In [13]:
def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.

    :param content: raw bytes of an encoded image file
    :return: the image as an array, converted to RGB, resized to 224x224 and
             passed through the ResNet50 ``preprocess_input`` function
    """
    # Force three RGB channels: grayscale, palette, RGBA or CMYK files would
    # otherwise yield arrays with a different channel count, which cannot be
    # stacked with the others nor fed to the model.
    img = Image.open(io.BytesIO(content)).convert('RGB').resize((224, 224))
    arr = img_to_array(img)
    return preprocess_input(arr)

In [14]:
def featurize_series(model, content_series):
    """
    Compute image features for a pd.Series of raw images with the given model.

    :param model: model whose ``predict`` is applied to the preprocessed images
    :param content_series: pd.Series of raw image contents, each one preprocessed
                           with ``preprocess``
    :return: a pd.Series holding one flattened prediction per image
    """
    # Preprocess every image and stack the results into a single batch
    batch = np.stack(content_series.map(preprocess))
    predictions = model.predict(batch)
    # Flatten each prediction to one dimension
    flattened = [prediction.flatten() for prediction in predictions]
    return pd.Series(flattened)

In [15]:
@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    """
    Scalar Iterator pandas UDF that wraps the featurization function.
    The decorator declares the result as a Spark DataFrame column of type ArrayType(FloatType).

    :param content_series_iter: iterator over batches of data, each batch being a
                                pandas Series of image data.
    :return: yields, for each batch, the pandas Series produced by featurize_series.
    """
    # Build the model once, then reuse it for every batch of the iterator
    featurizer = model_fn()
    for batch in content_series_iter:
        yield featurize_series(featurizer, batch)



In [16]:
# Apply the featurization UDF to the image contents, keeping path and label
features_df = image_df.select(
    col('path'),
    col('label'),
    featurize_udf('content').alias('features'),
)

In [17]:
# Schema of the featurized dataframe
features_df.printSchema()

root
 |-- path: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [18]:
# Preview the extracted features
features_df.show()

+--------------------+------------------+--------------------+
|                path|             label|            features|
+--------------------+------------------+--------------------+
|file:/C:/Users/vi...| apple_pink_lady_1|[0.0, 0.0, 0.0, 0...|
|file:/C:/Users/vi...| apple_pink_lady_1|[0.0, 0.0, 0.0, 0...|
|file:/C:/Users/vi...|apple_red_yellow_1|[0.0, 0.0, 0.0, 0...|
|file:/C:/Users/vi...|apple_red_yellow_1|[0.0, 0.0, 0.0, 0...|
|file:/C:/Users/vi...|           apple_6|[0.0, 0.0, 0.0, 0...|
|file:/C:/Users/vi...|           apple_6|[0.0, 0.0, 0.0, 0...|
+--------------------+------------------+--------------------+



## Scaling des données

In [19]:
# Turn the array column into a dense ML vector column, keeping path and label
list_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())
features_df = features_df.select(
    col('path'),
    col('label'),
    list_to_vector_udf(col('features')).alias('features'),
)

In [20]:
# Standardize the feature vectors (mean centring and standard-deviation scaling
# both enabled), writing the result to 'feats_scaled'
standardizer = StandardScaler(
    inputCol='features',
    outputCol='feats_scaled',
    withMean=True,
    withStd=True,
)
std = standardizer.fit(features_df)
features_df_scaled = std.transform(features_df)

In [21]:
# Schema and preview of the dataframe with the scaled features
features_df_scaled.printSchema()
features_df_scaled.show() 

root
 |-- path: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- feats_scaled: vector (nullable = true)

+--------------------+------------------+--------------------+--------------------+
|                path|             label|            features|        feats_scaled|
+--------------------+------------------+--------------------+--------------------+
|file:/C:/Users/vi...| apple_pink_lady_1|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|file:/C:/Users/vi...| apple_pink_lady_1|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|file:/C:/Users/vi...|apple_red_yellow_1|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|file:/C:/Users/vi...|apple_red_yellow_1|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|file:/C:/Users/vi...|           apple_6|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|file:/C:/Users/vi...|           apple_6|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
+--------------------+------------------+--------------------+--------------------+

In [25]:
# Fetch only the first row of scaled features. first() avoids collecting every
# row (each carrying a full feature vector) on the driver just to keep one.
test = features_df_scaled.select('feats_scaled').first()

In [26]:
# Number of distinct values in the scaled feature vector of the sampled row
num_values = len(set(test[0]))
print(num_values)

18597


In [27]:
# Distinct values of the scaled feature vector of the sampled row
values = set(test[0])

In [28]:
# Largest of the distinct scaled values
print(max(values))

2.0412414523193156


## PCA

In [22]:
# Fit a PCA with k=8 on the scaled features, then add the projection as column 'pca'
pca = PCA(
    inputCol='feats_scaled',
    outputCol='pca',
    k=8,
)
modelpca = pca.fit(features_df_scaled)
transformed = modelpca.transform(features_df_scaled)

In [23]:
# Schema of the dataframe after the PCA transformation
transformed.printSchema()

root
 |-- path: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- feats_scaled: vector (nullable = true)
 |-- pca: vector (nullable = true)



In [24]:
# Explained variance of the fitted PCA model (fitted with k=8)
variance_explained = modelpca.explainedVariance
variance_explained

DenseVector([0.4092, 0.3185, 0.1235, 0.0825, 0.0663])

## Enregistrement des résultats

In [41]:
# Save the scaled features as Parquet, overwriting any previous export.
# The output location is relative, like the input image directory, rather than
# an absolute path tied to one user's machine.
features_dir = 'P8_data_sample/Features'
features_df_scaled.write.parquet(path=features_dir, mode='overwrite')