# 0 : Paths definition

We define 3 paths:
- The execution path
- The data path
- The path to the initial results after feature extraction and PCA. The number of components k (138) was determined beforehand with scikit-learn's PCA on a smaller sample, as the k giving 95% explained variance.


In [9]:
# Root of the project's S3 bucket; the other locations are derived from it.
PATH = "s3://ds-p8"
PATH_Data = f"{PATH}/data"
PATH_Result = f"{PATH}/Results"

# Value passed as `k` to the PCA step further down.
k_components = 138

# Echo the resolved locations, one per line.
print(
    f"PATH = {PATH}",
    f"PATH DATA = {PATH_Data}",
    f"PATH Result = {PATH_Result}",
    sep="\n",
)


PATH = s3://ds-p8
PATH DATA = s3://ds-p8/data
PATH Result = s3://ds-p8/Results


In [10]:
import io
from typing import Iterator

import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from pyspark.ml.feature import PCA as spa_PCA
from pyspark.ml.linalg import Vectors, VectorUDT, SparseVector, DenseVector
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf, element_at, split
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
from tensorflow.keras import Model
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array



In [11]:
# Create (or reuse) the Spark session named "ds_p8" for this notebook.
spark = (
    SparkSession.builder
    .appName("ds_p8")
    # Spark option asking for the legacy Parquet write format.
    # NOTE(review): presumably needed so the array columns written at the end
    # can be read back with pandas/pyarrow -- confirm.
    .config("spark.sql.parquet.writeLegacyFormat", "true")
    .getOrCreate()
)

# SparkContext handle; used further down to broadcast the model weights.
sc = spark.sparkContext


In [None]:
# Load the image files found under PATH_Data with Spark's "binaryFile" source.
# Later cells read the `path` and `content` columns of this DataFrame.
images = (
    spark.read.format("binaryFile")
    # Only pick up files whose name matches *.jpg ...
    .option("pathGlobFilter", "*.jpg")
    # ... and look for them in sub-folders as well.
    .option("recursiveFileLookup", "true")
    .load(PATH_Data)
)


<u>Keeping the image paths and labelling each image with the name of its containing folder (second-to-last element of the path split on "/")</u>:

In [None]:
# Label each image with the name of its parent folder: the second-to-last
# element of the path once it is split on "/".
images = images.withColumn("label", element_at(split(images["path"], "/"), -2))

# printSchema() and show() write to stdout themselves and return None, so they
# are called directly; wrapping them in print() only adds a stray "None" line.
images.printSchema()

# First 5 rows, without truncating the (long) path strings.
images.select("path", "label").show(5, False)


# 1 : Model Setup :
- Using MobileNetV2

In [12]:
# MobileNetV2 with ImageNet weights, classification head included
# (include_top=True), declared for 224x224x3 inputs.
model = MobileNetV2(
    weights="imagenet",
    include_top=True,
    input_shape=(224, 224, 3)
    )


In [13]:
# Feature extractor: same input as `model`, but the output is taken from the
# second-to-last layer, so the final layer of `model` is left out.
new_model = Model(
    inputs=model.input,
    outputs=model.layers[-2].output
    )


In [14]:
# Broadcast the feature extractor's weights so that model_fn() can load them
# through `brodcast_weights.value`.
# NOTE(review): the variable name is misspelled ("brodcast"); it is kept as is
# because model_fn() refers to it under this name.

brodcast_weights = sc.broadcast(new_model.get_weights())


In [15]:
# Print the layer-by-layer description of the feature extractor.
new_model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_2[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                           

In [16]:
def model_fn():
    """
    Build the MobileNetV2 feature extractor and load the broadcast weights.

    The full MobileNetV2 architecture is instantiated, its final layer is
    dropped by taking the output of the second-to-last layer, and the weights
    published through ``brodcast_weights`` are loaded into the truncated model.

    :return: a Keras model taking (224, 224, 3) inputs and returning the
             output of MobileNetV2's second-to-last layer.
    """
    # weights=None: every weight of the truncated model is overwritten by
    # set_weights() below, so loading the ImageNet weights here as well would
    # only make each caller fetch them for nothing.
    model = MobileNetV2(
        weights=None,
        include_top=True,
        input_shape=(224, 224, 3)
        )

    # The network is only used for inference; freeze every layer.
    for layer in model.layers:
        layer.trainable = False

    # Same truncation as the `new_model` the weights were taken from, so the
    # weight list lines up with this model's layers.
    new_model = tf.keras.Model(
        inputs=model.input,
        outputs=model.layers[-2].output
        )

    new_model.set_weights(brodcast_weights.value)
    return new_model


def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.

    :param content: raw bytes of an encoded image file.
    :return: the result of MobileNetV2's ``preprocess_input`` applied to the
             image once decoded, converted to RGB and resized to 224x224.
    """
    # Grayscale, palette or CMYK files do not decode to three channels; force
    # RGB so the array always matches the model's (224, 224, 3) input.
    img = Image.open(io.BytesIO(content)).convert("RGB").resize((224, 224))
    arr = img_to_array(img)
    return preprocess_input(arr)


def featurize_series(model, content_series):
    """
    Run the given model over a pd.Series of raw image bytes.

    :param model: object exposing ``predict`` (the feature extractor).
    :param content_series: pd.Series whose elements are raw image contents.
    :return: a pd.Series holding one flattened feature vector per image.
    """
    # Preprocess each image, then stack them into one batch for the model.
    batch = np.stack(content_series.map(preprocess))
    predictions = model.predict(batch)
    # A layer may emit multi-dimensional tensors; each one is flattened to a
    # 1-D vector so it can be stored in a Spark DataFrame column.
    return pd.Series([features.flatten() for features in predictions])


@pandas_udf(ArrayType(FloatType()))
def featurize_udf(content_series_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).

    The ``Iterator[pd.Series] -> Iterator[pd.Series]`` type hints are required:
    they are what makes ``pandas_udf`` register the iterator variant. Without
    them (and without an explicit function type) the UDF is registered as a
    plain scalar UDF, which is expected to take and return a pd.Series rather
    than consume an iterator and yield results.

    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
    :return: an iterator yielding, for each input batch, a pandas Series of feature vectors.
    """
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.

    model = model_fn()

    for content_series in content_series_iter:
        yield featurize_series(model, content_series)



# 2 : Feature extraction :

In [None]:
# Spread the images over 24 partitions, then run the featurization UDF on the
# raw file bytes; only the path, the label and the features are kept.
features_df = (
    images
    .repartition(24)
    .select("path", "label", featurize_udf("content").alias("features"))
)


In [None]:
# Display the first 10 rows of the featurized DataFrame.
features_df.show(n=10)


# 3 : PCA over extracted feature
- using PySpark's PCA, since scikit-learn's implementation doesn't support distributed computing
- the k_components value was determined beforehand with a local scikit-learn PCA on around 350 images: 138 components for 95% explained variance


We need to convert the feature arrays to vectors to perform the PCA, and then convert the vectors back to arrays so that they are read as arrays (and not dicts) in Python.

In [None]:
# Wrap the plain float arrays into ml dense vectors before the PCA step.
list_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())

# Same rows as features_df, with "features" now stored as a vector column.
df_spark_vector = features_df.select(
    col("path"),
    col("label"),
    list_to_vector_udf(col("features")).alias("features"),
)


In [None]:
# Print the schema to check the type of the converted "features" column.
df_spark_vector.printSchema()


In [None]:
# PCA estimator keeping k_components components; it reads the "features"
# column and is configured to write its result to "reduced_features".
spark_pca = spa_PCA(k=k_components, inputCol="features")
spark_pca.setOutputCol("reduced_features")


In [None]:
# Fit the PCA on the feature vectors, then print the k of the fitted model.
spark_pca_model = spark_pca.fit(df_spark_vector)
print(spark_pca_model.getK())


In [None]:
# Apply the fitted PCA; the result replaces df_spark_vector.
df_spark_vector = spark_pca_model.transform(dataset=df_spark_vector)


# Converting the vectors back to arrays so they are read as arrays (not dicts) in Python, then saving as Parquet

In [None]:
# UDF turning an ml DenseVector/SparseVector into a plain list of floats;
# a value of any other type is returned unchanged.
vector_to_array_udf = udf(
    lambda vector: vector.toArray().tolist() 
                  if isinstance(vector, (DenseVector, SparseVector)) 
                  else vector, 
    ArrayType(FloatType())
)

# Overwrite both vector columns in place with their array counterparts.
df_spark_vector = df_spark_vector.withColumn("features", vector_to_array_udf(df_spark_vector["features"]))
df_spark_vector = df_spark_vector.withColumn("reduced_features", vector_to_array_udf(df_spark_vector["reduced_features"]))


In [None]:
# Save the result as Parquet under PATH_Result, replacing any previous output.
df_spark_vector.write.mode("overwrite").parquet(PATH_Result)


# Checking correct save format :

In [None]:
# Read the saved Parquet output back into pandas, using the pyarrow engine.
df = pd.read_parquet(PATH_Result, engine="pyarrow")


In [None]:
# Preview the first rows of the reloaded data.
df.head()


In [None]:
# Shape of the original feature array stored for the row labelled 0.
df.loc[0, "features"].shape


In [None]:
# Shape of the PCA-reduced feature array stored for the row labelled 0.
df.loc[0, "reduced_features"].shape


Shows the initial shape of the features and the reduced features via PCA