# Local execution :

In [1]:
import io
import os
from typing import Iterator

import pandas as pd
import numpy as np
import tensorflow as tf

from PIL import Image
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model
from sklearn.decomposition import PCA as skl_PCA
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.feature import PCA as spa_PCA
from pyspark.ml.linalg import Vectors, VectorUDT, SparseVector, DenseVector


2023-05-31 16:53:45.743232: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-31 16:53:45.773880: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-31 16:53:45.774452: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# All locations are derived from the current working directory.
PATH = os.getcwd()
PATH_DATA = f"{PATH}/data/Test1"
PATH_RESULT = f"{PATH}/data/Results"
print(f"PATH:{PATH}\nPATH_Data:{PATH_DATA}\nPATH_Result: {PATH_RESULT}")


PATH:/home/pierre/git/ds_p8
PATH_Data:/home/pierre/git/ds_p8/data/Test1
PATH_Result: /home/pierre/git/ds_p8/data/Results


# Creating Session

In [3]:
# Create (or reuse) the Spark session; master('local') runs Spark on this machine, without a cluster.
spark = (
    SparkSession.builder
    .appName("ds_p8")
    .master('local')
    # NOTE(review): legacy Parquet layout — presumably chosen so the saved arrays read back cleanly with pandas/pyarrow; confirm.
    .config("spark.sql.parquet.writeLegacyFormat", "true")
    .getOrCreate()
)


23/05/31 16:53:48 WARN Utils: Your hostname, Pierre-desktop resolves to a loopback address: 127.0.1.1; using 192.168.1.45 instead (on interface eno1)
23/05/31 16:53:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/31 16:53:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Spark context creation

In [4]:
# Handle on the SparkContext, used later to broadcast the model weights.
sc = spark.sparkContext

# Display the session as the cell output.
spark


# Loading data :

In [5]:
# Read every *.jpg under PATH_DATA (searching sub-directories too) as binary files:
# one row per image, with the raw file bytes in the `content` column.
images = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
    .load(PATH_DATA)
)


                                                                                

In [6]:
# The label is the name of the directory that contains the image
# (second-to-last element of the path once split on '/').
images = images.withColumn('label', element_at(split(images['path'], '/'), -2))

# printSchema() and show() print by themselves and return None, so wrapping
# them in print() only adds a stray "None" line to the output.
images.printSchema()
images.select('path', 'label').show(5, False)


root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)

None
+--------------------------------------------------------------+----------+
|path                                                          |label     |
+--------------------------------------------------------------+----------+
|file:/home/pierre/git/ds_p8/data/Test1/Watermelon/r_87_100.jpg|Watermelon|
|file:/home/pierre/git/ds_p8/data/Test1/Watermelon/286_100.jpg |Watermelon|
|file:/home/pierre/git/ds_p8/data/Test1/Watermelon/281_100.jpg |Watermelon|
|file:/home/pierre/git/ds_p8/data/Test1/Watermelon/139_100.jpg |Watermelon|
|file:/home/pierre/git/ds_p8/data/Test1/Watermelon/205_100.jpg |Watermelon|
+--------------------------------------------------------------+----------+
only showing top 5 rows

None


# Model creation

In [7]:
# Full MobileNetV2 with ImageNet weights, classifier head included
# (include_top=True); the head is cut off in the next cell to obtain a
# feature extractor.
model = MobileNetV2(
    weights='imagenet',
    include_top=True,
    input_shape=(224, 224, 3)
    )


2023-05-31 16:53:52.875469: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-31 16:53:52.875845: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
# Feature extractor: same input as MobileNetV2, output taken from the
# second-to-last layer (everything except the final layer).
new_model = Model(
    inputs=model.input,
    outputs=model.layers[-2].output
    )


In [9]:
# Inspect the architecture of the truncated model.
new_model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                             

In [10]:
# Broadcast the pretrained weights so that each worker can rebuild the model
# from them (of little benefit with the single local master used here).

brodcast_weights = sc.broadcast(new_model.get_weights())


In [11]:
def model_fn():
    """
    Returns a MobileNetV2 model with top layer removed
    and broadcasted pretrained weights.

    The network is instantiated with ``weights=None``: every weight of the
    truncated model is overwritten right below with the values broadcast by
    the driver, so loading the ImageNet weights again on each worker would be
    redundant work (and would need the weight file to be reachable from
    every executor).

    Returns
    - keras Model taking (224, 224, 3) images and returning the output of the
      second-to-last MobileNetV2 layer, with every layer frozen.
    """
    model = MobileNetV2(
        weights=None,
        include_top=True,
        input_shape=(224, 224, 3)
        )

    # The model is only used for inference: freeze every layer.
    for layer in model.layers:
        layer.trainable = False

    # Cut the final layer so the model outputs features.
    new_model = Model(
        inputs=model.input,
        outputs=model.layers[-2].output
        )

    # Install the pretrained weights shipped by the driver.
    new_model.set_weights(brodcast_weights.value)
    return new_model



In [12]:
def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.

    Args:
    - content: bytes of an encoded image file, as read by the binaryFile source.

    Returns
    - array of shape (224, 224, 3) passed through MobileNetV2's preprocess_input.
    """
    # Force three channels: a grayscale, palette or RGBA file would otherwise
    # yield an array whose channel count does not match the model input.
    img = Image.open(io.BytesIO(content)).convert("RGB").resize((224, 224))
    arr = img_to_array(img)
    return preprocess_input(arr)


def featurize_series(model, content_series):
    """
    Compute the feature vector of every raw image of a pd.Series.

    Args:
    - model: model whose ``predict`` is applied to the stacked, preprocessed images.
    - content_series: pd.Series of raw image bytes.

    Returns
    - pd.Series holding one flattened feature array per image
    """
    batch = np.stack(content_series.map(preprocess))
    # Some layers output multi-dimensional tensors: flatten each prediction
    # so it can be stored as a plain array in a Spark DataFrame.
    return pd.Series([features.flatten() for features in model.predict(batch)])


@pandas_udf('array<float>')
def featurize_udf(content_series_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).

    The "iterator of Series to iterator of Series" flavour is declared through the
    Python type hints, which replaces the PandasUDFType.SCALAR_ITER argument
    deprecated since Spark 3.0.

    Args:
    - content_series_iter: This argument is an iterator over batches of data, where each batch
    is a pandas Series of image data.

    Yields:
    - one pandas Series of feature arrays per input batch.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)





# Feature extraction

In [13]:
# Featurize every image. The DataFrame is lazy and is consumed by several
# later actions (show, toPandas, PCA fit, Parquet write): cache it so the
# model inference runs once instead of once per action.
features_df = images.repartition(20).select(
    col("path"),
    col("label"),
    featurize_udf("content").alias("features")
    ).cache()


In [14]:
# Trigger the featurization and preview the first 10 rows.
features_df.show(n=10)


2023-05-31 16:53:57.136639: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


+--------------------+----------+--------------------+
|                path|     label|            features|
+--------------------+----------+--------------------+
|file:/home/pierre...|Watermelon|[0.029508425, 0.0...|
|file:/home/pierre...| Raspberry|[0.71335524, 1.35...|
|file:/home/pierre...|Cantaloupe|[0.027424365, 0.0...|
|file:/home/pierre...|Clementine|[0.113987595, 0.0...|
|file:/home/pierre...| Mangostan|[0.7677538, 0.0, ...|
|file:/home/pierre...|      Kiwi|[0.25983402, 0.0,...|
|file:/home/pierre...|     Melon|[1.4450728, 0.0, ...|
|file:/home/pierre...|   Avocado|[0.19016302, 0.00...|
|file:/home/pierre...| Carambula|[1.5248429, 1.399...|
|file:/home/pierre...|      Pear|[0.20270693, 0.0,...|
+--------------------+----------+--------------------+
only showing top 10 rows



                                                                                

# PCA Over the features :

## Spark's PCA vs Sklearn's PCA : 
- Sklearn's PCA is the better option for small, non-distributed workloads because it can be tailored more precisely; in a distributed-computing context, however, it would need workarounds such as Ray or Dask.
- Spark's MLlib provides a PCA which, while not as customizable as its Sklearn counterpart, is compatible by design with the data formats and the nature of distributed computing.

--> [Source](https://towardsdatascience.com/apache-spark-mllib-vs-scikit-learn-building-machine-learning-pipelines-be49ecc69a82) (Medium) , partial explanation not technically on PCA


### However :

- k, the number of principal components, must be specified as an input to Spark's PCA. One way to choose it is to guesstimate it (for example, 25% of the number of features). Another way is to use sklearn's PCA, which can pick the number of components from a target amount of explained variance (say 80%+): performing that PCA locally on a small sample gives a pretty good idea of k.

In [15]:
# Collect the features DataFrame to the driver as a pandas DataFrame.
# NOTE(review): the text above mentions working on a small sample, but no sampling is applied here.
feature_df_pd = features_df.toPandas()


2023-05-31 16:54:00.425504: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
                                                                                

### Finding the optimal k (number of components) with Sklearn :

- Let's aim for 95% explained variance; we can adjust this value if the results are not satisfactory.


In [16]:
# Preview the collected pandas DataFrame.
feature_df_pd.head()


Unnamed: 0,path,label,features
0,file:/home/pierre/git/ds_p8/data/Test1/Waterme...,Watermelon,"[0.029508424922823906, 0.06816273927688599, 0...."
1,file:/home/pierre/git/ds_p8/data/Test1/Raspber...,Raspberry,"[0.7133552432060242, 1.3564988374710083, 0.0, ..."
2,file:/home/pierre/git/ds_p8/data/Test1/Cantalo...,Cantaloupe,"[0.027424365282058716, 0.0, 0.0, 0.0, 0.0, 0.0..."
3,file:/home/pierre/git/ds_p8/data/Test1/Clement...,Clementine,"[0.11398759484291077, 0.0, 0.0, 0.0, 0.0240882..."
4,file:/home/pierre/git/ds_p8/data/Test1/Mangost...,Mangostan,"[0.7677537798881531, 0.0, 0.0, 0.0, 0.10631377..."


In [17]:
explained_variance = 0.95  # a float < 1 passed as n_components makes sklearn's PCA keep enough components to reach that share of explained variance

sk_pca = skl_PCA(n_components=explained_variance)

# Stack the per-row feature arrays into one 2-D matrix (rows = images) before fitting.
reduced_features_values = sk_pca.fit_transform(np.stack(feature_df_pd["features"].values))


In [18]:
# Compare the number of features before and after sklearn's PCA.
print("original array size :", np.stack(feature_df_pd["features"].values).shape[1])
print("reduced array size :", reduced_features_values.shape[1])


original array size : 1280
reduced array size : 138


#### Looks like 138 principal components is a good candidate value for k, let's apply it to spark's expected k components

<hr>

# Converting arrays to vector in the spark dataframe :
(Spark's PCA expects vectors)

In [19]:
# Wrap each array of floats into a dense ML vector.
list_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())

# Same columns as features_df, with "features" converted to a vector column.
df_spark_vector = features_df.select(
    "path",
    "label",
    list_to_vector_udf("features").alias("features"),
)


In [20]:
# Check that "features" is now a vector column.
df_spark_vector.printSchema()


root
 |-- path: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: vector (nullable = true)



In [21]:
# Reuse the number of components sklearn needed to reach 95% explained variance.
k_components = reduced_features_values.shape[1]

spark_pca = spa_PCA(
    k=k_components,
    inputCol="features",
    outputCol="reduced_features",
)
spark_pca


PCA_7793f7e5c52e

In [22]:
# Fit Spark's PCA on the vector column, then print the number of components of the fitted model.
spark_pca_model = spark_pca.fit(df_spark_vector)
print(spark_pca_model.getK())


23/05/31 16:55:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/31 16:55:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


138


In [23]:
# Apply the fitted PCA; the projection is written to the "reduced_features" column.
# NOTE(review): df_spark_vector is rebound to its own transform, so this cell is presumably not safe to re-run on its own — confirm.
df_spark_vector = spark_pca_model.transform(dataset=df_spark_vector)


# Converting vectors back to arrays so that they are read as plain arrays in Python (and not as dicts), then saving as Parquet

In [24]:
def _vector_to_list(vector):
    """Return an ML vector as a list of floats; any other value is returned unchanged."""
    if isinstance(vector, (DenseVector, SparseVector)):
        return vector.toArray().tolist()
    return vector


vector_to_array_udf = udf(_vector_to_list, ArrayType(FloatType()))

# Convert both vector columns in place, one after the other.
for column_name in ("features", "reduced_features"):
    df_spark_vector = df_spark_vector.withColumn(
        column_name,
        vector_to_array_udf(df_spark_vector[column_name]),
    )


In [25]:
# Save the result as Parquet in PATH_RESULT, overwriting any previous output.
df_spark_vector.write.mode("overwrite").parquet(PATH_RESULT)


23/05/31 16:55:34 WARN DAGScheduler: Broadcasting large task binary with size 1639.4 KiB
                                                                                

# Checking correct save format :

In [26]:
# Read the saved Parquet output back with pandas (pyarrow engine) to check it.
df = pd.read_parquet(PATH_RESULT, engine="pyarrow")


In [27]:
# Preview the reloaded data.
df.head()


Unnamed: 0,path,label,features,reduced_features
0,file:/home/pierre/git/ds_p8/data/Test1/Waterme...,Watermelon,"[0.029508425, 0.06816274, 0.0, 0.016226953, 0....","[-5.2558584, 7.654522, 0.37388137, -0.05602401..."
1,file:/home/pierre/git/ds_p8/data/Test1/Raspber...,Raspberry,"[0.71335524, 1.3564988, 0.0, 0.0, 0.44015628, ...","[-2.14776, 4.961218, 0.41527277, 0.6306674, -0..."
2,file:/home/pierre/git/ds_p8/data/Test1/Cantalo...,Cantaloupe,"[0.027424365, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15723...","[1.2081697, -3.053908, 0.6832527, -0.5004163, ..."
3,file:/home/pierre/git/ds_p8/data/Test1/Clement...,Clementine,"[0.113987595, 0.0, 0.0, 0.0, 0.024088217, 0.0,...","[8.2946825, -0.74385214, 0.8094477, -0.1389523..."
4,file:/home/pierre/git/ds_p8/data/Test1/Mangost...,Mangostan,"[0.7677538, 0.0, 0.0, 0.0, 0.10631378, 0.0, 0....","[4.673886, 9.500891, 0.38290235, 0.093648076, ..."


In [28]:
# Shape of the original feature array of the first row.
df.loc[0, "features"].shape


(1280,)

In [29]:
# Shape of the PCA-reduced feature array of the first row.
df.loc[0, "reduced_features"].shape


(138,)

Shows the initial shape of the features and the reduced features via PCA