In [None]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
import boto3
import botocore.session
from boto3.s3.transfer import TransferConfig
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, split, udf, \
                                  split, monotonically_increasing_id
from pyspark.ml.feature import StringIndexer
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StringType, ArrayType, DoubleType
import io
from PIL import Image
import numpy as np
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import StandardScaler, PCA
from datetime import datetime
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
import re
import time
import pandas as pd

In [None]:
# Sets the IPython options: with ast_node_interactivity = "all", every
# expression statement of a cell is displayed, not only the last one.

InteractiveShell.ast_node_interactivity = "all"

In [None]:
# TensorFlow log level: '3' silences the C++ INFO, WARNING and ERROR messages
# ('2' would only hide INFO and WARNING).

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Imported here rather than in the top import cell because the environment
# variable above has to be set before TensorFlow is loaded for the first time.
from keras.applications.vgg16 import VGG16

<h1>Setup

<h6>Spark Config

In [None]:
# Resolve the AWS credentials through botocore's default lookup chain.
session = botocore.session.get_session()
credentials = session.get_credentials()

# get_credentials() returns None when no credentials can be found. Fail here
# with an explicit message instead of a later AttributeError on
# `credentials.access_key` when the Spark session is configured.
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found: configure them (environment variables, "
        "shared credentials file, ...) before running this notebook."
    )

# Extra JVM packages needed by the s3a:// connector. This must be set before
# the SparkSession (and its JVM) is started.
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages com.amazonaws:aws-java-sdk-pom:1.12.249,org.apache.hadoop:hadoop-aws:3.3.1 pyspark-shell'

<h6> Tensorflow Model (VGG16)

In [None]:
# VGG16 with ImageNet weights, used as a fixed feature extractor: the
# classification head is dropped and the last feature maps are max-pooled.
model = VGG16(
    weights="imagenet",
    include_top=False,
    pooling='max',
    input_shape=(224, 224, 3),
)

# Freeze every layer: the network is only used for inference.
for layer in model.layers:
    layer.trainable = False

model.compile()
model.summary()

<h6>UDFs

In [None]:
# Functions

def feature_extract(bytes_type):
    """Turn the raw bytes of one image file into a feature vector.

    Args:
        bytes_type: encoded image file content (bytes-like), e.g. the
            ``content`` column of Spark's ``binaryFile`` data source.

    Returns:
        list: output of ``model.predict`` for this single image, as a plain
        Python list.
    """
    # Force three RGB channels before resizing: grayscale, palette or RGBA
    # files would otherwise give an array that does not match the model's
    # (224, 224, 3) input. For images that are already RGB this is a no-op.
    with Image.open(io.BytesIO(bytes_type)) as raw_image:
        resized_image = raw_image.convert('RGB').resize((224, 224))

    pixels = np.asarray(resized_image)

    # NOTE(review): pixels are fed to the network as raw 0-255 RGB values.
    # Keras documents `vgg16.preprocess_input` for this model - confirm
    # whether it should be applied here.

    # The model works on batches: wrap the image in a batch of one and
    # unwrap the single prediction.
    vectorized = model.predict(np.array([pixels])).tolist()[0]

    return vectorized

def extract_label(path):
    """Return the fifth '/'-separated component of ``path`` (index 4).

    Example: ``'s3a://bucket/Training/Apple/img.jpg'`` gives ``'Apple'``
    (the ``//`` after the scheme produces an empty component at index 1).

    Raises:
        IndexError: if ``path`` has fewer than five components.
    """
    # NOTE(review): for paths shaped like
    # 's3a://oc-projet-8-monitoring/Monitoring/<size>/...', index 4 is the
    # <size> folder - confirm this is the intended label.
    path_parts = re.split('/', path)
    return path_parts[4]

# UDFs

# Image file bytes -> array of doubles (output of feature_extract).
feature_extract_UDF = udf(
    lambda image_bytes: feature_extract(image_bytes),
    returnType=ArrayType(DoubleType()),
)

# Array of doubles -> dense ML vector, the input type of the ML estimators.
to_vector_UDF = udf(
    lambda values: Vectors.dense(values),
    returnType=VectorUDT(),
)

# File path -> label string (output of extract_label).
label_extract_UDF = udf(
    lambda path: extract_label(path),
    returnType=StringType(),
)

<h1>Monitoring Loop

This notebook has to be re-run manually once for each number of workers to benchmark, e.g. local[1], local[2], etc.
Indeed, none of the following options (spark.sparkContext.cancelAllJobs(), spark.stop(), spark.sparkContext.stop(), time.sleep(60)) made it possible to iterate over different numbers of workers within a single loop.

In [None]:
# Number of local worker threads used for this run (local[N] master).
workers = 4

# Take one consistent snapshot of the credentials: key, secret and token then
# belong together even if botocore refreshes them in the meantime.
frozen_credentials = credentials.get_frozen_credentials()

spark_builder = (
    SparkSession.builder
    .appName("ETL Pipeline")
    .master("local[{}]".format(workers))
    .config("spark.hadoop.fs.s3a.access.key", frozen_credentials.access_key)
    .config("spark.hadoop.fs.s3a.secret.key", frozen_credentials.secret_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Temporary credentials (SSO, assumed role, ...) come with a session token;
# S3A rejects the key/secret pair alone in that case, so forward the token too.
# With long-lived keys (no token) the configuration is unchanged.
if frozen_credentials.token:
    spark_builder = (
        spark_builder
        .config("spark.hadoop.fs.s3a.session.token", frozen_credentials.token)
        .config("spark.hadoop.fs.s3a.aws.credentials.provider",
                "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    )

spark = spark_builder.getOrCreate()

# Display the session summary as the cell output.
spark

In [None]:
# Empty results table; the monitoring loop adds one row per timed run.
log_columns = ['workers', 'images_size', 'dimentionality', 'time']
log_df = pd.DataFrame(columns=log_columns)

In [None]:
# Row index into log_df: incremented once per (image_size, pipeline) run.
i = 0

# Number of principal components kept; identical for every run.
n_components = 100

for image_size in [100,200,300,500,1000]:

    s3_object_path = 's3a://oc-projet-8-monitoring/Monitoring/{}/**'.format(image_size)

    print(image_size)

    df = spark.read.format("binaryFile").load(s3_object_path)

    # Keep each label on the same row as its features from the start.
    # Re-attaching labels afterwards by joining on two independently generated
    # monotonically_increasing_id() columns is not safe: those ids depend on
    # the partitioning of each DataFrame and are not guaranteed to line up.
    feature_extracted_df = df.select(
        label_extract_UDF(df.path).alias("labels"),
        feature_extract_UDF(df.content).alias("features"),
    )

    vector_df = feature_extracted_df.select(
        "labels",
        to_vector_UDF('features').alias('features'),
    )

    std = StandardScaler(inputCol="features", outputCol="scaled")
    pca = PCA(inputCol="scaled", outputCol="pca").setK(n_components)
    pca_only = PCA(inputCol="features", outputCol="pca").setK(n_components)

    # Two pipelines are timed per image set: PCA alone, and scaling + PCA.
    for stages_select in [[pca_only], [std, pca]]:

        i += 1

        pipeline = Pipeline().setStages(stages_select)

        # Timed region: fit, transform and CSV export (Spark is lazy, so the
        # feature extraction defined above is executed inside it as well).
        start = datetime.now()

        pca_df = pipeline.fit(vector_df).transform(vector_df)

        # One column per principal component, next to the label column.
        # `k` is used so the run counter `i` is not shadowed.
        labeled_pca_df = pca_df.withColumn("pc", vector_to_array("pca"))\
                               .select([col("labels")] +
                                       [col("pc")[k] for k in range(n_components)])

        # Both pipelines write to the same location; mode 'overwrite' means
        # only the output of the latest run is kept.
        labeled_pca_df.coalesce(1)\
                   .write.mode('overwrite')\
                   .save("s3a://oc-project-8-pca-csv/monitoring", format='csv', header=True)

        end = datetime.now()

        total_time = end - start

        # Log a readable description of the pipeline ("PCA",
        # "StandardScaler+PCA") rather than the estimator objects themselves.
        stage_names = "+".join(type(stage).__name__ for stage in stages_select)

        log_df.loc[i, ['workers', 'images_size', 'dimentionality','time']] = \
            workers, image_size, stage_names, total_time

In [None]:
# Display the timings collected by the monitoring loop.
log_df