In [None]:
import boto3
import botocore.session
from boto3.s3.transfer import TransferConfig
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, split, udf
from pyspark.ml.feature import StringIndexer
from pyspark import SparkConf, SparkContext
import io
from PIL import Image
import numpy as np
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

In [None]:
# Silence TensorFlow's native logging. TF_CPP_MIN_LOG_LEVEL levels:
# 0 = everything, 1 = hide INFO, 2 = hide INFO and WARNING,
# 3 = hide INFO, WARNING and ERROR (the value used here).
# The variable is set before the Keras import below, presumably so that it is
# already in place when TensorFlow gets loaded.

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from keras.applications.vgg16 import VGG16

<h1>Setup

In [None]:
# Resolve AWS credentials through botocore's default lookup chain.
session = botocore.session.get_session()
credentials = session.get_credentials()

# get_credentials() yields None when no credentials can be located; stop here
# with an explicit message rather than failing later with an AttributeError on
# `credentials.access_key`.
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found: configure them (environment variables, "
        "shared credentials file or instance role) before running this notebook."
    )

# Extra JVM packages PySpark must fetch to be able to read s3a:// URIs.
# This has to be in the environment before the SparkContext is started.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages com.amazonaws:aws-java-sdk-pom:1.12.249,org.apache.hadoop:hadoop-aws:3.3.1 pyspark-shell'
)

In [None]:
# Spark configuration: local master with 2 worker threads, and the S3A
# connector wired to the AWS credentials resolved by botocore.
conf = (
    SparkConf()
    .setAppName("ETL Pipeline")
    .setMaster("local[2]")
    .set("spark.hadoop.fs.s3a.access.key", credentials.access_key)
    .set("spark.hadoop.fs.s3a.secret.key", credentials.secret_key)
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Temporary credentials (STS, SSO, assumed role) are only valid together with
# their session token; long-lived keys have no token and skip this step.
if getattr(credentials, "token", None):
    conf.set("spark.hadoop.fs.s3a.session.token", credentials.token)

In [None]:
# Reuse the running SparkContext if there is one, otherwise start it from `conf`.
# getOrCreate is called on the class: invoking the constructor first would raise
# "Cannot run multiple SparkContexts at once" whenever this cell is re-executed.
sc = SparkContext.getOrCreate(conf=conf)

# Only show Spark errors in the notebook output.
sc.setLogLevel("ERROR")

In [None]:
# Display the SparkContext created above as the cell output.
sc

In [None]:
# Load every object matching the glob as raw bytes; each RDD element is
# indexed downstream as a pair whose second item is the file content.
images_uri = "s3a://oc-project-8-bucket/Sub_S3/**"
rdd_bytes_images = sc.binaryFiles(images_uri)

<h1>Feature Extraction

In [None]:
def _image_bytes_to_array(file_content):
    """Decode raw image bytes into a (224, 224, 3) array.

    The image is converted to RGB before resizing so that grayscale, palette,
    RGBA or CMYK files also produce three channels, matching the
    ``input_shape=(224, 224, 3)`` of the feature extractor.

    Parameters
    ----------
    file_content : bytes
        Encoded image file content.

    Returns
    -------
    numpy.ndarray
        Pixel array of the resized RGB image.
    """
    image = Image.open(io.BytesIO(file_content)).convert("RGB").resize((224, 224))
    return np.asarray(image)


# x[1] is the file content of each element produced by sc.binaryFiles.
rdd_array_image = rdd_bytes_images.map(lambda x: _image_bytes_to_array(x[1]))

In [None]:
# VGG16 with ImageNet weights, without its classification head
# (include_top=False) and with max pooling applied to the final output,
# so it acts as a feature extractor for 224x224 RGB inputs.
model = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
    pooling='max',
)

# Freeze every layer: the network is used as-is, never trained.
for vgg_layer in model.layers:
    vgg_layer.trainable = False

model.compile()
model.summary()

In [None]:
# Run each image through the network as a batch of one. predict() returns one
# row per batch item, so `[0]` keeps the single row and every RDD element is a
# flat feature vector instead of a nested one-row batch.
# cache() keeps the computed features so the later actions (preview, PCA,
# projection) do not re-read the images and re-run the network each time.
rdd_extracted_features = rdd_array_image.map(
    lambda x: model.predict(np.array([x]))[0]
).cache()

In [None]:
# Sanity check: fetch the first two extracted feature arrays and display them.
rdd_extracted_features.take(2)

<h1>Dimensionality Reduction

<h6>N.B.: Because the extracted features are all on the same scale, standardisation is not required to perform a proper PCA. Skipping it means the standardisation action is never triggered, which improves the performance of our pipeline.

In [None]:
# Wrap the feature RDD in a distributed row matrix (one row per image).
matrix = RowMatrix(rows=rdd_extracted_features)

In [None]:
# Number of principal components kept by the reduction.
N_PRINCIPAL_COMPONENTS = 137

# Matrix whose columns are the top principal components of `matrix`.
pca = matrix.computePrincipalComponents(k=N_PRINCIPAL_COMPONENTS)

In [None]:
# Project every feature row onto the principal components, then turn each
# projected row vector into a plain Python list of floats.
projected_rows = matrix.multiply(pca).rows
reducted_matrix = projected_rows.map(lambda row_vector: row_vector.toArray().tolist())

<h1>CSV Export

In [None]:
# SQL entry point built on the existing SparkContext; a session is needed for
# the RDD -> DataFrame conversion used by the CSV export.
spark = SparkSession(sparkContext=sc)

In [None]:
# Collect the reduced rows on the driver as a pandas DataFrame ...
reduced_pandas_frame = reducted_matrix.toDF().toPandas()

# ... and serialise it to a CSV string (to_csv() with no path returns the text).
csv = reduced_pandas_frame.to_csv()

In [None]:
# Destination bucket for the PCA result.
bucket_name = 'oc-project-8-pca-csv'

# One-off bucket creation, kept for reference:
# boto3.client('s3')\
#      .create_bucket(Bucket=bucket_name,
#                     CreateBucketConfiguration={'LocationConstraint':'eu-west-3'})

# Transfer settings: threaded upload with up to 10 concurrent requests;
# threshold and chunk size are both 1024 * 25 bytes (25 KiB).
boto_config = TransferConfig(
    multipart_threshold=1024 * 25,
    multipart_chunksize=1024 * 25,
    max_concurrency=10,
    use_threads=True,
)

# Upload the in-memory CSV text as a private 'pca.csv' object.
s3_client = boto3.resource('s3').meta.client
csv_stream = io.BytesIO(csv.encode())
s3_client.upload_fileobj(
    Fileobj=csv_stream,
    Bucket=bucket_name,
    Key='pca.csv',
    ExtraArgs={'ContentType':'text/csv', 'ACL':'private'},
    Config=boto_config,
)