In [None]:
import boto3
import botocore.session
from boto3.s3.transfer import TransferConfig
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, split, udf, \
                                  split, monotonically_increasing_id
from pyspark.ml.feature import StringIndexer
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StringType, ArrayType, DoubleType
import io
from PIL import Image
import numpy as np
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import StandardScaler, PCA
from datetime import datetime
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
import re

In [None]:
# Silence TensorFlow's native (C++) logging. TF_CPP_MIN_LOG_LEVEL values:
#   '0' = show everything, '1' = hide INFO, '2' = hide INFO and WARNING,
#   '3' = hide INFO, WARNING and ERROR (the level set here).
# The variable is assigned before the Keras import below so that it is already
# in place when the TensorFlow backend gets loaded.

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from keras.applications.vgg16 import VGG16

<h1>Setup

<h6>PySpark Config

In [None]:
# Resolve AWS credentials through botocore's default lookup chain so that no
# secret is hard-coded in the notebook.
session = botocore.session.get_session()
credentials = session.get_credentials()

# get_credentials() returns None when nothing is configured; fail here with a
# clear message instead of an opaque AttributeError when the SparkSession cell
# reads credentials.access_key.
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found: configure them (e.g. `aws configure` or the "
        "AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables) "
        "before running this notebook."
    )

# Extra packages needed for the s3a:// filesystem. This has to be set before
# the SparkSession is created for --packages to be taken into account.
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages com.amazonaws:aws-java-sdk-pom:1.12.249,org.apache.hadoop:hadoop-aws:3.3.1 pyspark-shell'

In [None]:
# Build the local SparkSession with S3A access, using the credentials resolved
# by botocore.
spark_builder = (
    SparkSession.builder
    .appName("ETL Pipeline")
    .master("local[2]")
    .config("spark.hadoop.fs.s3a.access.key", credentials.access_key)
    .config("spark.hadoop.fs.s3a.secret.key", credentials.secret_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

# Temporary credentials (SSO, assumed role, STS) are only valid together with
# their session token. Without it S3A would try to authenticate with the
# key pair alone and be rejected. Long-term keys have no token and keep the
# original configuration unchanged.
if credentials.token:
    spark_builder = (
        spark_builder
        .config("spark.hadoop.fs.s3a.session.token", credentials.token)
        .config("spark.hadoop.fs.s3a.aws.credentials.provider",
                "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    )

spark = spark_builder.getOrCreate()

In [None]:
# Show the session as the cell output to confirm that it was created.
spark

<h6>Bucket

In [None]:
# Input location, read through the s3a:// connector: a glob under the Sub_S3/
# prefix of the source bucket.
# NOTE(review): extract_label() reads the label from a fixed position in the
# object path, so changing the depth of this prefix requires updating it too.
s3_object_path = "s3a://oc-project-8-bucket/Sub_S3/**"

<h6> TensorFlow Model (VGG16)

In [None]:
# VGG16 used purely as a fixed feature extractor: ImageNet weights, no
# classification head (include_top=False) and global max pooling on the output.
model = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
    pooling='max',
)

# Freeze every layer; the network is only used for inference in this notebook.
for layer in model.layers:
    layer.trainable = False

model.compile()
model.summary()

<h6>UDF

In [None]:
# Functions

def feature_extract(bytes_type):
    """Turn the raw bytes of an image file into a VGG16 feature vector.

    Args:
        bytes_type: content of an encoded image file (any format PIL can
            open), as ``bytes``/``bytearray``.

    Returns:
        list[float]: the output of the notebook-level ``model`` for that
        single image, flattened to a plain Python list so that Spark can
        store it as ``ArrayType(DoubleType())``.

    Raises:
        PIL.UnidentifiedImageError: if the bytes are not a readable image.
    """
    # Imported locally so this function carries its own dependency;
    # the file's import cells only bring in VGG16 itself.
    from keras.applications.vgg16 import preprocess_input

    with Image.open(io.BytesIO(bytes_type)) as img:
        # Force exactly 3 channels: grayscale, palette or RGBA files would
        # otherwise produce arrays that do not match the (224, 224, 3) input.
        rgb = img.convert('RGB').resize((224, 224))

    pixels = np.asarray(rgb, dtype=np.float32)

    # ImageNet-trained VGG16 expects the channel reordering / mean subtraction
    # performed by preprocess_input, not raw 0-255 RGB values.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))

    # verbose=0: avoid printing one progress bar per image from inside the UDF.
    vectorized = model.predict(batch, verbose=0)[0].tolist()

    return vectorized

def extract_label(path, label_index=4):
    """Return the label encoded in an object path.

    The path is split on '/' and the component at position ``label_index`` is
    returned. With the default of 4, a path shaped like
    ``s3a://<bucket>/<prefix>/<label>/<file>`` yields ``<label>``
    (components: 's3a:', '', bucket, prefix, label, file).

    Args:
        path: full object path as a string.
        label_index: position of the label among the '/'-separated
            components; negative values count from the end (-2 is the
            parent folder of the file).

    Returns:
        str: the selected path component.

    Raises:
        IndexError: if the path has too few components for ``label_index``.
    """
    # A plain str.split is enough here; no regular expression is involved.
    return path.split('/')[label_index]

# Spark UDF wrappers, each declared with its Spark return type

# image bytes -> array of doubles
feature_extract_UDF = udf(
    f=lambda content: feature_extract(content),
    returnType=ArrayType(DoubleType()),
)

# array of doubles -> ML dense vector
to_vector_UDF = udf(
    f=lambda values: Vectors.dense(values),
    returnType=VectorUDT(),
)

# object path -> label string
label_extract_UDF = udf(
    f=lambda object_path: extract_label(object_path),
    returnType=StringType(),
)

<h1>Preprocessing

In [None]:
# Load every matched object with Spark's "binaryFile" source. The columns used
# later in this notebook are `path` and `content` (the raw file bytes).
df = spark.read.format("binaryFile").load(s3_object_path)

In [None]:
# Run the raw bytes of every file through the feature-extraction UDF.
feature_extracted_df = df.select(
    feature_extract_UDF(df["content"]).alias("features")
)

In [None]:
# Convert the plain array column into an ML Vector column, as required by the
# StandardScaler / PCA stages.
# The result is cached: fitting the pipeline scans this DataFrame several times
# and the final transform scans it again, and without a cache every scan would
# re-run the expensive neural-network UDF on every image.
vector_df = feature_extracted_df.select(
    to_vector_UDF('features').alias('features')
).cache()

<h1>Dimensionality Reduction

In [None]:
# Number of principal components kept by the PCA stage.
n_components = 137

# Stage 1: scale the raw feature vectors; stage 2: project them with PCA.
std = StandardScaler(outputCol="scaled", inputCol="features")
pca = PCA(k=n_components, inputCol="scaled", outputCol="pca")
stages = [std, pca]

In [None]:
# Chain the scaler and PCA stages into a single estimator.
pipeline = Pipeline(stages=stages)

In [None]:
# Fit the scaler + PCA on the feature vectors, then apply the fitted pipeline
# to the same data.
pca_df = (
    pipeline
    .fit(vector_df)
    .transform(vector_df)
)

In [None]:
# Turn the PCA vector into an array column, then spread it into one scalar
# column per principal component.
pca_df_multiple_columns = (
    pca_df
    .withColumn("pc", vector_to_array("pca"))
    .select(*[col("pc")[component] for component in range(n_components)])
)

In [None]:
# Derive one label per input file from its object path.
label_df = df.select(
    label_extract_UDF(df["path"]).alias("labels")
)

In [None]:
# Give both DataFrames a generated `id` column so they can be joined row by row.
# NOTE(review): the two id columns are generated independently. The join that
# follows only pairs each label with its own features if both DataFrames have
# exactly the same partitioning and row order, which Spark does not guarantee.
# Confirm this holds, or carry `path` through the feature pipeline and join on
# it instead.
label_df_id = label_df.withColumn('id', monotonically_increasing_id())

pca_df_multiple_columns_id = pca_df_multiple_columns.withColumn(
    'id', monotonically_increasing_id()
)

In [None]:
# Attach the labels to the PCA components through the generated id, then drop
# the helper column.
labeled_pca_df = (
    label_df_id
    .join(pca_df_multiple_columns_id, on='id', how='inner')
    .drop('id')
)

In [None]:
# Collect the whole result onto the driver as a pandas DataFrame and serialise
# it to CSV text (to_csv() without a path returns the string). With the default
# arguments the pandas index is written as the first, unnamed column.
# NOTE: the variable name `csv` would shadow the standard-library module of the
# same name if that module were imported in this notebook.
csv = labeled_pca_df.toPandas().to_csv()

In [None]:
bucket_name = 'oc-project-8-pca-csv'

# One-off bucket creation, kept for reference:
# boto3.client('s3')\
#      .create_bucket(Bucket=bucket_name,
#                     CreateBucketConfiguration={'LocationConstraint':'eu-west-3'})

# TransferConfig sizes are expressed in bytes. 25 MiB is used for both the
# multipart threshold and the part size (1024 * 25 would be only 25 KiB, which
# is below the 5 MiB minimum part size S3 accepts for multipart uploads).
MULTIPART_SIZE_BYTES = 25 * 1024 * 1024

boto_config = TransferConfig(multipart_threshold=MULTIPART_SIZE_BYTES,
                             max_concurrency=10,
                             multipart_chunksize=MULTIPART_SIZE_BYTES,
                             use_threads=True)

# Upload the CSV text straight from memory as a private text/csv object.
boto3.client('s3').upload_fileobj(Fileobj=io.BytesIO(csv.encode('utf-8')),
                                  Bucket=bucket_name,
                                  Key='pca.csv',
                                  ExtraArgs={'ContentType': 'text/csv',
                                             'ACL': 'private'},
                                  Config=boto_config)