## Imports

In [1]:
import findspark
# Make the local Spark installation importable from this kernel before any
# `pyspark` import is attempted (findspark adds PySpark to sys.path).
findspark.init()

In [2]:
import os
# Extra JVM packages handed to spark-submit when the PySpark shell starts:
# the AWS SDK and hadoop-aws for S3 access, plus spark-deep-learning (sparkdl).
spark_packages = ','.join([
    'com.amazonaws:aws-java-sdk-pom:1.10.34',
    'org.apache.hadoop:hadoop-aws:2.7.2',
    'databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11',
])
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages ' + spark_packages + ' pyspark-shell'

## Configuration

In [3]:
import csv

# Location of the IAM credentials CSV, resolved relative to the directory the
# notebook is run from.
path_cred = os.path.join(os.getcwd(),
            "AWS/AWS_IAM_CREDENTIAL/Maryse_P8_credentials.csv")

# Parse the file with the csv module instead of splitting on '\n' and ','
# by hand, so quoting and line endings are handled by the standard library.
# Only the parsed rows are kept; the raw file text is not left in a global.
with open(path_cred, 'r', newline='') as f:
    cred_rows = list(csv.reader(f))

# Row 0 is treated as the header and row 1 as the credential record; the
# access key ID and the secret key are read from columns 2 and 3.
if len(cred_rows) < 2 or len(cred_rows[1]) < 4:
    raise ValueError(
        "Unexpected credentials file layout in {!r}: expected a header line "
        "followed by a line with at least 4 comma-separated fields"
        .format(path_cred))

ID = cred_rows[1][2].strip()
KEY = cred_rows[1][3].strip()

# Expose the credentials through the standard AWS environment variables.
os.environ["AWS_ACCESS_KEY_ID"]=ID
os.environ["AWS_SECRET_ACCESS_KEY"]=KEY

In [4]:
from pyspark import SparkContext, SparkConf

# Reuse the SparkContext already running in this kernel, or start a new one.
# NOTE(review): the recorded run of this cell failed with "Java gateway
# process exited before sending its port number". Presumably the JVM could
# not be launched (Java / JAVA_HOME not found) or the --packages listed in
# PYSPARK_SUBMIT_ARGS could not be resolved -- TODO confirm before re-running.
# NOTE(review): SparkConf is imported but not used in the cells shown here.
sc = SparkContext.getOrCreate()

Exception: Java gateway process exited before sending its port number

## Instantiation of SparkSession and import sparkdl

In [None]:
from pyspark.sql import SparkSession
# Get the active SparkSession, or create one, for the app named 'FeatExtr'.
spark = SparkSession.builder.appName('FeatExtr').getOrCreate()

# NOTE(review): sparkdl is requested through --packages in
# PYSPARK_SUBMIT_ARGS; presumably it only becomes importable once Spark has
# started, so this import is kept after the session creation -- TODO confirm.
import sparkdl

## Create a Spark DataFrame containing all the pictures

### Read images and vectorize

In [None]:
from pyspark.ml.image import ImageSchema

In [None]:
# Sub-folder of the dataset to process; it is also used further down to
# extract the class label from each image path.
PREFIX = 'SAMPLE'

# Option 1 (disabled): read the pictures from the local disk instead.
# data_path = os.path.join("./DATA/fruits-360", PREFIX)

# Option 2 (active): read the pictures from the S3 bucket through s3a.
bucket = 'ocfruitpictures'
data_path = 's3a://%s/%s' % (bucket, PREFIX)

In [None]:
# Load every picture found under data_path, descending into sub-folders,
# as a Spark DataFrame.
images_df = ImageSchema.readImages(data_path, recursive=True)

## Features extraction (Transfer Learning) using Sparkdl

In [None]:
from sparkdl import DeepImageFeaturizer

In [None]:
# Featurizer based on the ResNet50 model: reads the "image" column and
# writes its output to "image_features".
feat = DeepImageFeaturizer(
    modelName="ResNet50",
    inputCol="image",
    outputCol="image_features",
)

In [None]:
from pyspark.ml import Pipeline

# Single-stage pipeline wrapping the featurizer defined above.
pipe = Pipeline(stages=[feat])
# Fit the pipeline on the images, then apply the fitted model to the same
# DataFrame to obtain the extracted features.
extractor = pipe.fit(images_df)
ext_features_df = extractor.transform(images_df)

## PCA on the extracted features

In [None]:
from pyspark.ml.feature import PCA

# Project the "image_features" vectors onto 8 principal components, written
# to a new "pca_features" column.
pca = PCA(k=8,
          inputCol="image_features",
          outputCol="pca_features")
# The PCA model is fitted on the feature column alone, then applied to the
# full DataFrame so the remaining columns are carried along.
# NOTE(review): ext_features_df feeds both fit() and transform(); presumably
# the featurization is evaluated more than once unless the DataFrame is
# cached -- TODO confirm and consider caching it.
model = pca.fit(ext_features_df.select('image_features'))
pca_feat_df = model.transform(ext_features_df)

## Get the class of each image

In [None]:
import pyspark.sql.functions as pspfunc

# The label is the first path component that follows "<PREFIX>/" in the
# image origin, i.e. the name of the folder containing the picture.
origin = pca_feat_df['image']['origin']
after_prefix = pspfunc.split(origin, PREFIX+'/').getItem(1)
label = pspfunc.split(after_prefix, '/').getItem(0)

# NOTE(review): the rename below keeps the whole "image" struct under the
# name "path", not only its origin field -- confirm that this is intended.
df_ = (
    pca_feat_df
    .withColumn('labels', label)
    .withColumnRenamed("image", "path")
)

In [None]:
# Keep only the columns that are written to the results file.
results_df = df_.select(['path', 'pca_features', 'labels'])

In [None]:
# Write the results as Parquet to the S3 bucket, replacing any previous
# output at that location.
# The s3a:// connector is used here, as it is for reading the images: with
# the hadoop-aws 2.7.2 package loaded through PYSPARK_SUBMIT_ARGS, the plain
# s3:// scheme refers to the legacy block-based filesystem, not to regular
# S3 objects.
results_df.write.mode('overwrite').parquet("s3a://ocfruitpictures/RESULTS_short")