## Imports

In [1]:
import os

# Point the JVM lookup at the Java 8 OpenJDK installation.
# NOTE(review): presumably pinned because the Spark 2.4 stack used below
# targets Java 8 - confirm before changing.
JAVA_8_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['JAVA_HOME'] = JAVA_8_HOME

In [2]:
import findspark
# Locate the local Spark installation and make the `pyspark` package
# importable for the cells that follow.
findspark.init()

In [3]:
# Extra packages handed to spark-submit. PySpark reads this variable when it
# launches the JVM gateway, so it has to be set before the first SparkContext
# is created.
spark_packages = [
    'com.amazonaws:aws-java-sdk-pom:1.10.34',                 # AWS SDK
    'org.apache.hadoop:hadoop-aws:2.7.2',                     # Hadoop S3 connectors
    'databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11',   # sparkdl
]
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} pyspark-shell'.format(
    ','.join(spark_packages))

In [4]:
# import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

## Configuration: AWS credentials and S3 access

In [5]:
# The IAM credentials export is looked up relative to the current working
# directory, so the notebook has to be started from the project root.
path_cred = os.path.join(os.getcwd(),
                         "AWS/AWS_IAM_CREDENTIAL/Maryse_P8_credentials.csv")

with open(path_cred, 'r') as f:
    cred_text = f.read()

# The values are taken from the second line of the file (the first line is
# presumably the CSV header): third field = access key id, fourth = secret key.
cred_fields = cred_text.split('\n')[1].split(',')
ID, KEY = cred_fields[2], cred_fields[3]

# Expose the credentials as environment variables for the lifetime of this
# kernel only; nothing is written back to disk.
os.environ["AWS_ACCESS_KEY_ID"] = ID
os.environ["AWS_SECRET_ACCESS_KEY"] = KEY

In [6]:
# Reuse the running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate()

# Hadoop-level settings: map the s3:// scheme to NativeS3FileSystem and hand
# it the access key pair loaded from the credentials file.
hadoop_conf = sc._jsc.hadoopConfiguration()
s3_settings = [
    ("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem"),
    ("fs.s3.awsAccessKeyId", ID),
    ("fs.s3.awsSecretAccessKey", KEY),
]
for setting_name, setting_value in s3_settings:
    hadoop_conf.set(setting_name, setting_value)

## Instantiate the SparkSession and import sparkdl

In [7]:
# Session entry point for the DataFrame API, registered under the
# application name 'FeatExtr'.
session_builder = SparkSession.builder.appName('FeatExtr')
spark = session_builder.getOrCreate()

In [8]:
import sparkdl

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


## Create a Spark DataFrame containing all the pictures

### Read images and vectorize

In [9]:
from pyspark.ml.image import ImageSchema

In [10]:
# Folder (key prefix) holding the pictures to process.
PREFIX = 'SAMPLE'

# Option 1 (disabled): read the pictures from the local file system.
# data_path = os.path.join("./DATA/fruits-360", PREFIX)

# Option 2 (active): read the pictures from the S3 bucket.
bucket = 'ocfruitpictures'
s3_uri_template = 's3://{}/{}'
data_path = s3_uri_template.format(bucket, PREFIX)

In [11]:
# Load every picture found under `data_path`, descending into sub-folders,
# into a DataFrame with an `image` struct column.
images_df = ImageSchema.readImages(data_path, recursive=True)

## Feature extraction (transfer learning) using sparkdl

In [12]:
from sparkdl import DeepImageFeaturizer

In [13]:
# Featurizer based on the ResNet50 model: reads the `image` column and writes
# the extracted vector to `image_features`.
featurizer_params = {
    "inputCol": "image",
    "outputCol": "image_features",
    "modelName": "ResNet50",
}
feat = DeepImageFeaturizer(**featurizer_params)

In [14]:
from pyspark.ml import Pipeline

# Single-stage pipeline wrapping the featurizer; fitting it yields the model
# that adds the `image_features` column to the pictures DataFrame.
pipeline_stages = [feat]
pipe = Pipeline(stages=pipeline_stages)
extractor = pipe.fit(images_df)
ext_features_df = extractor.transform(images_df)

## PCA on the extracted features

In [15]:
from pyspark.ml.feature import PCA

# Project the extracted features onto their first 8 principal components.
pca = PCA(k=8, inputCol="image_features", outputCol="pca_features")

# The PCA model is fitted on the feature column alone, then applied to the
# full DataFrame so the other columns are carried along.
features_only_df = ext_features_df.select('image_features')
model = pca.fit(features_only_df)
pca_feat_df = model.transform(ext_features_df)

## Get the class of each image

In [16]:
import pyspark.sql.functions as pspfunc

# Source URI of each picture, read from the `origin` field of the image struct.
origin_uri = pca_feat_df['image']['origin']

# The class is the first path component found after "<PREFIX>/" in the URI.
# Note that `split` interprets its second argument as a regular expression.
after_prefix = pspfunc.split(origin_uri, PREFIX+'/').getItem(1)
class_name = pspfunc.split(after_prefix, '/').getItem(0)

# NOTE(review): the rename below applies to the whole `image` struct (the
# same column `origin` was read from), so `path` is not just the file path -
# confirm this is what consumers of the results expect.
df_ = (pca_feat_df
       .withColumn('labels', class_name)
       .withColumnRenamed("image", "path"))

In [17]:
# Keep only the columns that are written out as results.
output_columns = ['path', 'pca_features', 'labels']
results_df = df_.select(*output_columns)

In [18]:
# Write the results as Parquet, replacing the output of any previous run.
# The destination is built from the `bucket` setting used to read the
# pictures rather than repeating the bucket name, so the input and output
# locations cannot drift apart if the bucket is changed.
results_path = 's3://{}/{}'.format(bucket, 'RESULTS_short')
results_df.write.mode('overwrite').parquet(results_path)

In [19]:
# # path = "file:///home/maryse/PARTAGE/FORMATION/OCR_DS/PROJET8/Pultimate.parquet"
# path = "Pult.parquet"
# # path = "/home/maryse/Pult.parquet"
# results_df.write.mode('overwrite').parquet(path)