In [37]:
import io
import os
import pandas as pd
import numpy as np
import difflib 
import re

# Imports the Google Cloud client library
from google.cloud import vision
from google.cloud.vision import types

def setUpEnv():
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "creds.json"
    client = vision.ImageAnnotatorClient()
    return client

def getImagePaths(folder = 'images'):
    filenames = os.listdir(folder)
    paths = [ filename for filename in filenames if filename.endswith('.jpg') or filename.endswith('.png') ]
    paths = [ './' + folder + '/' + filename for filename in paths ]
    paths.reverse()
    return paths

def getClassesPathsFromFolders(folder = 'images'):
    classes = os.listdir(folder)
    classes = [c for c in classes if not c.startswith('.')]
    paths = [ folder + '/' + c for c in classes]
    return zip(classes, paths)

def readFileContentAsImage(imagePath):
    with io.open(imagePath, 'rb') as image_file:
        content = image_file.read()
    return types.Image(content=content)

def makeRequest(image):
    return {
      'image': image,
      'features': [
          {'type': vision.enums.Feature.Type.LABEL_DETECTION, 'max_results': 30},
          {'type': vision.enums.Feature.Type.TEXT_DETECTION},
          {'type': vision.enums.Feature.Type.IMAGE_PROPERTIES}
      ],
    }
    

def makeRequestBatch(paths):
    requests = []
    for path in paths:
        img = readFileContentAsImage(path)
        req = makeRequest(img)
        requests.append(req)
    return requests

def getPathBatchFromList(pathList, n = 5):
    batch = []
    for i in range (n):
        if not len(pathList) == 0:
            batch.append(pathList.pop())
    return batch

def addLabelsToDataFrameFromBatchResponse(dataFrame, response):
    for resp in response.responses:
        for label in resp.label_annotations:
            if label.description not in dataFrame:
                dataFrame[label.description] = 0
    return dataFrame

def appendScoresToDataFrameFromBatchResponse(dataFrame, response, target):
    for resp in response.responses:
        score_dict = {}
        for label in resp.label_annotations:
            score_dict[label.description] = label.score
        score_dict['target'] = target
        dataFrame = dataFrame.append(pd.Series(score_dict), ignore_index=True)
    return dataFrame
        
        
def removeNaN(dataFrame):
    return dataFrame.where(dataFrame.notna(), 0)


"""
TEXT ANALYSIS
"""

def getDetectedText(unique_response):
    return unique_response.full_text_annotation.text

def retrieveDetectedTextFromBatchResponse(storage, response, target):
    for resp in response.responses:
        saveDetectedText(storage, (target, getDetectedText(resp)))

def saveDetectedText(storage, text):
    storage.append((text[0], cleanDetectedText(text[1])))

def cleanDetectedText(sentence):
    return re.sub('[^A-Za-z0-9]+', ' ', sentence)

def getSequenceDifferenceRatio(a, b):
    matcher = difflib.SequenceMatcher(a=a, b=b, isjunk=lambda x: x == " ")
    return matcher.ratio()

In [41]:
# Prepare the Google Cloud client
client = setUpEnv()

# Our label dataframe
df = pd.DataFrame()

# How many element by batch
ratio = 5

# All the texts that will be detected
detectedTexts = [] 

folders = list(getClassesPathsFromFolders('img'))
for n, classInfo in enumerate(folders):
    allPaths = getImagePaths(classInfo[1])

    while len(allPaths) > 0:

        # Batch processing
        pathBatch = getPathBatchFromList(allPaths, ratio)
        requests = makeRequestBatch(pathBatch)
        response = client.batch_annotate_images(requests)

        # Labels processing
        df = addLabelsToDataFrameFromBatchResponse(df, response)
        df = appendScoresToDataFrameFromBatchResponse(df, response, classInfo[0])

        # Text processing
        retrieveDetectedTextFromBatchResponse(detectedTexts, response, classInfo[0])
        
    print ('step', n, 'on ', len(folders), 'done')


# Set the NaN values to 0
df = removeNaN(df)

# Set the first index to 1 instead of 0,
# to match the images name
df.index = np.arange(1, len(df) + 1)

df

step 0 on  4 done
step 0 on  4 done
step 0 on  4 done
step 0 on  4 done
step 0 on  4 done
step 1 on  4 done
step 1 on  4 done
step 1 on  4 done
step 1 on  4 done
step 2 on  4 done
step 2 on  4 done
step 2 on  4 done
step 2 on  4 done
step 2 on  4 done
step 3 on  4 done
step 3 on  4 done


Unnamed: 0,car,sky,building,family car,town,neighbourhood,landmark,city,vehicle,architecture,...,baptistery,arch,tourist attraction,ancient rome,ancient roman architecture,synagogue,triumphal arch,history,historic site,ancient history
1,0.983688,0.958008,0.932489,0.908125,0.906646,0.896086,0.893254,0.845955,0.831980,0.804722,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.961312,0.979581,0.924617,0.551899,0.934455,0.799456,0.908719,0.912322,0.672198,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.984792,0.979518,0.892973,0.887916,0.922145,0.814418,0.000000,0.886923,0.787451,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.982600,0.964232,0.904652,0.841221,0.924493,0.766031,0.920557,0.899901,0.768600,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.951767,0.972200,0.903640,0.787520,0.925116,0.838028,0.901910,0.885226,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.902466,0.973756,0.917515,0.000000,0.921609,0.718469,0.927665,0.928301,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.980069,0.982495,0.895506,0.673993,0.941062,0.801611,0.000000,0.892878,0.753750,0.795563,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.985370,0.967028,0.883750,0.910594,0.893770,0.689363,0.903527,0.871553,0.853758,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.982672,0.978244,0.887054,0.855926,0.931115,0.802970,0.000000,0.911504,0.773539,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10,0.976758,0.922315,0.951817,0.656497,0.862688,0.825082,0.932258,0.806137,0.722705,0.811018,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [43]:
df.to_csv('data.csv')

In [22]:
folders

['img/palais_justice/', 'img/place_ange/', 'img/saint_aubain/', 'img/theatre/']