In [6]:
import tensorflow as tf

# Let TensorFlow grow GPU memory on demand instead of reserving it all up
# front. The setting is applied to every visible GPU (not just the first one)
# because TensorFlow requires memory growth to be configured identically
# across devices. With no GPU present the loop simply does nothing.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

from tqdm import tqdm

In [7]:

path = r"/home/kevin.bouchaud@Digital-Grenoble.local/code/Data_For_Good/images"
# Work from inside the image folder: the bare filenames collected below are
# later passed to the image loader as-is, i.e. relative to the current directory.
os.chdir(path)

# Every .jpg filename in the folder. os.listdir returns entries in arbitrary
# order, so the names are sorted to get the same ordering on every run.
images = sorted(i for i in os.listdir(path) if i.endswith('.jpg'))

In [8]:

# Build the stock VGG16 network, then wrap it in a new Model that shares its
# inputs but stops at the second-to-last layer, so predictions are that
# layer's activations rather than the final layer's output.
vgg_full = VGG16()
model = Model(
    inputs=vgg_full.inputs,
    outputs=vgg_full.layers[-2].output,
)

def extract_features(file, model):
    """Compute the feature vector of a single image with ``model``.

    The image is loaded at 224x224, wrapped into a one-image batch, passed
    through ``preprocess_input`` and handed to ``model.predict``.

    Parameters
    ----------
    file : str or path-like
        Image to read; forwarded unchanged to ``load_img``.
    model : object with a ``predict`` method
        Network used to compute the features.

    Returns
    -------
    The value returned by ``model.predict`` for the one-image batch.
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224, 224))
    # convert from 'PIL.Image.Image' to a numpy array and add the leading
    # batch axis: (height, width, channels) -> (1, height, width, channels)
    batch = np.expand_dims(np.array(img), axis=0)
    # prepare image for model
    batch = preprocess_input(batch)
    # `use_multiprocessing` only concerns generator/Sequence inputs and is not
    # accepted by newer Keras releases, so it is not passed for this array
    # input. verbose=0 keeps per-call progress output from flooding the
    # notebook when this function is called once per image in a loop.
    return model.predict(batch, verbose=0)

In [9]:
data = {}
pickle_path = r"/home/kevin.bouchaud@Digital-Grenoble.local/code/Data_For_Good/OpenFoodFacts/features.pkl"

# Extract the features of every image, keyed by filename. If anything
# interrupts the run (unreadable image, manual stop, ...), the features
# gathered so far are written next to the final pickle under a ".partial"
# suffix so the work is not lost, and the original error is re-raised.
# The final pickle is only written after a complete pass.
try:
    for image in tqdm(images):
        feat = extract_features(image, model)
        data[image] = feat
except BaseException:
    with open(pickle_path + ".partial", 'wb') as file:
        pickle.dump(data, file)
    raise

with open(pickle_path, 'wb') as file:
    pickle.dump(data, file)

# get a list of the filenames
filenames = np.array(list(data.keys()))

# get a list of just the features
feat = np.array(list(data.values()))

# flatten to a 2-D matrix with one 4096-long row per image
feat = feat.reshape(-1, 4096)

100%|██████████| 32960/32960 [29:28<00:00, 18.64it/s]


In [None]:
# # get the unique labels (from the flower_labels.csv)
# df = pd.read_csv('flower_labels.csv')
# label = df['label'].tolist()
# unique_labels = list(set(label))

In [15]:
# reduce the amount of dimensions in the feature vector
pca = PCA(n_components=100, random_state=42)
pca.fit(feat)
x = pca.transform(feat)

pickle_reduced_path = r"/home/kevin.bouchaud@Digital-Grenoble.local/code/Data_For_Good/OpenFoodFacts/reduced_features.pkl"

# Label each reduced row with its filename. `filenames` was built from the
# same dict as `feat`, so its order matches the rows of `x`; a separately
# produced listing of the folder could be ordered differently or have a
# different length. `.tolist()` turns the numpy strings back into plain str keys.
if len(filenames) != len(x):
    raise ValueError(
        f"{len(filenames)} filenames but {len(x)} reduced feature rows"
    )
data_reduced = dict(zip(filenames.tolist(), x))

with open(pickle_reduced_path, 'wb') as file:
    pickle.dump(data_reduced, file)

32960it [00:00, 794557.47it/s]


In [13]:
# Show the dimensions of the feature matrix (rows, columns); the earlier
# reshape fixes the column count at 4096.
feat.shape

(32960, 4096)

In [14]:
# Show the dimensions of the PCA-reduced matrix (rows, columns); the PCA was
# configured with n_components=100.
x.shape

(32960, 100)

In [None]:


# # cluster feature vectors
# kmeans = KMeans(n_clusters=len(unique_labels),n_jobs=-1, random_state=22)
# kmeans.fit(x)

# # holds the cluster id and the images { id: [images] }
# groups = {}
# for file, cluster in zip(filenames,kmeans.labels_):
#     if cluster not in groups.keys():
#         groups[cluster] = []
#         groups[cluster].append(file)
#     else:
#         groups[cluster].append(file)

# # function that lets you view a cluster (based on identifier)        
# def view_cluster(cluster):
#     plt.figure(figsize = (25,25));
#     # gets the list of filenames for a cluster
#     files = groups[cluster]
#     # only allow up to 30 images to be shown at a time
#     if len(files) > 30:
#         print(f"Clipping cluster size from {len(files)} to 30")
#         files = files[:30]
#     # plot each image in the cluster
#     for index, file in enumerate(files):
#         plt.subplot(10,10,index+1);
#         img = load_img(file)
#         img = np.array(img)
#         plt.imshow(img)
#         plt.axis('off')
        
   
# # this is just in case you want to see which value for k might be the best
# sse = []
# list_k = list(range(3, 50))

# for k in list_k:
#     km = KMeans(n_clusters=k, random_state=22, n_jobs=-1)
#     km.fit(x)
    
#     sse.append(km.inertia_)

# # Plot sse against k
# plt.figure(figsize=(6, 6))
# plt.plot(list_k, sse)
# plt.xlabel(r'Number of clusters *k*')
# plt.ylabel('Sum of squared distance');