In [1]:
import functools
import multiprocessing as mp
import re
import os
from keras.preprocessing import image
import PIL.Image
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.models import Model
import numpy as np
from collections import OrderedDict
from sklearn.decomposition import PCA
from scipy.spatial import distance
from scipy.cluster import hierarchy
import shutil

Using TensorFlow backend.


In [2]:
# Short alias for os.path.join; make_links() uses it to build the cluster
# directory and symlink paths.
pj = os.path.join

In [3]:
def get_files(dr, ext='jpg|jpeg|bmp|png'):
    """Return the paths of the image files found directly inside a directory.

    Parameters
    ----------
    dr : str
        Directory to scan (not recursive).
    ext : str
        Regex alternation of accepted file extensions. It is matched
        case-insensitively against the end of each entry name.

    Returns
    -------
    list of str
        ``os.path.join(dr, name)`` for every matching regular file, sorted by
        name so the result does not depend on the order in which the file
        system happens to list the entries.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``dr`` cannot be listed.
    """
    rex = re.compile(r'^.*\.({})$'.format(ext), re.I)
    candidates = (
        os.path.join(dr, base)
        for base in sorted(os.listdir(dr))
        if rex.match(base)
    )
    # A directory called e.g. "foo.png" matches the pattern but cannot be
    # opened as an image, so keep regular files only.
    return [path for path in candidates if os.path.isfile(path)]

In [4]:
def _img_worker(fn, size):
    """Load one image file, resize it and convert it to an array.

    Parameters
    ----------
    fn : str
        Path of the image file to open with PIL.
    size : tuple
        Target size handed to ``PIL.Image.Image.resize``.

    Returns
    -------
    tuple
        ``(fn, arr)`` where ``arr`` is the result of keras'
        ``image.img_to_array`` applied to the resized image. The file name is
        returned as well so the caller can build a ``{fn: arr}`` mapping from
        the results of a process pool.
    """
    # The context manager releases the underlying file handle as soon as the
    # pixel data has been read, instead of relying on garbage collection.
    with PIL.Image.open(fn) as img:
        # PIL.Image.BICUBIC is the named constant for the resampling filter 3.
        resized = img.resize(size, PIL.Image.BICUBIC)
    return fn, image.img_to_array(resized)

In [5]:
def image_array(data_path, size):
    """Load and resize every image in a directory using a process pool.

    Parameters
    ----------
    data_path : str
        Directory whose image files (as selected by ``get_files``) are loaded.
    size : tuple
        Target size forwarded to ``_img_worker`` for every file.

    Returns
    -------
    dict
        Maps each file path to the array produced by ``_img_worker``.
    """
    load_one = functools.partial(_img_worker, size=size)
    n_workers = mp.cpu_count()
    # One worker process per CPU; the pool is torn down when the block exits.
    with mp.Pool(n_workers) as workers:
        loaded = workers.map(load_one, get_files(data_path))
    return {fn: arr for fn, arr in loaded}

In [6]:
def get_model(layer='fc2'):
    """Build a feature extractor from the ImageNet-trained VGG16 network.

    Parameters
    ----------
    layer : str
        Name of the VGG16 layer whose output becomes the model output.

    Returns
    -------
    keras.models.Model
        Model that shares the VGG16 input and stops at ``layer``.
    """
    vgg = VGG16(weights='imagenet', include_top=True)
    feature_layer = vgg.get_layer(layer)
    return Model(inputs=vgg.input, outputs=feature_layer.output)

In [7]:
def fingerprint(img_arr, model):
    """Compute the model output ("fingerprint") for a single image array.

    Parameters
    ----------
    img_arr : numpy.ndarray
        3d image array with the channel on the last axis. A single-channel
        array is expanded to three identical channels first.
    model : object
        Anything with a ``predict`` method that accepts a batch of
        preprocessed images (e.g. the model returned by ``get_model``).

    Returns
    -------
    numpy.ndarray
        ``model.predict(...)[0, :]``, i.e. the prediction for the one image
        in the batch.

    Notes
    -----
    ``img_arr`` is never modified. Keras documents that ``preprocess_input``
    may overwrite its input when the dtype is compatible, and
    ``np.expand_dims`` only returns a view, so preprocessing is done on a
    private copy. Without the copy the caller's array would be changed on
    every call and repeated calls would give different fingerprints.
    """
    if img_arr.shape[2] == 1:
        img_arr = img_arr.repeat(3, axis=2)

    # Add the batch axis and detach from the caller's buffer in one step.
    arr4d = np.expand_dims(img_arr, axis=0).copy()
    arr4d_pp = preprocess_input(arr4d)
    return model.predict(arr4d_pp)[0, :]

In [8]:
def fingerprints(ias, model):
    """Compute the fingerprint of every image array in a mapping.

    Parameters
    ----------
    ias : dict
        Maps file names to image arrays.
    model : object
        Model forwarded unchanged to ``fingerprint``.

    Returns
    -------
    dict
        Maps each file name of ``ias`` to ``fingerprint(array, model)``,
        in the iteration order of ``ias``.
    """
    return {name: fingerprint(arr, model) for name, arr in ias.items()}

In [9]:
def pca(fps, n_components=0.9, **kwds):
    """Project all fingerprints with a PCA fitted on those same fingerprints.

    Parameters
    ----------
    fps : dict
        Maps file names to 1d fingerprint arrays.
    n_components : int or float
        Forwarded to ``PCA`` as ``n_components``.
    **kwds
        Further keyword arguments for the ``PCA`` constructor.

    Returns
    -------
    dict or int
        Maps each key of ``fps`` to its transformed fingerprint. When ``fps``
        is empty the sentinel ``-1`` is returned instead; callers test for it.
    """
    kwds.setdefault('n_components', n_components)
    # Freeze one iteration order so keys and rows of X stay aligned.
    ordered = OrderedDict(fps)
    X = np.array(list(ordered.values()))
    if X.shape[0] == 0:
        return -1
    reducer = PCA(**kwds).fit(X)
    projected = reducer.transform(X)
    return dict(zip(ordered.keys(), projected))

In [10]:
def cluster_stats(clusters):
    """Tabulate how many clusters exist for each cluster size.

    Parameters
    ----------
    clusters : dict
        Maps a cluster size to the list of clusters of that size.

    Returns
    -------
    numpy.ndarray
        Integer array with one ``[size, number_of_clusters]`` row per key of
        ``clusters``, ordered by ascending size. Empty input gives an empty
        1d array.
    """
    rows = []
    for size in np.sort(list(clusters.keys())):
        rows.append([size, len(clusters[size])])
    return np.array(rows, dtype=int)

In [11]:
def print_cluster_stats(clusters):
    """Print one line per cluster size and the total number of clustered images.

    Parameters
    ----------
    clusters : dict
        Maps a cluster size to the list of clusters of that size, as accepted
        by ``cluster_stats``.
    """
    print("#images : #clusters")
    table = cluster_stats(clusters)
    for n_images, n_clusters in table:
        print(f"{n_images} : {n_clusters}")
    # size * count per row, summed over rows; an empty table has no axis 1.
    total = table.prod(axis=1).sum() if table.shape[0] > 0 else 0
    print("#images in clusters total: ", total)

In [12]:
def cluster(fps, sim=0.5, method='average', metric='euclidean', extra_out=False, print_stats=True, min_csize=2):
    """Group fingerprints by hierarchical clustering.

    Parameters
    ----------
    fps : dict
        Maps file names to 1d fingerprint arrays.
    sim : float
        Value in [0, 1]. The dendrogram is cut at ``sim`` times the largest
        pairwise distance, so larger values merge more images per cluster.
    method : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``.
    metric : str
        Distance metric passed to ``scipy.spatial.distance.pdist``.
    extra_out : bool
        Also return the intermediate results (see Returns).
    print_stats : bool
        Print the cluster statistics via ``print_cluster_stats``.
    min_csize : int
        Clusters with fewer members than this are dropped from the result.

    Returns
    -------
    clusters : dict
        Maps a cluster size to the list of clusters of that size. Each
        cluster is a list of keys of ``fps``.
    extra : dict
        Only when ``extra_out`` is true: the linkage matrix ``'Z'``, the
        condensed distances ``'dfps'``, the unfiltered ``'cluster_dict'``
        (label -> file names) and the per-file labels ``'cut'``.

    Notes
    -----
    ``pdist`` and ``linkage`` need at least two observations. With zero or
    one fingerprint no distances are computed: a single fingerprint forms one
    cluster of size 1, and it is kept only when ``min_csize`` is 1.
    """
    assert 0 <= sim <= 1
    assert min_csize >= 1
    files = list(fps.keys())

    if len(files) < 2:
        # Nothing to compare: empty distances, empty linkage, and every file
        # (if any) gets the label 1.
        dfps = np.empty(0, dtype=float)
        Z = np.empty((0, 4), dtype=float)
        cut = np.ones(len(files), dtype=int)
    else:
        dfps = distance.pdist(np.array(list(fps.values())), metric)
        Z = hierarchy.linkage(dfps, method=method, metric=metric)
        cut = hierarchy.fcluster(Z, t=dfps.max() * sim, criterion='distance')

    # label -> file names carrying that label
    cluster_dict = {label: [] for label in np.unique(cut)}
    for fn, label in zip(files, cut):
        cluster_dict[label].append(fn)

    # Regroup by cluster size and drop clusters that are too small.
    clusters = {}
    for members in cluster_dict.values():
        csize = len(members)
        if csize >= min_csize:
            clusters.setdefault(csize, []).append(members)

    if print_stats:
        print_cluster_stats(clusters)
    if extra_out:
        extra = {'Z': Z, 'dfps': dfps, 'cluster_dict': cluster_dict, 'cut': cut}
        return clusters, extra
    return clusters

In [13]:
def make_links(clusters, cluster_dir):
    """Materialise the clusters on disk as a tree of symlinks.

    The layout is
    ``<cluster_dir>/cluster_with_<size>/cluster_<index>/<file name>``,
    where each leaf is a symlink to the absolute path of the original file.

    Parameters
    ----------
    clusters : dict
        Maps a cluster size to the list of clusters of that size. Each
        cluster is a list of file paths.
    cluster_dir : str
        Root directory of the link tree. WARNING: an existing directory at
        this path is deleted recursively before the links are created.
    """
    print("cluster dir: {}".format(cluster_dir))
    if os.path.exists(cluster_dir):
        shutil.rmtree(cluster_dir)
    for csize, group in clusters.items():
        size_dir = pj(cluster_dir, 'cluster_with_{}'.format(csize))
        for iclus, members in enumerate(group):
            if not members:
                # Keep the old behaviour: no directory for an empty cluster.
                continue
            dr = pj(size_dir, 'cluster_{}'.format(iclus))
            # Create the target directory once per cluster, not once per file.
            os.makedirs(dr, exist_ok=True)
            for fn in members:
                os.symlink(os.path.abspath(fn), pj(dr, os.path.basename(fn)))

In [14]:
# Root of the frame dump. The loop below expects the layout
# <data_path>/<sub_fold>/<sub_sub_fold>/<image files>.
data_path = '/mnt/disks/slow1/video_processing/frames/Luarent_data/'
# Size every image is resized to before it is fed to the model.
size = (224, 224)
model = get_model()

for sub_fold in os.listdir(data_path):
    # pj() does not depend on data_path ending with a slash.
    sub_path = pj(data_path, sub_fold)
    if not os.path.isdir(sub_path):
        # A stray file here would make os.listdir() below raise.
        continue
    print(sub_path)
    for sub_sub_fold in os.listdir(sub_path):
        sub_sub_path = pj(sub_path, sub_sub_fold)
        if not os.path.isdir(sub_sub_path):
            continue
        print(sub_sub_path)

        ias = image_array(sub_sub_path, size)

        fps = fingerprints(ias, model)

        # pca() returns the sentinel -1 when the folder contains no images.
        fps = pca(fps, n_components=0.95)
        if fps != -1:
            clusters = cluster(fps, sim=0.5)

            make_links(clusters, pj(sub_sub_path, 'imagecluster', 'clusters'))

/mnt/disks/slow1/video_processing/frames/Luarent_data/112203
/mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000106
#images : #clusters
52 : 1
78 : 1
120 : 1
#images in clusters total:  250
cluster dir: /mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000106/imagecluster/clusters
/mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000104
#images : #clusters
3 : 1
6 : 1
9 : 1
16 : 1
49 : 1
167 : 1
#images in clusters total:  250
cluster dir: /mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000104/imagecluster/clusters
/mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000054
#images : #clusters
15 : 1
37 : 1
41 : 1
59 : 1
98 : 1
#images in clusters total:  250
cluster dir: /mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000054/imagecluster/clusters
/mnt/disks/slow1/video_processing/frames/Luarent_data/112203/000008
#images : #clusters
7 : 1
14 : 1
23 : 1
30 : 1
32 : 1
33 : 1
36 : 2
38 : 1
#images in clusters total:  24