In [3]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import cv2
import sys
import random
# Add the parent directory to the import path; presumably this is what lets the
# `faces_clustering` import in the next cell resolve — confirm against the repo layout.
sys.path.append('../')

# Seed Python's `random` module so the `random.sample` draw later in the notebook is
# repeatable. This does not seed NumPy or any other library's generator.
random.seed(42)

In [4]:
from faces_clustering import Clusterer, FeatureExtractor, is_image, get_files_folder

Using TensorFlow backend.


# Extracting Features

In [None]:
# Root folder of the face images.
# NOTE(review): this is an absolute, machine-specific path; it must be adapted before
# the notebook can run anywhere else.
faces_root = "/home/paulo/Documentos/slnp_faces_leg_55/"

# Presumably lists every file under the root that passes the `is_image` filter —
# confirm against `get_files_folder`.
complete_urls = get_files_folder(faces_root, is_image)

In [None]:
# Extract features with the 'senet50' backbone for every collected path,
# then preview the first 20 rows.
df_features_senet50 = (
    FeatureExtractor('senet50')
    .extract(complete_urls)
)
df_features_senet50.head(20)

In [None]:
# Persist the 'senet50' features; the clustering section reloads `leg55_<backbone>.pkl`.
senet50_features_path = '../data/leg55_senet50.pkl'
df_features_senet50.to_pickle(senet50_features_path)

In [None]:
# Extract features with the 'resnet50' backbone for every collected path,
# then preview the first 20 rows.
df_features_resnet50 = (
    FeatureExtractor('resnet50')
    .extract(complete_urls)
)
df_features_resnet50.head(20)

In [None]:
# Persist the 'resnet50' features; the clustering section reloads `leg55_<backbone>.pkl`.
resnet50_features_path = '../data/leg55_resnet50.pkl'
df_features_resnet50.to_pickle(resnet50_features_path)

In [None]:
# Extract features with the 'vgg16' backbone for every collected path,
# then preview the first 20 rows.
df_features_vgg16 = (
    FeatureExtractor('vgg16')
    .extract(complete_urls)
)
df_features_vgg16.head(20)

In [None]:
# Persist the 'vgg16' features; the clustering section reloads `leg55_<backbone>.pkl`.
vgg16_features_path = '../data/leg55_vgg16.pkl'
df_features_vgg16.to_pickle(vgg16_features_path)

# Clustering

In [26]:
# Name of the backbone whose saved embeddings are loaded, clustered and evaluated below;
# it is also interpolated into the output file names.
# The extraction section writes pickles for 'senet50', 'resnet50' and 'vgg16'.
backbone = 'vgg16'

In [27]:
# Reload the features previously saved for the selected backbone.
embeddings_path = f'../data/leg55_{backbone}.pkl'
df_embeddings = pd.read_pickle(embeddings_path)

In [31]:
# Sanity check: (rows, columns) of the embeddings frame.
# NOTE(review): the recorded output carries execution count 31, i.e. it was produced after
# the filtering cell (In [30]) had already run, so it may reflect the filtered frame.
df_embeddings.shape

(8619, 2)

In [29]:
def _parent_folder(url):
    """Return the second-to-last '/'-separated component of `url`."""
    return url.split('/')[-2]


# Use the name of the folder directly containing each file as that row's label.
classes = df_embeddings['urls'].apply(_parent_folder)

In [30]:
# Boolean mask: True where the stored embedding is not the '-' placeholder
# (presumably the extractor's marker for a failed extraction — confirm in FeatureExtractor).
valid_indexes = df_embeddings.embeddings.apply(lambda x: str(x) != '-')

# Keep only the usable rows. `.copy()` makes the result an independent frame, so adding the
# `classes` column in the following cell does not raise SettingWithCopyWarning.
df_embeddings = df_embeddings.loc[valid_indexes].copy()

In [32]:
# Attach the label to every row; pandas aligns `classes` with the frame on the index.
df_embeddings['classes'] = classes

# Distinct labels, in order of first appearance.
distinct_labels = df_embeddings['classes'].unique()
unique_classes = list(distinct_labels)

# Draw five labels without replacement (repeatable through the `random.seed` call at the top).
samples_classes = random.sample(unique_classes, 5)


def _is_sampled(label):
    """Return True when `label` is one of the five sampled classes."""
    return label in samples_classes


# Boolean mask of the rows that belong to one of the sampled classes.
sample_indices = df_embeddings['classes'].apply(_is_sampled)

In [33]:
# Algorithms to run; judging by the recorded output, each one ends up as a
# `cluster_<name>` column in the result.
clustering_algs = ['kmeans', 'affinity', 'agglomerative']

# NOTE(review): 514 is hard-coded — presumably the expected number of identities; confirm.
clusterer = Clusterer(
    n_clusters=514,
    face_embeddings=df_embeddings,
    algs=clustering_algs,
)

In [34]:
# Cluster the embeddings with the Clusterer built above. Judging by later cells, `clusters`
# is a DataFrame with one `cluster_<alg>` column per algorithm, and `models_inst` maps those
# column names to the corresponding model objects.
clusters, models_inst = clusterer.clusterize()



In [35]:
# Preview the first rows. NOTE(review): in the recorded output `cluster_affinity` is -1 for
# every row shown — worth checking whether that algorithm produced usable labels.
clusters.head()

Unnamed: 0,urls,embeddings,classes,cluster_kmeans,cluster_affinity,cluster_agglomerative
0,/home/paulo/Documentos/slnp_faces_leg_55/74385...,"[2.238014, 0.0, 0.0, 0.0, 0.0, 0.11067976, 0.0...",74385_b,297,-1,68
1,/home/paulo/Documentos/slnp_faces_leg_55/74385...,"[0.21053398, 4.4255977, 5.7136946, 9.44324, 7....",74385_b,58,-1,295
4,/home/paulo/Documentos/slnp_faces_leg_55/74385...,"[3.8263426, 2.259651, 3.8618968, 0.8609967, 0....",74385_b,58,-1,295
5,/home/paulo/Documentos/slnp_faces_leg_55/74385...,"[0.0, 7.6271586, 9.8688345, 8.535545, 2.086572...",74385_b,58,-1,295
6,/home/paulo/Documentos/slnp_faces_leg_55/74385...,"[0.16503549, 6.7819276, 0.5020972, 12.082533, ...",74385_b,332,-1,501


In [36]:
# Save the cluster assignments for this backbone; the evaluation section reloads this file.
clusters_path = f'../data/leg55_clusters_{backbone}.pkl'
clusters.to_pickle(clusters_path)

In [44]:
# Peek at the first rows once the frame is ordered by file path.
clusters_by_url = clusters.sort_values(by='urls')
clusters_by_url.head()

Unnamed: 0,urls,embeddings,classes,cluster_kmeans,cluster_affinity,cluster_agglomerative
4263,/home/paulo/Documentos/slnp_faces_leg_55/10511...,"[3.763019, 1.1650515, 0.5744727, 6.1935024, 9....",105112_b,472,-1,226
4262,/home/paulo/Documentos/slnp_faces_leg_55/10511...,"[1.9997727, 1.535032, 0.131482, 5.673743, 0.11...",105112_b,472,-1,226
4270,/home/paulo/Documentos/slnp_faces_leg_55/10511...,"[5.559259, 3.0951445, 0.32168803, 8.773773, 0....",105112_b,472,-1,226
4268,/home/paulo/Documentos/slnp_faces_leg_55/10511...,"[3.4136424, 2.7559876, 1.0755774, 7.3527603, 7...",105112_b,268,-1,212
4267,/home/paulo/Documentos/slnp_faces_leg_55/10511...,"[7.1362076, 0.9659398, 1.0193014, 6.101046, 3....",105112_b,472,-1,226


In [45]:
# Cluster centres exposed by the model stored under the 'cluster_kmeans' key.
kmeans_model = models_inst['cluster_kmeans']
kmeans_model.cluster_centers_

array([[0.27213111, 0.15236522, 0.03527031, ..., 0.07843432, 0.07503472,
        0.08202707],
       [0.05594779, 0.1122032 , 0.30774101, ..., 0.03042112, 0.31828403,
        0.1284229 ],
       [0.20816352, 0.12197001, 0.11006268, ..., 0.05341525, 0.0620157 ,
        0.14995413],
       ...,
       [0.21349073, 0.16398336, 0.09649953, ..., 0.0080157 , 0.13887246,
        0.126038  ],
       [0.32118677, 0.18064047, 0.14630446, ..., 0.01459945, 0.35589471,
        0.17697434],
       [0.12045684, 0.19420404, 0.06344562, ..., 0.02348973, 0.27273956,
        0.3093393 ]])

In [46]:
# Non-null row count of every remaining column per k-means cluster, followed by summary
# statistics of those counts (i.e. how cluster sizes are distributed).
kmeans_cluster_sizes = clusters.groupby('cluster_kmeans').count()
kmeans_cluster_sizes.describe()

Unnamed: 0,urls,embeddings,classes,cluster_affinity,cluster_agglomerative
count,514.0,514.0,514.0,514.0,514.0
mean,16.768482,16.768482,16.768482,16.768482,16.768482
std,12.542283,12.542283,12.542283,12.542283,12.542283
min,2.0,2.0,2.0,2.0,2.0
25%,9.0,9.0,9.0,9.0,9.0
50%,14.0,14.0,14.0,14.0,14.0
75%,20.0,20.0,20.0,20.0,20.0
max,104.0,104.0,104.0,104.0,104.0


# Evaluating

In [47]:
# Start the evaluation from the assignments saved on disk (this rebinds `clusters`).
saved_clusters_path = f'../data/leg55_clusters_{backbone}.pkl'
clusters = pd.read_pickle(saved_clusters_path)

In [48]:
from sklearn import metrics

In [49]:
# Columns whose name begins with 'cluster'; each one is scored against the true labels below.
cluster_cols = list(
    filter(lambda name: str(name).startswith('cluster'), clusters.columns)
)

In [50]:
# Ground-truth labels as an array, in the same row order as the prediction columns.
labels_true = clusters['classes'].values

In [51]:
# Score every clustering against the ground-truth labels.
# `homogeneity_completeness_v_measure` returns all three scores from a single computation,
# instead of recomputing the same quantities once per metric.
# NOTE(review): in the recorded results `cluster_affinity` scores ~0 homogeneity and 1.0
# completeness, the pattern expected when every sample shares one label — check that
# algorithm's output before relying on its row.
dict_results = {}
for col in cluster_cols:
    labels_pred = clusters[col].values

    # Stored as [homogeneity, completeness, v-measure], the order the results table expects.
    dict_results[col] = list(
        metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
    )

In [52]:
# One row per clustering column, one column per metric (same order as the stored score lists).
metric_names = ['Homogeneity Score', 'Completeness Score', 'V-Measure']
df_results = pd.DataFrame.from_dict(
    dict_results,
    orient='index',
    columns=metric_names,
)
df_results

Unnamed: 0,Homogeneity Score,Completeness Score,V-Measure
cluster_kmeans,0.778515,0.79593,0.787126
cluster_affinity,-6.571624000000001e-17,1.0,-1.314325e-16
cluster_agglomerative,0.8361146,0.851264,0.8436214


In [53]:
# Create the output folder first: `to_csv` fails when the target directory does not exist.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

# Write the metrics table for the selected backbone.
df_results.to_csv(f'{results_dir}/{backbone}_results_removed.csv')