In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import cv2
import sys
import random
# Put the parent folder on the import path (presumably so that the
# `faces_clustering` import in the next cell resolves - confirm). The guard
# keeps the cell idempotent: re-running it no longer stacks duplicate entries
# on sys.path.
if '../' not in sys.path:
    sys.path.append('../')

# Seed the stdlib RNG so that the `random.sample` call made later in the
# notebook is reproducible.
random.seed(42)

In [2]:
from faces_clustering import Clusterer, FeatureExtractor, is_image, get_files_folder

Using TensorFlow backend.


# Extracting Features

In [64]:
# Folder that holds the images to process. Later cells take the name of each
# file's parent folder as its class, so the images are expected to sit in one
# sub-folder per class.
FACES_ROOT = "/home/paulo/Documentos/slnp_faces_leg_55/"

# get_files_folder is a project helper; it is given the is_image predicate,
# presumably to keep only image files - confirm.
complete_urls = get_files_folder(FACES_ROOT, is_image)

In [None]:
# Run the project's FeatureExtractor with the 'senet50' backbone over every
# path in complete_urls and preview the first 20 rows of the result.
df_features_senet50 = FeatureExtractor('senet50').extract(complete_urls)
df_features_senet50.head(20)

In [None]:
# Cache the result on disk; the Clustering section reads these pickles back
# by backbone name instead of using the in-memory frames.
df_features_senet50.to_pickle('../data/leg55_senet50.pkl')

In [None]:
# Same extraction with the 'resnet50' backbone.
df_features_resnet50 = FeatureExtractor('resnet50').extract(complete_urls)
df_features_resnet50.head(20)

In [None]:
df_features_resnet50.to_pickle('../data/leg55_resnet50.pkl')

In [None]:
# Same extraction with the 'vgg16' backbone.
df_features_vgg16 = FeatureExtractor('vgg16').extract(complete_urls)
df_features_vgg16.head(20)

In [None]:
df_features_vgg16.to_pickle('../data/leg55_vgg16.pkl')

# Clustering

In [None]:
# Backbone to cluster and evaluate. It selects the feature pickle read below
# and is embedded in the names of the cluster and result files written later.
backbone = 'vgg16'

In [None]:
# Load the features cached for the chosen backbone (columns `urls` and
# `embeddings` are used by the following cells).
df_embeddings = pd.read_pickle(f'../data/leg55_{backbone}.pkl')

In [None]:
df_embeddings.shape

In [None]:
def _parent_folder(url):
    """Return the name of the folder that directly contains `url`."""
    return url.split('/')[-2]


# The class of every image is the name of the folder it is stored in; it is
# attached to the frame as the `classes` column in a later cell.
classes = df_embeddings.urls.map(_parent_folder)

In [None]:
# Keep only the rows that have an embedding: the string '-' is the placeholder
# stored when no embedding is available (the last section of the notebook
# inspects exactly those rows).
valid_indexes = df_embeddings.embeddings.apply(lambda x: str(x) != '-')

# .copy() makes the filtered frame an independent object, so adding the
# `classes` column in the next cell does not raise SettingWithCopyWarning.
df_embeddings = df_embeddings.loc[valid_indexes].copy()

In [None]:
# Attach the folder-derived label to the remaining rows (assignment aligns on
# the index, so rows dropped by the filter are simply ignored).
df_embeddings['classes'] = classes

# Draw 5 distinct classes with the seeded stdlib RNG and build a boolean mask
# that flags the rows belonging to them.
unique_classes = df_embeddings.classes.unique().tolist()
samples_classes = random.sample(unique_classes, 5)
sample_indices = df_embeddings.classes.isin(samples_classes)

In [None]:
# Clustering configuration.
# NOTE(review): 514 is presumably the number of distinct identities in the
# data set - confirm, or derive it from the `classes` column.
N_CLUSTERS = 514
CLUSTERING_ALGS = ['kmeans', 'affinity', 'agglomerative']

# Clusterer is a project class; the cells below read its results from columns
# such as `cluster_kmeans`.
clusterer = Clusterer(
    n_clusters=N_CLUSTERS,
    face_embeddings=df_embeddings,
    algs=CLUSTERING_ALGS,
)

In [None]:
# Run the configured algorithms. `clusters` is a DataFrame (it is pickled,
# sorted and grouped below) and `models_inst` is indexed by result-column
# name, e.g. 'cluster_kmeans', to reach the fitted model.
clusters, models_inst = clusterer.clusterize()

In [None]:
clusters.head()

In [None]:
# Persist the assignments; the Evaluating section reloads this file.
clusters.to_pickle(f'../data/leg55_clusters_{backbone}.pkl')

In [None]:
clusters.sort_values(by=['urls']).head()

In [None]:
# Cluster centres of the model stored under 'cluster_kmeans'.
models_inst['cluster_kmeans'].cluster_centers_

In [None]:
# Summary statistics of how many rows fall into each k-means cluster.
clusters.groupby(['cluster_kmeans']).count().describe()

# Evaluating

In [None]:
# Reload the saved assignments so this section can run without re-clustering.
clusters = pd.read_pickle(f'../data/leg55_clusters_{backbone}.pkl')

In [None]:
from sklearn import metrics

In [None]:
# Columns to evaluate: every column whose name starts with 'cluster'
# (presumably one per clustering algorithm - confirm against Clusterer).
cluster_cols = [c for c in clusters.columns if str(c).startswith('cluster')]

In [None]:
# Reference labels: the `classes` column of the saved frame.
labels_true = clusters.classes.values

In [None]:
# Score every clustering column against the reference labels. Each entry is
# the list [homogeneity, completeness, V-measure], in that order, which is the
# column order used when the results table is built.
dict_results = {}
for col in cluster_cols:
    labels_pred = clusters[col].values

    # A single call yields all three scores, so the shared contingency and
    # entropy computations run once per column instead of three times.
    dict_results[col] = list(
        metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
    )

In [None]:
# One row per evaluated clustering column, one column per metric; the names
# follow the order of the score lists stored in dict_results.
score_names = ['Homogeneity Score', 'Completeness Score', 'V-Measure']
df_results = pd.DataFrame.from_dict(
    dict_results,
    orient='index',
    columns=score_names,
)
df_results

In [None]:
# Write the score table for the current backbone.
# NOTE(review): the '_removed' suffix presumably marks the run in which rows
# without embeddings were dropped - confirm.
df_results.to_csv(f'../results/{backbone}_results_removed.csv')

# Checking Images that do not have embeddings

In [38]:
# Load the senet50 features stored under 'no_valid'.
# NOTE(review): presumably the extraction that still contains the '-'
# placeholder rows, which the next cell selects - confirm.
df = pd.read_pickle('../data/no_valid/leg55_senet50.pkl')

In [39]:
df.shape

(9119, 2)

In [40]:
# Paths from the current image set whose embedding is the '-' placeholder.
#
# The check is done per cell instead of `df.embeddings == '-'`: that form
# compares every non-string cell (presumably NumPy arrays) with a str, which
# emits NumPy's elementwise-comparison warning and may fail outright on newer
# NumPy versions.
has_no_embedding = df.embeddings.apply(
    lambda emb: isinstance(emb, str) and emb == '-'
)
urls_no_embs = df.loc[df.urls.isin(complete_urls) & has_no_embedding].urls

  result = libops.scalar_compare(x.ravel(), y, op)


In [8]:
def _no_embs_path(url):
    """Map an image path to '<parent folder>x<file name>' inside the no_embs folder."""
    parts = url.split('/')
    return f"/home/paulo/Documentos/no_embs/{parts[-2]}x{parts[-1]}"


# Flat destination path for every image that has no embedding.
new_urls = urls_no_embs.apply(_no_embs_path)

In [9]:
# senet50 extractor; the loop further down only uses its face detector
# (`extractor.detector`).
extractor = FeatureExtractor('senet50')

In [36]:
# Image files found in the no_embs folder (same project helper and predicate
# as for the main image set).
check_urls = get_files_folder("/home/paulo/Documentos/no_embs/", is_image)

In [43]:
import os

In [44]:
from tqdm import tqdm

# Minimum detector confidence for a detection to count as a face.
MIN_FACE_CONFIDENCE = 0.9


def _has_confident_face(pixels_rgb, detections, min_confidence=MIN_FACE_CONFIDENCE):
    """Return True if at least one detection is confident and yields a non-empty crop.

    `detections` is the list returned by the detector; every item is read
    through its 'confidence' and 'box' (x, y, width, height) keys.
    """
    for detection in detections:
        if detection['confidence'] < min_confidence:
            continue
        x1, y1, width, height = detection['box']
        x2, y2 = x1 + width, y1 + height
        # Clamp a negative origin to 0; a negative start index would make the
        # slice wrap around from the end of the array.
        face = pixels_rgb[max(y1, 0):y2, max(x1, 0):x2]
        if face.shape[0] > 0 and face.shape[1] > 0:
            return True
    return False


# WARNING: destructive cell. Every image listed in urls_no_embs in which the
# detector finds no confident face is DELETED from disk.
for url in tqdm(urls_no_embs.values):
    pixels = cv2.imread(url)
    if pixels is None:
        # cv2.imread signals a missing or unreadable file by returning None
        # rather than raising. This also happens when the cell is re-run after
        # files were already removed, so skip instead of crashing in cvtColor.
        continue

    pixels_rgb = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)
    results = extractor.detector.detect_faces(pixels_rgb)

    if not _has_confident_face(pixels_rgb, results):
        os.remove(url)

100%|██████████| 494/494 [00:59<00:00,  8.32it/s]


In [65]:
# Wrap the list of image paths in a Series so pandas operations can be used.
url_series = pd.Series(complete_urls)

In [66]:
# Class of every image = name of its parent folder.
# NOTE: this rebinds `classes`, which the Clustering section also defines.
classes = url_series.apply(lambda x: x.split('/')[-2])

In [67]:
# Classes represented by at most 5 images, with their image counts. The
# callable passed to .loc receives the counts Series, so value_counts() is
# evaluated only once.
classes.value_counts().loc[lambda counts: counts <= 5]

178851_p    5
178955_p    5
67312_b     5
72442_b     5
152610_p    5
141440_b    5
160510_p    4
171617_p    4
141335_b    4
178958_p    4
74655_b     3
178898_p    3
178943_p    3
73931_b     2
dtype: int64