In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from faces_clustering import Clusterer, is_image, get_files_folder

In [None]:
import random
# Seed the stdlib RNG so the later random.sample(...) draw is repeatable
# for a given input list.
random.seed(42)

In [None]:
# Render floats with five decimal places whenever pandas displays them.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [None]:
# Backbone names; they are interpolated into the pickle file names below.
backbones = ['resnet50','senet50','vgg16']

# Data Separation

In [None]:
# Embeddings of the first backbone; used to pick the held-out test rows and to
# derive the train index for every backbone.
# NOTE(review): this is read from valid/raw/, while the split loop reads
# valid/leg55_<backbone>.pkl — confirm both files share the same index.
df_embs = pd.read_pickle(f'../data/valid/raw/leg55_{backbones[0]}.pkl')

In [None]:
# Hold out one row per class: shuffle with a fixed seed, keep the first row of
# each class, then order the resulting index by class.
shuffled_embs = df_embs.sample(frac=1, random_state=42)
one_per_class = shuffled_embs.groupby('classes').head(1)
test_index = one_per_class.sort_values('classes').index

In [None]:
# Every backbone gets the same split: rows in test_index go to the test file,
# all other rows of df_embs' index go to the train file.
train_index = df_embs.index.difference(test_index)

for backbone in backbones:
    df_backbone = pd.read_pickle(f'../data/valid/leg55_{backbone}.pkl')

    df_test = df_backbone.loc[test_index].copy()
    df_train = df_backbone.loc[train_index].copy()

    df_train.to_pickle(f'../data/valid/train_leg55_{backbone}.pkl')
    df_test.to_pickle(f'../data/valid/test_leg55_{backbone}.pkl')

# Clustering Train

In [None]:
# Cluster the training embeddings of each backbone with three algorithms and
# keep the resulting frame per backbone.
all_clusters = {}
for backbone in backbones:
    train_path = f'../data/valid/train_leg55_{backbone}.pkl'
    df_train = pd.read_pickle(train_path)

    clusterer = Clusterer(
        n_clusters=513,
        face_embeddings=df_train,
        algs=['kmeans', 'affinity', 'agglomerative'],
    )
    # The second return value (model instances) is not used further in this cell.
    clusters, models_inst = clusterer.clusterize(normalize=False)

    all_clusters[backbone] = clusters

In [None]:
# Stack the per-backbone cluster frames; the backbone name becomes the outer
# index level, so dt_clusters.loc[<backbone>] returns that backbone's frame.
dt_clusters = pd.concat(all_clusters.values(), keys = all_clusters.keys())

In [None]:
dt_clusters.to_pickle('../data/valid/clusters_train_leg55.pkl')

In [None]:
dt_clusters = pd.read_pickle('../data/valid/clusters_train_leg55.pkl')

In [None]:
print(f"Train Size: {dt_clusters.loc['resnet50'].shape[0]}")

## Evaluating Train

In [None]:
from sklearn import metrics

In [None]:
# Score every clustering column of every backbone against the true classes.
# Result: one frame per backbone (rows = clustering columns, columns = the
# three scores), concatenated with the backbone as outer index level.
df_all = {}
for backbone in backbones:
    clusters = dt_clusters.loc[backbone]

    # Clustering outputs are the columns whose name starts with 'cluster'.
    cluster_cols = [c for c in clusters.columns if str(c).startswith('cluster')]
    labels_true = clusters.classes.values

    dict_results = {}
    for col in cluster_cols:
        labels_pred = clusters[col].values

        # A single call returns (homogeneity, completeness, v_measure), so the
        # shared intermediate statistics are computed once instead of three times.
        dict_results[col] = list(
            metrics.homogeneity_completeness_v_measure(labels_true, labels_pred))
    df_all[backbone] = pd.DataFrame.from_dict(dict_results, orient='index', columns=[
        'Homogeneity Score', 'Completeness Score', 'V-Measure'])

df_all_results = pd.concat(df_all.values(), keys=df_all.keys())
    

In [None]:
df_all_results

In [None]:
pd.read_csv('../results/valid/clusters.csv', index_col=[0,1])

# Test

In [None]:
from faces_clustering import FaceSearcher
backbone = backbones[1]

In [None]:
# Rebinds df_embs (until here the raw embeddings of backbones[0]) to the
# senet50 agglomerative frame; later cells read its 'cluster_agglomerative',
# 'd_cluster_agglomerative', 'classes' and 'urls' columns.
# NOTE(review): the path hardcodes 'senet50' instead of using `backbone`.
df_embs = pd.read_pickle('../data/valid/distances/senet50_agglomerative.pkl')

In [None]:
df_embs.shape

In [None]:
# Build the searcher on the clustered frame, telling it which column holds the
# cluster label and which holds the distance (presumably distance to the
# cluster centroid — confirm in FaceSearcher).
searcher = FaceSearcher(face_embs = df_embs, classes_col = 'cluster_agglomerative', distance_col = 'd_cluster_agglomerative')

In [None]:
# Held-out senet50 rows written by the data-separation loop.
# NOTE(review): the path hardcodes 'senet50' instead of using `backbone`.
df_test = pd.read_pickle('../data/valid/test_leg55_senet50.pkl')

In [None]:
# URLs of the test rows that belong to class '73674_b'.
df_test.loc[df_test['classes'] == '73674_b', 'urls'].values

In [None]:
print(f"Test size is {df_test.shape[0]}")

In [None]:
# Convert every test embedding to a plain Python list for the searcher.
embs_query = [embedding.tolist() for embedding in df_test['embeddings'].values]

In [None]:
# The first return value is discarded. `results` is later stored as a df_test
# column and each entry is passed to m1, so it presumably holds one
# Series-like object per query — confirm in FaceSearcher.closest_centroids.
_, results = searcher.closest_centroids(embs_query = embs_query)

In [None]:
def softmax(x):
    """Numerically stable softmax along axis 0.

    Subtracting the maximum before exponentiating leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (which made the naive form return NaN for large inputs, e.g. the inverse
    of a very small distance).

    Parameters
    ----------
    x : array-like or pandas Series
        Scores. A Series keeps its index in the result.

    Returns
    -------
    Same kind of object as ``np.exp(x)``, with entries summing to 1 along axis 0.
    """
    shifted = x - np.max(np.asarray(x), axis=0)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)

In [None]:
def m1(x, k):
    """Return the top entry of ``x`` by inverse value and its softmax weight.

    Takes the ``k`` largest values of ``1/x``, applies ``softmax`` to them and
    keeps the first (largest) one.

    Parameters
    ----------
    x : object supporting ``1/x`` and ``.nlargest``
        Presumably a pandas Series of distances indexed by cluster label
        (TODO confirm against ``FaceSearcher.closest_centroids``).
    k : int
        Number of largest inverse values that take part in the softmax.

    Returns
    -------
    tuple
        ``(index label, softmax weight)`` of the entry with the largest ``1/x``.
    """
    inverse_top_k = (1/x).nlargest(k)
    weights = softmax(inverse_top_k)
    best = weights.head(1)
    return (best.index.values[0], best.values[0])

In [None]:
# Top-1 softmax weight (k = 5) for every test query.
pcts = [m1(r,5)[1] for r in results]

In [None]:
# Fraction of test queries whose top-1 weight is at least 0.5.
np.mean(np.array(pcts)>=0.5)

In [None]:
# Mean top-1 weight over all test queries.
np.mean(np.array(pcts))

In [None]:
import matplotlib.pyplot as plt

In [None]:
_ = plt.hist(pcts, bins=50)

In [None]:
# Attach the per-query search results as a column (mutates df_test in place).
df_test['results'] = results

In [None]:
# Most frequent class inside each agglomerative cluster; missing classes are
# counted as a value of their own (dropna=False).
mode_clusters = df_embs.groupby('cluster_agglomerative')['classes'].apply(lambda x: x.value_counts(dropna=False).idxmax())

In [None]:
# Set of all classes that occur in each agglomerative cluster.
classes_by_cluster = df_embs.groupby('cluster_agglomerative')['classes'].apply(set)

In [None]:
# Sweep the neighbourhood size k (2..10) and record three rates over the test set.
any_k = []        # share of test rows assigned to some cluster
mode_k = []       # share whose cluster's most frequent class equals the true class
has_class_k = []  # share whose cluster contains the true class at all

for k in range(2,11):
    # Evaluate m1 once per row; the (label, weight) pair is reused below.
    top1 = df_test['results'].apply(lambda x: m1(x, k))
    # Keep the top cluster only when its weight reaches 0.5; -1 marks "unassigned".
    p_clusters = top1.apply(lambda t: t[0] if t[1] >= 0.5 else -1)
    any_k.append(np.mean(p_clusters != -1))

    # Labels absent from mode_clusters (assumed to include -1) map to NaN and
    # therefore never compare equal to a class.
    mode_k.append((df_test.classes == p_clusters.map(mode_clusters)).mean())

    # Rows whose label has no entry map to NaN instead of a set: count as misses.
    candidate_sets = p_clusters.map(classes_by_cluster)
    comp = [isinstance(classes, set) and myclass in classes
            for myclass, classes in zip(df_test.classes, candidate_sets)]
    has_class_k.append(np.mean(comp))

In [None]:
# One row per k (2..10), one column per rate collected in the k sweep.
df_results = pd.DataFrame(
    {
        'Any_Cluster': any_k,
        'Mode Equal to Class': mode_k,
        'Class is present in cluster': has_class_k,
    },
    index=range(2,11),
)
df_results

## Checking wrong classes

In [None]:
# Predicted cluster per test row for k = 2; -1 when the top-1 weight is below 0.5.
# m1 is evaluated once per row and its (label, weight) pair reused.
top1_k2 = df_test['results'].apply(lambda x: m1(x, 2))
p_clusters = top1_k2.apply(lambda t: t[0] if t[1] >= 0.5 else -1)

In [None]:
# Classes of the test rows whose class differs from the most frequent class of
# the predicted cluster. Rows whose predicted label has no entry in
# mode_clusters (presumably including the -1 "unassigned" marker) map to NaN
# and therefore land here as well.
classes_wrong = df_test.loc[df_test.classes != p_clusters.map(mode_clusters)].classes

In [None]:
# Classes of the test rows whose class equals the most frequent class of the
# predicted cluster.
classes_right = df_test.loc[df_test.classes == p_clusters.map(mode_clusters)].classes

In [None]:
# Number of rows with a non-null 'urls' value per class in df_embs.
classes_count = df_embs.groupby('classes')['urls'].count()

In [None]:
classes_count.loc[classes_count<=5].count()

In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms of rows-per-class: for the classes of wrong predictions,
# for all classes in df_embs, and for the classes of right predictions.
# density=False keeps raw counts on the y axis.
_ = plt.hist([classes_wrong.map(classes_count), classes_count, classes_right.map(classes_count)], 
    density=False, bins=20, alpha=0.5, label=['wrong', 'all', 'right'])
_ = plt.legend()

In [None]:
def fxs(x):
    """Map a count to the label of its bucket of width five.

    Buckets are '1-5', '6-10', '11-15', '16-20' and '21-25'. Any value that is
    not below 26 — including values above 30 and NaN, for which every ``<``
    comparison is false — gets the last label, '26-30'. Values below 1 get
    '1-5'.
    """
    upper_bounds = (
        (6, '1-5'),
        (11, '6-10'),
        (16, '11-15'),
        (21, '16-20'),
        (26, '21-25'),
    )
    for bound, label in upper_bounds:
        if x < bound:
            return label
    return '26-30'

In [None]:
# Count wrong and right test predictions per class-size bucket (bucket = fxs of
# the class's row count in df_embs), side by side, with rows ordered from
# '1-5' to '26-30'. Buckets with no occurrences become NaN after the reindex.
df_fxs_count = pd.concat([classes_wrong.map(classes_count).apply(fxs).value_counts(),
            classes_right.map(classes_count).apply(fxs).value_counts()], axis=1).reindex(
    ['1-5','6-10','11-15','16-20','21-25','26-30'])
df_fxs_count

In [None]:
# Name the two columns (mutates df_fxs_count in place; the order matches the
# concat order: wrong first, right second) and draw them as grouped bars.
df_fxs_count.columns = ['wrong','right']
df_fxs_count.plot.bar()

# LFW

In [None]:
from faces_clustering import FeatureExtractor

In [None]:
# Location of the LFW image folder. Set the LFW_DATASET_DIR environment variable
# to point at your own copy; when it is unset the original local path is used.
import os
path = os.environ.get('LFW_DATASET_DIR', '/home/paulo/Documentos/0_LFW_dataset')

In [None]:
# Files under `path` that satisfy the is_image criterion. Presumably a list of
# file paths (random.sample below needs a sequence) — confirm in get_files_folder.
lfw = get_files_folder(path, criteria=is_image)

In [None]:
# Draw 513 files without replacement (ValueError if fewer are available).
# The draw depends on the state of the stdlib RNG seeded earlier and on the
# order of `lfw`.
sample = random.sample(lfw, 513)

In [None]:
# Extract senet50 features for the sampled files. The result is used below as
# a DataFrame with an 'embeddings' column.
df_lfw = FeatureExtractor('senet50').extract(sample)
df_lfw.head(3)

In [None]:
# Convert every LFW embedding to a plain Python list for the searcher.
lfw_query = [embedding.tolist() for embedding in df_lfw['embeddings'].values]

In [None]:
# Query the searcher built on the senet50 agglomerative frame with the LFW
# embeddings; the first return value is discarded.
_, results_lfw = searcher.closest_centroids(embs_query = lfw_query)

In [None]:
# Attach the per-query search results as a column (mutates df_lfw in place).
df_lfw['results'] = results_lfw

In [None]:
# Share of LFW queries left unassigned (-1) for every k in 2..10.
# NOTE: this rebinds `lfw`, which earlier held the LFW file list, so the
# sampling cell cannot be re-run after this one without reloading the list.
lfw = []
for k in range(2,11):
    # Evaluate m1 once per row; the (label, weight) pair is reused below.
    top1 = df_lfw['results'].apply(lambda x: m1(x, k))
    # Keep the top cluster only when its weight reaches 0.5; -1 marks "unassigned".
    lfw_clusters = top1.apply(lambda t: t[0] if t[1] >= 0.5 else -1)
    unassigned = np.mean(lfw_clusters==-1)
    print(f"k: {k} {unassigned}")
    lfw.append(unassigned)

In [None]:
# Extends df_results in place: names the index and adds the LFW "unassigned"
# rate. `lfw` must hold one value per row of df_results (both cover k = 2..10).
# NOTE(review): the index is named 'alpha' although its values are the k of the sweep.
df_results.index.name='alpha'
df_results['lfw assigned to no cluster'] = lfw

In [None]:
# Display every value as a percentage string with four decimals. This returns a
# new frame; df_results itself keeps its numeric values.
df_results.apply(lambda x: x.apply(lambda y: f"{y*100:.4f} %"))