In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from IPython.display import Image
import random

In [2]:
face_embeddings = pd.read_pickle('faces_embeddings.pkl')

## Quantity of Pictures Histogram

In [3]:
qtd_images = face_embeddings[["name", "file"]].groupby('name').count()

In [4]:
qtd_images = qtd_images.loc[qtd_images['file'] > 5]
qtd_images = qtd_images.loc[qtd_images['file'] <= 100]

In [5]:
qtd_images.head()

Unnamed: 0_level_0,file
name,Unnamed: 1_level_1
Abdullah_Gul,19
Adrien_Brody,12
Al_Gore,8
Al_Sharpton,7
Albert_Costa,6


In [6]:
qtd_images.plot.hist(alpha=0.3, density = True, figsize=(12,7), bins=30)

<matplotlib.axes._subplots.AxesSubplot at 0x7f07ebb89358>

## Selecting People

In [7]:
people = qtd_images.index.values
len(people)

305

In [8]:
random.shuffle(people)
people = people[:20]
people

array(['Bill_Frist', 'Vicente_Fox', 'Tang_Jiaxuan', 'Michael_Jackson',
       'Geoff_Hoon', 'Billy_Crystal', 'Jean_Chretien', 'Adrien_Brody',
       'Robert_Blake', 'Joan_Laporta', 'Wen_Jiabao', 'Jackie_Chan',
       'Binyamin_Ben-Eliezer', 'Paul_Bremer', 'Oscar_De_La_Hoya',
       'Ali_Naimi', 'Ben_Affleck', 'Javier_Solana', 'JK_Rowling',
       'Jeremy_Greenstock'], dtype=object)

In [9]:
face_embeddings = face_embeddings.loc[face_embeddings['name'].isin(people)]

## Features Separation

In [10]:
features = pd.DataFrame(face_embeddings['embeddings'].values.tolist(), index = face_embeddings.index)

In [11]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
386,0.686668,0.00406,0.0,0.0,0.0,0.512069,0.195294,2.246523,0.0,0.0,...,0.0,10.033471,0.0,0.716562,0.0,0.003659,0.200747,9.437379,0.155978,0.657483
387,0.0,0.0,0.426455,0.0,0.068948,0.739522,0.0,1.448166,0.0,0.271009,...,0.384982,10.480258,0.933719,0.019676,0.0,0.006259,0.20797,0.126601,2.732169,0.0
388,0.996497,0.0,0.0,0.0,0.0,1.986045,0.104063,0.373939,0.0,1.079655,...,0.0,1.463654,0.011812,0.018215,0.0,0.015416,0.831652,15.155327,2.519985,0.0
389,0.014096,0.0,0.010839,0.024752,0.0,0.005541,0.0,0.674804,0.352952,0.0,...,0.0,6.376276,1.437224,3.007647,0.224449,1.001146,0.0,7.96563,0.058068,0.029271
390,0.0,0.0,0.451854,0.0,0.0,0.0,0.0,0.073286,0.005137,0.169723,...,0.0,10.010357,5.72706,2.24719,0.0,0.542747,0.0,0.027258,0.0,0.0


## Scaling

In [12]:
scaler = MinMaxScaler()

features_scaled = scaler.fit_transform(features)

## Clustering

In [13]:
model = KMeans(n_clusters=len(people), verbose = 0)
#model = MeanShift(bandwidth = len(people))

In [14]:
cluster_labels = model.fit_predict(features_scaled)
#cluster_labels = kmeans.fit_predict(features)

In [15]:
features['Cluster'] = cluster_labels
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Cluster
386,0.686668,0.00406,0.0,0.0,0.0,0.512069,0.195294,2.246523,0.0,0.0,...,10.033471,0.0,0.716562,0.0,0.003659,0.200747,9.437379,0.155978,0.657483,9
387,0.0,0.0,0.426455,0.0,0.068948,0.739522,0.0,1.448166,0.0,0.271009,...,10.480258,0.933719,0.019676,0.0,0.006259,0.20797,0.126601,2.732169,0.0,9
388,0.996497,0.0,0.0,0.0,0.0,1.986045,0.104063,0.373939,0.0,1.079655,...,1.463654,0.011812,0.018215,0.0,0.015416,0.831652,15.155327,2.519985,0.0,9
389,0.014096,0.0,0.010839,0.024752,0.0,0.005541,0.0,0.674804,0.352952,0.0,...,6.376276,1.437224,3.007647,0.224449,1.001146,0.0,7.96563,0.058068,0.029271,9
390,0.0,0.0,0.451854,0.0,0.0,0.0,0.0,0.073286,0.005137,0.169723,...,10.010357,5.72706,2.24719,0.0,0.542747,0.0,0.027258,0.0,0.0,9


## Analyzing Clusters

In [16]:
features_scaled = pd.DataFrame(features_scaled, index = features.index)
features_scaled['Cluster'] = cluster_labels
features_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Cluster
386,0.042307,0.000825,0.0,0.0,0.0,0.041516,0.016866,0.244034,0.0,0.0,...,0.723241,0.0,0.051423,0.0,0.000212,0.020329,0.62271,0.019529,0.098353,9
387,0.0,0.0,0.044084,0.0,0.004544,0.059956,0.0,0.15731,0.0,0.021598,...,0.755446,0.119014,0.001412,0.0,0.000362,0.021061,0.008354,0.342074,0.0,9
388,0.061396,0.0,0.0,0.0,0.0,0.161017,0.008987,0.04062,0.0,0.086042,...,0.105504,0.001506,0.001307,0.0,0.000892,0.08422,1.0,0.315508,0.0,9
389,0.000868,0.0,0.00112,0.001786,0.0,0.000449,0.0,0.073302,0.039405,0.0,...,0.45962,0.183193,0.215839,0.013702,0.057903,0.0,0.525599,0.00727,0.004379,9
390,0.0,0.0,0.04671,0.0,0.0,0.0,0.0,0.007961,0.000574,0.013526,...,0.721575,0.729987,0.161266,0.0,0.031391,0.0,0.001799,0.0,0.0,9


In [17]:
features_names = features.copy()

In [18]:
features_names['name'] = face_embeddings['name']

In [19]:
people_clusters = features_names[['name', 'Cluster']].copy()

In [20]:
people_clusters['qtd'] = cluster_labels

In [21]:
qtd_clusters = people_clusters.groupby(['Cluster', 'name']).count()
qtd_clusters

Unnamed: 0_level_0,Unnamed: 1_level_0,qtd
Cluster,name,Unnamed: 2_level_1
0,Jean_Chretien,26
1,Paul_Bremer,1
1,Vicente_Fox,30
2,Jeremy_Greenstock,10
3,Paul_Bremer,19
4,Tang_Jiaxuan,11
4,Wen_Jiabao,13
5,Jackie_Chan,13
6,Adrien_Brody,12
7,Robert_Blake,7


In [22]:
qtd_clusters = people_clusters.groupby(['name','Cluster']).count()
qtd_clusters

Unnamed: 0_level_0,Unnamed: 1_level_0,qtd
name,Cluster,Unnamed: 2_level_1
Adrien_Brody,6,12
Ali_Naimi,11,8
Ben_Affleck,8,1
Ben_Affleck,12,6
Bill_Frist,9,9
Billy_Crystal,8,6
Binyamin_Ben-Eliezer,13,7
Geoff_Hoon,10,7
JK_Rowling,8,6
Jackie_Chan,5,13


In [23]:
features_scaled.groupby('Cluster', axis = 0).std().transpose().describe()

Cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0
mean,0.075077,0.085706,0.086213,0.089287,0.094913,0.094889,0.096565,0.087747,0.156417,0.103614,0.089763,0.081399,0.094182,0.073099,0.096576,0.083895,0.087451,0.096286,0.088654,0.095818
std,0.064176,0.069482,0.077258,0.076016,0.084965,0.081532,0.088814,0.089087,0.092165,0.093998,0.085016,0.083678,0.090157,0.07588,0.08771,0.08042,0.073838,0.075967,0.080646,0.093495
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.021479,0.026932,0.021096,0.023572,0.027874,0.027847,0.020433,0.018037,0.082636,0.023139,0.019811,0.013225,0.019759,0.011487,0.021116,0.017916,0.02619,0.031524,0.021484,0.017681
50%,0.060713,0.069986,0.066261,0.067528,0.070054,0.073781,0.072507,0.058515,0.145259,0.07666,0.064522,0.054271,0.066573,0.048108,0.073713,0.05953,0.069655,0.078449,0.066059,0.066511
75%,0.113952,0.132354,0.133951,0.140414,0.142686,0.142934,0.152884,0.130154,0.228097,0.1628,0.141038,0.125584,0.14246,0.112794,0.151535,0.127352,0.135968,0.150061,0.139399,0.152858
max,0.336748,0.292303,0.353269,0.337921,0.438624,0.377588,0.417217,0.441791,0.415921,0.397289,0.397573,0.394465,0.477907,0.377505,0.390714,0.431901,0.352626,0.322045,0.371827,0.432683
