# Unsupervised
Clustering of faces

## 0) Initial Setup

We start with loading the required packages.

In [2]:
# !pip install tensorflow
import numpy as np
import pandas as pd
import json
import os
from tqdm import tqdm

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import cluster

from PIL import Image, ImageFilter,ImageEnhance

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg19 import VGG19


import tensorflow_datasets as tfds
from tensorflow.keras.utils import image_dataset_from_directory

## 1) Set paths to data & models

In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
# load data
base = '.'
LOADPATH = base + '/data/'
SAVEPATH = base + '/models/'

# Converting labels to np array
cat = [ 'skin_tone','age','gender']
lbs = [LabelBinarizer() for i in range(len(cat))]
length = width = 224

In [130]:
# load labels data
TESTPATH = base + '/data/test/'
df_test = pd.read_csv(base+'/test_labels.csv')

# Convert labels to np array
print("Converting test labels to np array")
testY = []
for i in range(len(cat)):
    lab = lbs[i].fit_transform(df_test[cat[i]])
    if lab.shape[1]==1:
        testY.append(np.hstack((1-lab,lab)))
    else:
        testY.append(lab)
        
# load and convert images into np array
print("Loading test images")
nt = df_test.shape[0]
all_imgs = [image.load_img(TESTPATH+df_test.iloc[i]['name'], target_size=(length,width)) for i in range(nt)]

print("Converting test images to np array")
testX = np.empty([nt, length, width, 3], dtype=float)
for i in range(nt):
    testX[i,:] = image.img_to_array(all_imgs[i])
testX = K.applications.vgg19.preprocess_input(testX)

Converting test labels to np array
Loading test images
Converting test images to np array


## 2) Load Encoder

In [14]:
# function to initialize a VGG19
def prepModel():
    v19_model = VGG19(include_top=False, weights='imagenet', input_tensor=K.Input(shape=[length,width,3]))

    for layer in v19_model.layers:
        layer.trainable = False
    model = K.models.Sequential()
    model.add(v19_model)
    model.add(K.layers.Flatten())
    return model

prep_model = prepModel()

# 3) Unsupervised clustering

In [15]:
# Encode Data
X = prep_model.predict(X)



In [17]:
X.shape

(8488, 25088)

In [23]:
spectral = cluster.SpectralClustering(
        n_clusters=10,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
    ).fit(X)

In [25]:
pd.DataFrame(spectral.labels_).value_counts()

1    3731
2    1845
8    1326
6     414
3     367
9     271
7     254
0     225
5      42
4      13
dtype: int64

In [30]:
df_test['cluster'] = spectral.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
pd.crosstab(df_test.gender, df_test.cluster)

cluster,0,1,2,3,4,5,6,7,8,9
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
female,36,723,216,38,2,11,13,21,347,91
male,45,369,340,70,0,4,101,49,24,2


In [40]:
pd.crosstab(df_test.skin_tone, df_test.cluster)

cluster,0,1,2,3,4,5,6,7,8,9
skin_tone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
monk_1,5,59,18,4,0,0,8,16,29,6
monk_10,1,17,13,2,0,0,8,0,0,0
monk_2,10,126,75,17,0,1,19,17,68,21
monk_3,12,215,88,16,0,3,13,13,88,20
monk_4,15,198,92,18,1,3,13,9,95,23
monk_5,15,178,75,20,1,4,15,8,45,15
monk_6,8,107,51,15,0,2,8,2,26,4
monk_7,9,97,58,11,0,0,12,3,13,2
monk_8,4,49,53,5,0,0,11,1,6,2
monk_9,2,46,33,0,0,2,7,1,1,0


In [39]:
pd.crosstab(df_test.age, df_test.cluster)

cluster,0,1,2,3,4,5,6,7,8,9
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0_17,3,242,105,13,1,3,30,70,69,7
18_30,45,566,189,36,1,7,47,0,244,79
31_60,31,256,214,49,0,5,34,0,56,7
61_100,2,28,48,10,0,0,3,0,2,0


In [42]:
df_labeled[['name','skin_tone', 'gender', 'age', 'cluster']].to_csv('clusters.csv')