## Reading data

In [1]:
import pandas as pd

# Load the training face encodings into a DataFrame.
TRAIN_CSV = '../data/train-face-encodings.csv'
df = pd.read_csv(TRAIN_CSV)
# A separate test file can be loaded here instead of splitting the training data:
# tdf = pd.read_csv('../data/test-face-encodings.csv')

## Exploring Data

### Viewing the head

In [2]:
# Preview the first five rows (five is head()'s default).
df.head(n=5)

Unnamed: 0,Case#,name,fe1,fe2,fe3,fe4,fe5,fe6,fe7,fe8,...,fe119,fe120,fe121,fe122,fe123,fe124,fe125,fe126,fe127,fe128
0,1,Amandla Sternburg,-0.062336,0.052564,0.100087,-0.119492,-0.139727,-0.101675,-0.07795,-0.115687,...,0.077741,0.08712,0.007947,-0.013164,-0.114904,-0.004787,0.123941,-0.13365,0.034758,0.075893
1,2,Amandla Sternburg,-0.062336,0.052564,0.100087,-0.119492,-0.139727,-0.101675,-0.07795,-0.115687,...,0.077741,0.08712,0.007947,-0.013164,-0.114904,-0.004787,0.123941,-0.13365,0.034758,0.075893
2,3,Amandla Sternburg,-0.063223,-0.038476,0.101766,-0.079118,-0.164185,-0.092289,-0.065582,-0.06786,...,0.094934,0.092719,-0.031205,-0.032641,-0.135209,0.025838,0.150927,-0.089295,0.035543,0.064832
3,4,Amandla Sternburg,-0.09861,0.056209,0.136003,-0.118633,-0.17205,-0.055944,-0.084016,-0.072944,...,0.14301,0.093671,-0.071815,-0.059864,-0.094761,-0.037739,0.121683,-0.100972,0.065145,0.053635
4,5,Amandla Sternburg,-0.054156,0.044665,0.134833,-0.108199,-0.160031,-0.043255,-0.099803,-0.050705,...,0.128864,0.1246,-0.008052,-0.025404,-0.129545,-0.033177,0.140403,-0.112612,0.062427,0.071699


### Obtaining data shape

In [3]:
# Report the frame dimensions as (rows, columns).
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(123, 130)


### Determining number of classes

In [4]:
# Every distinct value in the "name" column is one class for the classifier.
class_names = df["name"].unique()
no_classes = len(class_names)
print(no_classes)

21


### Determining statistical description

In [5]:
# Summary statistics for the numeric columns.
# The quartiles are spelled out explicitly; they match describe()'s defaults.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Case#,fe1,fe2,fe3,fe4,fe5,fe6,fe7,fe8,fe9,...,fe119,fe120,fe121,fe122,fe123,fe124,fe125,fe126,fe127,fe128
count,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,...,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
mean,62.081301,-0.153911,0.093016,0.075728,-0.036742,-0.043479,-0.076555,0.028473,-0.101553,0.16838,...,0.025066,0.090602,-0.039843,-0.058998,-0.126953,-0.022616,0.086011,-0.068244,0.025991,0.03378
std,35.781805,0.044845,0.051656,0.039565,0.048144,0.044359,0.044226,0.044705,0.054191,0.034994,...,0.040881,0.040628,0.047745,0.042062,0.041655,0.041885,0.037275,0.044576,0.03707,0.028331
min,1.0,-0.242679,-0.038476,-0.009747,-0.149475,-0.17205,-0.17229,-0.099803,-0.208459,0.074353,...,-0.080591,0.001783,-0.156417,-0.16518,-0.237033,-0.142511,-0.014813,-0.200636,-0.061938,-0.056826
25%,31.5,-0.185064,0.056718,0.045038,-0.070938,-0.069855,-0.10666,0.005892,-0.139013,0.147295,...,-0.000925,0.058287,-0.074143,-0.086362,-0.153125,-0.049034,0.065787,-0.091746,-0.003708,0.013888
50%,62.0,-0.159923,0.097806,0.071572,-0.042677,-0.040682,-0.077778,0.033414,-0.10984,0.168897,...,0.023494,0.086206,-0.045257,-0.057532,-0.125941,-0.017751,0.085361,-0.066152,0.029479,0.035502
75%,92.5,-0.123469,0.12924,0.103062,0.003893,-0.020213,-0.04684,0.0589,-0.064138,0.193672,...,0.046364,0.121337,-0.006431,-0.029462,-0.097335,0.00361,0.108677,-0.037128,0.048893,0.053609
max,124.0,-0.054156,0.208019,0.178643,0.066089,0.048678,0.059532,0.119882,0.048881,0.261558,...,0.14301,0.167401,0.088005,0.049966,-0.036952,0.087532,0.175817,0.02923,0.114733,0.106698


## Splitting dataset into train and test

In [6]:
from sklearn.model_selection import train_test_split

# Split input and output columns: the 'name' and 'Case#' columns are removed
# from the features, and 'name' is the prediction target.
X = df.drop(columns=['name', 'Case#'])
y = df["name"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Alternative: evaluate on the separate test file instead of a held-out split.
# X_test = tdf.drop(columns=['name', 'Case#'])
# y_test = tdf.name

# Show the shape of each partition.
for label, part in (
    ("X_train: ", X_train),
    ("y_train: ", y_train),
    ("X_test: ", X_test),
    ("y_test: ", y_test),
):
    print(label, part.shape)


X_train:  (92, 128)
y_train:  (92,)
X_test:  (31, 128)
y_test:  (31,)


## Training model

In [7]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC  # Support Vector Machine

# Fit on the training split ONLY. Fitting on the full X / y would expose every
# row of X_test to the model during training, so any accuracy measured on
# X_test afterwards would be inflated by data leakage instead of reflecting how
# the model generalises to unseen samples.
#
# OneVsRestClassifier trains one binary SVC per class. probability=True enables
# predict_proba; random_state pins the random shuffling SVC performs for its
# probability estimates so repeated runs give the same model.
#
# NOTE: if a model trained on all available data is wanted for deployment,
# refit on X / y in a separate step AFTER the evaluation has been done.
clf = OneVsRestClassifier(SVC(probability=True, random_state=0)).fit(X_train, y_train)

## Saving model externally

In [8]:
import pickle

# Destination of the serialized classifier.
filepath = "../models/classifier1.sav"

# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises part-way through.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file back from a trusted location.
with open(filepath, 'wb') as model_file:
    pickle.dump(clf, model_file)

## Testing model

In [9]:
# Predict a label for every row of the held-out test features and print the
# resulting array of predicted names.
y_pred = clf.predict(X_test)
print(y_pred)

['Phoebe' 'Usher Raymond' 'Ben Wycliff Mugalu' 'Majo' 'Ann'
 'Juliana Kanyomozi' 'Denzel Washington' 'Ann' 'Liz' 'Priscilla Walaga'
 'Denzel Washington' 'Will Smith' 'Eric' 'Lupita Nyongo'
 'Amandla Sternburg' 'Priscilla Walaga' 'Priscilla Walaga'
 'Denzel Washington' 'Will Smith' 'Edwin Paul Suubi' 'Noreen'
 'Usher Raymond' 'Kevin Hart' 'Ben Wycliff Mugalu' 'Taylor Russell'
 'Amandla Sternburg' 'Ben Wycliff Mugalu' 'Noreen' 'Majo'
 'Denzel Washington' 'Kevin Hart']


## Determining accuracy

In [10]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

1.0


In [11]:
import numpy as np

# Cross-check the accuracy by hand: the mean of the element-wise matches
# between predictions and true labels.
matches = y_pred == y_test
print(np.mean(matches))

# Print the predictions followed by the true labels for a visual comparison.
print(y_pred, list(y_test))

1.0
['Phoebe' 'Usher Raymond' 'Ben Wycliff Mugalu' 'Majo' 'Ann'
 'Juliana Kanyomozi' 'Denzel Washington' 'Ann' 'Liz' 'Priscilla Walaga'
 'Denzel Washington' 'Will Smith' 'Eric' 'Lupita Nyongo'
 'Amandla Sternburg' 'Priscilla Walaga' 'Priscilla Walaga'
 'Denzel Washington' 'Will Smith' 'Edwin Paul Suubi' 'Noreen'
 'Usher Raymond' 'Kevin Hart' 'Ben Wycliff Mugalu' 'Taylor Russell'
 'Amandla Sternburg' 'Ben Wycliff Mugalu' 'Noreen' 'Majo'
 'Denzel Washington' 'Kevin Hart'] ['Phoebe', 'Usher Raymond', 'Ben Wycliff Mugalu', 'Majo', 'Ann', 'Juliana Kanyomozi', 'Denzel Washington', 'Ann', 'Liz', 'Priscilla Walaga', 'Denzel Washington', 'Will Smith', 'Eric', 'Lupita Nyongo', 'Amandla Sternburg', 'Priscilla Walaga', 'Priscilla Walaga', 'Denzel Washington', 'Will Smith', 'Edwin Paul Suubi', 'Noreen', 'Usher Raymond', 'Kevin Hart', 'Ben Wycliff Mugalu', 'Taylor Russell', 'Amandla Sternburg', 'Ben Wycliff Mugalu', 'Noreen', 'Majo', 'Denzel Washington', 'Kevin Hart']


In [12]:
# Print the held-out labels together with the row labels they carry over from df.
print(y_test)

85                 Phoebe
98          Usher Raymond
10     Ben Wycliff Mugalu
71                   Majo
8                     Ann
45      Juliana Kanyomozi
22      Denzel Washington
7                     Ann
61                    Liz
91       Priscilla Walaga
114     Denzel Washington
110            Will Smith
33                   Eric
66          Lupita Nyongo
2       Amandla Sternburg
90       Priscilla Walaga
89       Priscilla Walaga
24      Denzel Washington
122            Will Smith
30       Edwin Paul Suubi
78                 Noreen
100         Usher Raymond
60             Kevin Hart
16     Ben Wycliff Mugalu
97         Taylor Russell
113     Amandla Sternburg
13     Ben Wycliff Mugalu
76                 Noreen
73                   Majo
26      Denzel Washington
56             Kevin Hart
Name: name, dtype: object


In [13]:
# print(df.name)

In [14]:
# print(df.groupby("name").mean())