In [8]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import os
import json
import seaborn as sn
import matplotlib.pyplot as plt
import pandas as pd
import create_directory

# Load Data

In [9]:
# Directory the feature/label JSON files below are read from.
data_path = os.path.join(
    create_directory.marvel_data_dir,
    "distance_face_recognition/fixed_threshold",
)

In [10]:
def _read_json(directory, filename):
    """Parse and return the JSON document stored at ``directory/filename``."""
    with open(os.path.join(directory, filename), "r") as handle:
        return json.load(handle)


# label2idx (looked up by label name, e.g. "Unknown", elsewhere in this notebook)
label2idx = _read_json(create_directory.recognition_model_dir, 'label2idx.json')

# Big data
x_train_big = _read_json(data_path, "train_feature_big.json")
x_test_big = _read_json(data_path, "test_feature_big.json")
y_train_big = _read_json(data_path, "train_label_idx_big.json")
y_test_big = _read_json(data_path, "test_label_idx_big.json")

# Small data
x_train_small = _read_json(data_path, "train_feature_small.json")
x_test_small = _read_json(data_path, "test_feature_small.json")
y_train_small = _read_json(data_path, "train_label_idx_small.json")
y_test_small = _read_json(data_path, "test_label_idx_small.json")

# Face recognition

## Cài đặt hàm

In [11]:
# Face recognition functions
def nearest_recognition(feat, train_labels, train_feats, metric, threshold):
    """Label ``feat`` with the training sample that scores best inside the threshold.

    Parameters
    ----------
    feat : query feature vector (reshaped to a single row before scoring).
    train_labels, train_feats : parallel sequences of labels and feature vectors.
    metric : ``'euclidean_distance'`` (keep scores ``< threshold``, smallest wins)
        or ``'cosine_similarity'`` (keep scores ``> threshold``, largest wins).
    threshold : acceptance bound for the chosen metric.

    Returns
    -------
    The label of the best-scoring accepted sample (the earliest one on ties),
    or ``label2idx["Unknown"]`` when no sample is accepted. Any other ``metric``
    value accepts nothing and therefore also yields the Unknown label.
    """
    def as_row(values):
        return np.array(values).reshape(1, -1)

    matches = []  # (label, score) pairs that passed the threshold, in training order
    for label, vector in zip(train_labels, train_feats):
        if metric == 'euclidean_distance':
            score = euclidean_distances(as_row(feat), as_row(vector))[0][0]
            within = score < threshold
        elif metric == "cosine_similarity":
            score = cosine_similarity(as_row(feat), as_row(vector))[0][0]
            within = score > threshold
        else:
            continue
        if within:
            matches.append((label, score))

    if not matches:
        return label2idx["Unknown"]

    # A non-empty match list implies metric is one of the two supported names.
    # min/max return the first extreme element, so ties go to the earliest sample.
    pick = min if metric == 'euclidean_distance' else max
    return pick(matches, key=lambda match: match[1])[0]

def naive_neighbour_recognition(feat, train_labels, train_feats, metric, threshold):
    """Label ``feat`` by majority vote among training samples inside the threshold.

    Every training sample whose score passes the threshold
    (``< threshold`` for ``'euclidean_distance'``, ``> threshold`` for
    ``'cosine_similarity'``) casts one vote for its label.

    Returns
    -------
    * ``label2idx["Unknown"]`` when nothing passes the threshold,
    * the label with the most votes when that label is unique,
    * otherwise, among the labels tied on votes, the one with the best mean
      score (lowest mean distance / highest mean similarity).
    """
    def as_row(values):
        return np.array(values).reshape(1, -1)

    voted_labels = []
    voted_scores = []
    for label, vector in zip(train_labels, train_feats):
        if metric == "euclidean_distance":
            score = euclidean_distances(as_row(feat), as_row(vector))[0][0]
            within = score < threshold
        elif metric == "cosine_similarity":
            score = cosine_similarity(as_row(feat), as_row(vector))[0][0]
            within = score > threshold
        else:
            continue
        if within:
            voted_labels.append(label)
            voted_scores.append(score)

    if not voted_labels:
        return label2idx["Unknown"]

    # Labels sharing the highest vote count.
    votes = pd.Series(voted_labels).value_counts()
    leaders = votes.index[votes == votes.max()].tolist()
    if len(leaders) == 1:
        return max(voted_labels, key=voted_labels.count)

    # Tie on votes: compare the mean score of each tied label.
    scored = pd.DataFrame({'Label': voted_labels, 'Score': voted_scores})
    tied = scored[scored['Label'].isin(leaders)]
    mean_score = tied.groupby('Label').mean().to_dict()['Score']
    # Candidates exist, so metric is one of the two supported names here.
    pick = min if metric == 'euclidean_distance' else max
    return pick(mean_score, key=mean_score.get)

def weight_neighbour_recognition(feat, train_labels, train_feats, metric, threshold):
    """Label ``feat`` by a score-weighted vote among samples inside the threshold.

    Parameters
    ----------
    feat : query feature vector (reshaped to a single row before scoring).
    train_labels, train_feats : parallel sequences of labels and feature vectors.
    metric : ``'euclidean_distance'`` or ``'cosine_similarity'``.
    threshold : a sample is a candidate when its distance is ``< threshold``
        (Euclidean) or its similarity is ``> threshold`` (cosine).

    Returns
    -------
    * ``label2idx["Unknown"]`` when no sample is a candidate (this includes an
      unrecognised ``metric``, which never accepts a sample),
    * for Euclidean distance, the label with the largest sum of inverse
      distances over its candidates,
    * for cosine similarity, the label with the largest mean similarity over
      its candidates.
    """
    query = np.array(feat).reshape(1, -1)  # loop-invariant, build once
    candidate_label = []
    candidate_score = []
    for label, vector in zip(train_labels, train_feats):
        reference = np.array(vector).reshape(1, -1)
        if metric == 'euclidean_distance':
            score = euclidean_distances(query, reference)[0][0]
            accepted = score < threshold
        elif metric == 'cosine_similarity':
            score = cosine_similarity(query, reference)[0][0]
            accepted = score > threshold
        else:
            continue
        if accepted:
            candidate_label.append(label)
            candidate_score.append(score)

    if len(candidate_label) == 0:
        return label2idx["Unknown"]

    # The metric name must match the one used while scoring above; a mismatched
    # spelling here would skip the vote entirely and fall through to ``None``.
    if metric == 'euclidean_distance':
        min_distance = 1e-12  # keeps 1/distance finite for an exact match
        weight = {}
        for can, dis in zip(candidate_label, candidate_score):
            weight[can] = weight.get(can, 0) + 1 / max(dis, min_distance)
        return max(weight, key=weight.get)

    # metric == 'cosine_similarity' (the only other way to have candidates)
    total = {}
    count = {}
    for can, sim in zip(candidate_label, candidate_score):
        total[can] = total.get(can, 0) + sim
        # Start from zero so total/count is the true mean, not total/(n + 1).
        count[can] = count.get(can, 0) + 1
    mean = {label: total[label] / count[label] for label in total}
    return max(mean, key=mean.get)

In [12]:
# Lấy kết quả dự đoán ứng với các threshold tương ứng
threshold_list = np.arange(0.1, 1, 0.05)
def test_threshold(threshold_list, x_train, x_test, y_train, type, metric):
    recognition_result = {}
    for i in threshold_list:
        print("Threshold: "+str(i))
        if type=="nearest":
            recognition_result[i] = [nearest_recognition(j, y_train, x_train, metric, i) for j in x_test]
        if type=="naive_neighbour":
            recognition_result[i] = [naive_neighbour_recognition(j, y_train, x_train, metric, i) for j in x_test]
        if type=="weight_neighbour":
            recognition_result[i] = [weight_neighbour_recognition(j, y_train, x_train, metric, i) for j in x_test]
    return recognition_result

def evaluation(true, pred):
    """Count accept/reject outcomes for paired true and predicted labels.

    A prediction different from ``label2idx["Unknown"]`` is an *accept*,
    otherwise it is a *reject*.

    Returns
    -------
    tuple ``(fa, wa, fr, accept, reject)``:
        fa     - false accepts: accepted although the true label is Unknown,
        wa     - wrong answers: accepted, true label is known but differs
                 from the prediction,
        fr     - false rejects: rejected although the true label is known,
        accept - number of accepted samples,
        reject - number of rejected samples.
    """
    false_accept = wrong_answer = false_reject = 0
    n_accept = n_reject = 0

    for actual, predicted in zip(true, pred):
        unknown = label2idx["Unknown"]
        truly_known = actual != unknown
        if predicted != unknown:
            # The system claims the face is in the database
            n_accept += 1
            if not truly_known:
                # An Unknown face was recognised as someone in the database
                false_accept += 1
            elif actual != predicted:
                # A face from the database was recognised as the wrong person
                wrong_answer += 1
        else:
            n_reject += 1
            if truly_known:
                false_reject += 1
    # The goal is to keep fa and wa low
    return (false_accept, wrong_answer, false_reject, n_accept, n_reject)

In [13]:
# Plot confusion matrix
idx2label = {y: x for x, y in label2idx.items()}
def show_result(result, y_test, threshold):
    """Print summary metrics and plot the confusion matrix for one threshold.

    Parameters
    ----------
    result : dict mapping threshold -> list of predicted labels.
    y_test : true labels, in the same order as the predictions.
    threshold : threshold to display. An exact key match is tried first; failing
        that, the first key that is numerically close is used, so ``0.15`` finds
        a key such as ``0.15000000000000002`` produced by ``np.arange``.

    Raises
    ------
    KeyError
        If no key of ``result`` matches ``threshold``.
    """
    if threshold in result:
        key = threshold
    else:
        close_keys = [k for k in result if np.isclose(k, threshold)]
        if not close_keys:
            raise KeyError(threshold)
        key = close_keys[0]
    pred = result[key]

    fa, wa, fr, accept, reject = evaluation(y_test, pred)
    acc = accuracy_score(y_test, pred)
    print("Threshold = "+str(threshold))
    print("Accuracy: {}, Accept: {}, False Accept: {}, Wrong Recognition: {}, Reject: {}, False Reject: {}".format(acc, accept, fa, wa, reject, fr))
    print("")

    # Take the class list from the label map instead of assuming seven classes,
    # and pass it to confusion_matrix so the matrix keeps one row/column per
    # class even when a class is absent from both y_test and pred.
    class_ids = sorted(idx2label)
    class_names = [idx2label[i] for i in class_ids]
    array = confusion_matrix(y_test, pred, labels=class_ids)
    df_cm = pd.DataFrame(array, index=class_names, columns=class_names)
    # fmt="d": the cells are integer counts, show them as plain integers
    sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16}) # font size
    plt.title("Confusion matrix with threshold = "+str(threshold))
    plt.show()

# Save result
def save_result(result, y_test, save_path):
    """Write one CSV row of evaluation metrics per threshold.

    Parameters
    ----------
    result : dict mapping threshold -> list of predicted labels.
    y_test : true labels, in the same order as each prediction list.
    save_path : destination CSV file.

    The CSV columns are ``threshold, accept, fa, wa, reject, fr, accuracy``
    and no index column is written.
    """
    columns = ['threshold', 'accept', 'fa', 'wa', 'reject', 'fr', 'accuracy']
    rows = []
    for threshold, pred in result.items():
        fa, wa, fr, accept, reject = evaluation(y_test, pred)
        acc = accuracy_score(y_test, pred)
        rows.append((threshold, accept, fa, wa, reject, fr, acc))
    summary = pd.DataFrame(rows, columns=columns)
    summary.to_csv(save_path, index=False)

## Euclidean distance (e)

In [8]:
# Output directory for the Euclidean-distance threshold sweeps.
save_path = os.path.join(create_directory.result_dir, 'test_threshold/fixed_threshold/euclidean_distance')
# exist_ok=True keeps the cell re-runnable without a separate existence check
# (no gap between checking and creating).
os.makedirs(save_path, exist_ok=True)

### Big dataset

#### Nearest candidate

In [9]:
# Big dataset, nearest-candidate strategy, Euclidean distance: sweep all thresholds.
big_nearest_e_recognition_result = test_threshold(
    threshold_list,
    x_train_big,
    x_test_big,
    y_train_big,
    type='nearest',
    metric='euclidean_distance',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [10]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    big_nearest_e_recognition_result,
    y_test_big,
    os.path.join(save_path, "big_nearest_candidate_threshold_test.csv"),
)

#### Naive Neighbour candidates

In [11]:
# Big dataset, naive-neighbour (majority vote) strategy, Euclidean distance.
big_naive_neighbour_e_recognition_result = test_threshold(
    threshold_list,
    x_train_big,
    x_test_big,
    y_train_big,
    type='naive_neighbour',
    metric='euclidean_distance',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [12]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    big_naive_neighbour_e_recognition_result,
    y_test_big,
    os.path.join(save_path, "big_naive_neighbour_candidate_threshold_test.csv"),
)

#### Weight Neighbour candidates

In [13]:
# Big dataset, weighted-neighbour strategy, Euclidean distance.
big_weight_e_neighbour_recognition_result = test_threshold(
    threshold_list,
    x_train_big,
    x_test_big,
    y_train_big,
    type='weight_neighbour',
    metric='euclidean_distance',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [14]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    big_weight_e_neighbour_recognition_result,
    y_test_big,
    os.path.join(save_path, "big_weight_neighbour_candidate_threshold_test.csv"),
)

### Small dataset

#### Nearest candidate

In [15]:
# Small dataset, nearest-candidate strategy, Euclidean distance.
small_nearest_e_recognition_result = test_threshold(
    threshold_list,
    x_train_small,
    x_test_small,
    y_train_small,
    type='nearest',
    metric='euclidean_distance',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [16]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    small_nearest_e_recognition_result,
    y_test_small,
    os.path.join(save_path, "small_nearest_candidate_threshold_test.csv"),
)

#### Naive Neighbour candidates

In [17]:
# Small dataset, naive-neighbour (majority vote) strategy, Euclidean distance.
small_naive_neighbour_e_recognition_result = test_threshold(
    threshold_list,
    x_train_small,
    x_test_small,
    y_train_small,
    type='naive_neighbour',
    metric='euclidean_distance',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [18]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    small_naive_neighbour_e_recognition_result,
    y_test_small,
    os.path.join(save_path, "small_naive_neighbour_candidate_threshold_test.csv"),
)

#### Weight Neighbour candidates

In [19]:
# Small dataset, weighted-neighbour strategy, Euclidean distance.
small_weight_e_neighbour_recognition_result = test_threshold(
    threshold_list,
    x_train_small,
    x_test_small,
    y_train_small,
    type='weight_neighbour',
    metric='euclidean_distance',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [20]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    small_weight_e_neighbour_recognition_result,
    y_test_small,
    os.path.join(save_path, "small_weight_neighbour_candidate_threshold_test.csv"),
)

## Cosine similarity (c)

In [14]:
# Output directory for the cosine-similarity threshold sweeps.
# NOTE: this rebinds save_path, so every save_result cell below writes here.
save_path = os.path.join(create_directory.result_dir, 'test_threshold/fixed_threshold/cosine_similarity')
# exist_ok=True keeps the cell re-runnable without a separate existence check
# (no gap between checking and creating).
os.makedirs(save_path, exist_ok=True)

### Big Dataset

#### Nearest candidate

In [22]:
# Big dataset, nearest-candidate strategy, cosine similarity.
big_nearest_c_recognition_result = test_threshold(
    threshold_list,
    x_train_big,
    x_test_big,
    y_train_big,
    type='nearest',
    metric='cosine_similarity',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [23]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    big_nearest_c_recognition_result,
    y_test_big,
    os.path.join(save_path, "big_nearest_candidate_threshold_test.csv"),
)

#### Naive neighbour candidate

In [24]:
# Big dataset, naive-neighbour (majority vote) strategy, cosine similarity.
big_naive_neighbour_c_recognition_result = test_threshold(
    threshold_list,
    x_train_big,
    x_test_big,
    y_train_big,
    type='naive_neighbour',
    metric='cosine_similarity',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [25]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    big_naive_neighbour_c_recognition_result,
    y_test_big,
    os.path.join(save_path, "big_naive_neighbour_candidate_threshold_test.csv"),
)

#### Weight Neighbour candidates

In [15]:
# Big dataset, weighted-neighbour strategy, cosine similarity.
big_weight_c_neighbour_recognition_result = test_threshold(
    threshold_list,
    x_train_big,
    x_test_big,
    y_train_big,
    type='weight_neighbour',
    metric='cosine_similarity',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [16]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    big_weight_c_neighbour_recognition_result,
    y_test_big,
    os.path.join(save_path, "big_weight_neighbour_candidate_threshold_test.csv"),
)

### Small dataset

#### Nearest candidate

In [26]:
# Small dataset, nearest-candidate strategy, cosine similarity.
small_nearest_c_recognition_result = test_threshold(
    threshold_list,
    x_train_small,
    x_test_small,
    y_train_small,
    type='nearest',
    metric='cosine_similarity',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [27]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    small_nearest_c_recognition_result,
    y_test_small,
    os.path.join(save_path, "small_nearest_candidate_threshold_test.csv"),
)

#### Naive neighbour candidate

In [28]:
# Small dataset, naive-neighbour (majority vote) strategy, cosine similarity.
small_naive_neighbour_c_recognition_result = test_threshold(
    threshold_list,
    x_train_small,
    x_test_small,
    y_train_small,
    type='naive_neighbour',
    metric='cosine_similarity',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [29]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    small_naive_neighbour_c_recognition_result,
    y_test_small,
    os.path.join(save_path, "small_naive_neighbour_candidate_threshold_test.csv"),
)

#### Weight Neighbour Candidate

In [17]:
# Small dataset, weighted-neighbour strategy, cosine similarity.
small_weight_neighbour_c_recognition_result = test_threshold(
    threshold_list,
    x_train_small,
    x_test_small,
    y_train_small,
    type='weight_neighbour',
    metric='cosine_similarity',
)

Threshold: 0.1
Threshold: 0.15000000000000002
Threshold: 0.20000000000000004
Threshold: 0.25000000000000006
Threshold: 0.30000000000000004
Threshold: 0.3500000000000001
Threshold: 0.40000000000000013
Threshold: 0.45000000000000007
Threshold: 0.5000000000000001
Threshold: 0.5500000000000002
Threshold: 0.6000000000000002
Threshold: 0.6500000000000001
Threshold: 0.7000000000000002
Threshold: 0.7500000000000002
Threshold: 0.8000000000000002
Threshold: 0.8500000000000002
Threshold: 0.9000000000000002
Threshold: 0.9500000000000003


In [18]:
# Write the per-threshold metrics of the sweep above to CSV.
save_result(
    small_weight_neighbour_c_recognition_result,
    y_test_small,
    os.path.join(save_path, "small_weight_neighbour_candidate_threshold_test.csv"),
)