In [2]:
import json
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import create_directory

# Load dữ liệu

In [14]:
# Read the label -> index mapping stored as label2idx.json in the
# recognition model directory.
label2idx_path = os.path.join(create_directory.recognition_model_dir, 'label2idx.json')
with open(label2idx_path) as label_file:
    label2idx = json.load(label_file)

def create_label_feature(feature_path, label2idx, ignore_unknown, feature_dim=128):
    """Load every saved feature vector under ``feature_path``, grouped by label.

    ``feature_path`` must contain one sub-directory per label; each file in a
    sub-directory is read with ``np.load`` and flattened to ``feature_dim``
    values. Entries named ".DS_Store" are skipped at both levels.

    Args:
        feature_path: Directory whose entries are the label sub-directories.
        label2idx: Mapping from a label (sub-directory name) to its index.
        ignore_unknown: When truthy, the "Unknown" sub-directory is skipped.
        feature_dim: Number of values each loaded array is reshaped to.

    Returns:
        A tuple ``(label_idx, feature, label_feat_dict)``:
        ``label_idx[k]`` is the index of the label of ``feature[k]``,
        ``feature`` is a list of ``feature_dim``-long lists, and
        ``label_feat_dict`` maps each visited label name to the list of its
        feature vectors (an empty list for an empty sub-directory).

    Raises:
        KeyError: A label sub-directory with at least one file is not a key
            of ``label2idx``.
        ValueError: A loaded array cannot be reshaped to ``feature_dim``.
    """
    label_idx = []
    feature = []
    label_feat_dict = {}
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and everything derived from it) reproducible.
    for label in sorted(os.listdir(feature_path)):
        if label == ".DS_Store":
            continue
        # Skip all data labelled "Unknown" when requested.
        if ignore_unknown and label == "Unknown":
            continue
        label_dir = os.path.join(feature_path, label)
        label_vectors = []
        label_feat_dict[label] = label_vectors
        for file_name in sorted(os.listdir(label_dir)):
            if file_name == ".DS_Store":
                continue
            vector = np.load(os.path.join(label_dir, file_name)).reshape(feature_dim).tolist()
            feature.append(vector)
            # Store an independent copy so the two containers never alias.
            label_vectors.append(list(vector))
            label_idx.append(label2idx[label])
    return label_idx, feature, label_feat_dict

In [15]:
# Locations of the saved "hog_openface" features for the Train and Test folders.
train_feature_path = os.path.join(create_directory.marvel_data_dir, "feature/Train/hog_openface")
test_feature_path = os.path.join(create_directory.marvel_data_dir, "feature/Test/hog_openface")

# The train set leaves out the "Unknown" label; the test set keeps it.
train_label_idx, train_feature, train_label_feat_dict = create_label_feature(
    feature_path=train_feature_path,
    label2idx=label2idx,
    ignore_unknown=True,
)
test_label_idx, test_feature, test_label_feat_dict = create_label_feature(
    feature_path=test_feature_path,
    label2idx=label2idx,
    ignore_unknown=False,
)

In [16]:
# Save the database used by the recognition step in face_recognition.py:
# the per-label training feature vectors, written as JSON.
data_path = os.path.join(create_directory.marvel_data_dir, "distance_face_recognition/fixed_threshold")
# exist_ok=True creates the directory only when needed, without a separate
# existence check that could race with another process.
os.makedirs(data_path, exist_ok=True)
with open(os.path.join(data_path, "fixed_database.json"), "w") as outfile:
    json.dump(train_label_feat_dict, outfile)

# Phân chia dữ liệu

## Fixed Threshold

### Big dataset

In [17]:
# Big dataset: uses both the train data and the test data
# (train data is the reference set, test data is the evaluation set).
x_train_big, y_train_big = train_feature, train_label_idx
x_test_big, y_test_big = test_feature, test_label_idx

In [18]:
# Dump each part of the big dataset to its own JSON file under data_path.
big_outputs = (
    ("train_feature_big.json", x_train_big),
    ("test_feature_big.json", x_test_big),
    ("train_label_idx_big.json", y_train_big),
    ("test_label_idx_big.json", y_test_big),
)
for big_file_name, big_payload in big_outputs:
    with open(os.path.join(data_path, big_file_name), "w") as outfile:
        json.dump(big_payload, outfile)

### Small dataset

In [19]:
# Small dataset: uses the train data only
# (the train data is divided into two smaller, label-stratified sets).
small_split = train_test_split(
    train_feature,
    train_label_idx,
    test_size=0.33,
    random_state=4,
    stratify=train_label_idx,
)
x_train_small, x_test_small, y_train_small, y_test_small = small_split

In [20]:
# Randomly keep about 1/3 of the samples labelled "Unknown" and add them to
# the small test split.
# NOTE: the `+=` lines at the end extend x_test_small / y_test_small in place,
# so re-run the train_test_split cell before re-running this one; otherwise
# the Unknown samples are appended twice.
unknown_feature_path = os.path.join(create_directory.marvel_data_dir, "feature/Train/hog_openface/Unknown")
# A dedicated seeded generator plus a sorted listing makes the selection
# reproducible; os.listdir order is arbitrary and the global `random` state
# depends on what ran earlier in the kernel.
unknown_rng = random.Random(4)
unknown_label_idx = []
unknown_feature = []
for j in sorted(os.listdir(unknown_feature_path)):
    # Filter first so a stray .DS_Store does not consume a random draw.
    if j == ".DS_Store":
        continue
    if unknown_rng.random() > 0.33:
        continue
    feat = np.load(os.path.join(unknown_feature_path, j))
    unknown_feature.append(feat.reshape(128).tolist())
    unknown_label_idx.append(label2idx["Unknown"])
x_test_small += unknown_feature
y_test_small += unknown_label_idx

In [21]:
# Dump each part of the small dataset to its own JSON file under data_path.
small_outputs = (
    ("train_feature_small.json", x_train_small),
    ("test_feature_small.json", x_test_small),
    ("train_label_idx_small.json", y_train_small),
    ("test_label_idx_small.json", y_test_small),
)
for small_file_name, small_payload in small_outputs:
    with open(os.path.join(data_path, small_file_name), "w") as outfile:
        json.dump(small_payload, outfile)

## Adaptive Threshold

In [10]:
# One row per training sample: its feature vector, its label index, a starting
# threshold of 2, and its row position stored as an explicit "idx" column.
data = {
    'feature': train_feature,
    'label': train_label_idx,
    'threshold': [2 for _ in train_label_idx],
}
df = pd.DataFrame(data)
df["idx"] = df.index
df.head()

Unnamed: 0,feature,label,threshold,idx
0,"[-0.02072661928832531, 0.124546118080616, -0.1...",3,2,0
1,"[-0.01946512795984745, 0.13560181856155396, -0...",3,2,1
2,"[-0.05963575094938278, 0.09309344738721848, 0....",3,2,2
3,"[0.11037980765104294, 0.023455195128917694, -0...",3,2,3
4,"[-0.002607713919132948, 0.12767337262630463, -...",3,2,4


In [11]:
# NOTE: `data_path` is rebound here from the fixed-threshold directory to the
# adaptive-threshold one; cells above that write to `data_path` must not be
# re-run after this cell without first re-running the cell that defines it.
data_path = os.path.join(create_directory.marvel_data_dir, "distance_face_recognition/adaptive_threshold")
# exist_ok=True creates the directory only when needed, without a separate
# existence check that could race with another process.
os.makedirs(data_path, exist_ok=True)
# Table written with the threshold column as currently set on df.
df.to_csv(os.path.join(data_path, 'feature_euclidean.csv'), index=False)

In [3]:
# Reset every threshold to 0 (this modifies df in place), then write the
# table again under the cosine file name.
df['threshold'] = 0
cosine_csv_path = os.path.join(data_path, 'feature_cosine.csv')
df.to_csv(cosine_csv_path, index=False)

In [36]:
# Write `register_order_num` shuffled orderings of the row positions of df,
# one JSON file per ordering, into the "order" sub-directory of data_path.
order_data_path = os.path.join(data_path, 'order')
os.makedirs(order_data_path, exist_ok=True)
order = list(range(len(df)))
register_order_num = 5
# A dedicated seeded generator keeps the saved orders identical across runs;
# the global `random` state depends on what ran earlier in the kernel.
order_rng = random.Random(4)
for i in range(register_order_num):
    # Each pass reshuffles the same list in place, so order i is a shuffle
    # of order i-1.
    order_rng.shuffle(order)
    with open(os.path.join(order_data_path, "register_order_{}.json".format(i)), "w") as outfile:
        json.dump(order, outfile)