#### Folder Creation

In [2]:
import numpy as np
import pandas as pd
from os.path import join
import os
import shutil
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from facenet_pytorch import MTCNN
import torch

#### Small Training Data

In [3]:
# Takes 25 minutes
# Sort train_small data into subfolders according to class
data_dir = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle"
data_mat = pd.read_csv(join(data_dir, "train_small.csv"))

indices = data_mat['Unnamed: 0']
filenames = data_mat['File Name']
classes = data_mat['Category']

# print(data_mat)
# print(indices[0])
# print(filenames[0])
# print(classes[0])
# print()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

mtcnn = MTCNN(
    image_size=160, margin=0, min_face_size=20,
    thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
    device=device
)

# Get starting/ending picture directories
train_data_dir = join(data_dir, r"train_small\train_small")
jtc_data_dir = join(data_dir, r"train_small_jtc")
jtc_crop_data_dir = join(data_dir, r"train_small_jtc_cropped")

# Loop through all starting pictures
# len(data_mat)
for lcv in tqdm(range(10)):
    # Get source filepath
    src = join(train_data_dir, filenames[lcv])

    # Get destination filepath
    subfolder_name = classes[lcv].lower().replace(" ", "_")
    dst = join(jtc_data_dir, subfolder_name, filenames[lcv])

    # # Move file (make dir if necessary)
    # os.makedirs(os.path.dirname(dst), exist_ok=True)
    # shutil.copy2(src, dst)

    # Try to crop file
    curr_img = Image.open(src)
    crop_dst = join(jtc_crop_data_dir, subfolder_name, filenames[lcv])
    try:
        curr_img_cropped = mtcnn(curr_img, save_path=crop_dst)
    except:
        try:
            curr_img_cropped = mtcnn(curr_img.convert('RGB'), save_path=crop_dst)
        except:
            print("Still Failed ", filenames[lcv])
            pass

Running on device: cuda:0


100%|██████████| 10/10 [00:02<00:00,  3.72it/s]


In [6]:
print("mtcnn.select_largest: ", mtcnn.select_largest)
print("mtcnn.selection_method: ", mtcnn.selection_method)

mtcnn.select_largest:  True
mtcnn.selection_method:  largest


#### Large Training Data

In [14]:
# Takes 3.8 hours
# Sort train_small data into subfolders according to class
data_dir = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle"
data_mat = pd.read_csv(join(data_dir, "train.csv"))

indices = data_mat['Unnamed: 0']
filenames = data_mat['File Name']
classes = data_mat['Category']

# print(data_mat)
# print(indices[0])
# print(filenames[0])
# print(classes[0])
# print()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

mtcnn = MTCNN(
    image_size=160, margin=0, min_face_size=20,
    thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
    device=device
)

# Get starting/ending picture directories
train_data_dir = join(data_dir, r"train\train")
jtc_data_dir = join(data_dir, r"train_jtc")
jtc_crop_data_dir = join(data_dir, r"train_jtc_cropped2")

# Loop through all starting pictures
# len(data_mat)
for lcv in tqdm(range(len(data_mat))):
    # Get source filepath
    src = join(train_data_dir, filenames[lcv])

    # Get destination filepath
    subfolder_name = classes[lcv].lower().replace(" ", "_")
    dst = join(jtc_data_dir, subfolder_name, filenames[lcv])

    # # Move file (make dir if necessary)
    # os.makedirs(os.path.dirname(dst), exist_ok=True)
    # shutil.copy2(src, dst)

    # Try to crop file
    curr_img = Image.open(src)
    crop_dst = join(jtc_crop_data_dir, subfolder_name, filenames[lcv])
    try:
        curr_img_cropped = mtcnn(curr_img, save_path=crop_dst)
    except:
        try:
            curr_img_cropped = mtcnn(curr_img.convert('RGB'), save_path=crop_dst)
        except:
            print("Still Failed ", filenames[lcv])
            pass

Running on device: cuda:0


100%|██████████| 69540/69540 [4:05:43<00:00,  4.72it/s]   


#### Testing Data

In [8]:
for lcv in range(10):
    print(str(lcv) + ".jpg")

0.jpg
1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg


In [15]:
# Sort train_small data into subfolders according to class
data_dir = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle"

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

mtcnn = MTCNN(
    image_size=160, margin=0, min_face_size=20,
    thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
    device=device
)

# Get starting/ending picture directories
test_data_dir = join(data_dir, r"test\test")
jtc_data_dir = join(data_dir, r"test_jtc")
jtc_crop_data_dir = join(data_dir, r"test_jtc_cropped2")

# Loop through all starting pictures
# 4977
for lcv in tqdm(range(4977)):
    # Get source filepath
    curr_filename = str(lcv) + ".jpg"
    src = join(test_data_dir, curr_filename)

    # Try to crop file
    curr_img = Image.open(src)
    crop_dst = join(jtc_crop_data_dir, curr_filename)
    try:
        curr_img_cropped = mtcnn(curr_img, save_path=crop_dst)
    except:
        try:
            curr_img_cropped = mtcnn(curr_img.convert('RGB'), save_path=crop_dst)
        except:
            print("Still Failed ", filenames[lcv])
            pass

Running on device: cuda:0


100%|██████████| 4977/4977 [17:45<00:00,  4.67it/s]  


#### Missing Data

In [4]:
# Find missing test images
import pandas as pd
from tqdm import tqdm
import os.path
import csv
from os.path import join
import shutil

# Get lookup class csv
data_dir = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle"
my_classes_mat = pd.read_csv(join(data_dir, "category_lookup.csv"))
my_classes_mat
their_name_vec = my_classes_mat['their_name']

test_img_path = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle\test_jtc_cropped3"
original_test_path = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle\test\test"
dst_missing_path = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle\test_jtc_missing3"

missing_list = []
for lcv in range(4977):
    # Get source filepath
    curr_filename = str(lcv) + ".jpg"
    src = join(test_img_path, curr_filename)

    # Test to see if input file exists
    if not os.path.isfile(src):
        print("Missing: ", lcv)
        missing_list.append(lcv)

        # Move file (make dir if necessary)
        orig_src = join(original_test_path, curr_filename)
        dst = join(dst_missing_path, curr_filename)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copy2(orig_src, dst)

len(missing_list)


Missing:  133
Missing:  166
Missing:  210
Missing:  217
Missing:  282
Missing:  289
Missing:  317
Missing:  343
Missing:  430
Missing:  545
Missing:  585
Missing:  609
Missing:  701
Missing:  851
Missing:  927
Missing:  1004
Missing:  1141
Missing:  1154
Missing:  1351
Missing:  1362
Missing:  1480
Missing:  1499
Missing:  1658
Missing:  1734
Missing:  1749
Missing:  1826
Missing:  1868
Missing:  1922
Missing:  1936
Missing:  1984
Missing:  1997
Missing:  2008
Missing:  2388
Missing:  2411
Missing:  2431
Missing:  2523
Missing:  2545
Missing:  2547
Missing:  2581
Missing:  2583
Missing:  2777
Missing:  2780
Missing:  2785
Missing:  2923
Missing:  3010
Missing:  3088
Missing:  3148
Missing:  3209
Missing:  3322
Missing:  3469
Missing:  3558
Missing:  3568
Missing:  3731
Missing:  3744
Missing:  3752
Missing:  3819
Missing:  3842
Missing:  3905
Missing:  3973
Missing:  4237
Missing:  4337
Missing:  4350
Missing:  4390
Missing:  4610
Missing:  4757
Missing:  4780
Missing:  4888
Missing:  

68

#### MTCNN

In [13]:
from facenet_pytorch import MTCNN, InceptionResnetV1, fixed_image_standardization, training
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
from torch import optim
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
import numpy as np
import os
from os.path import join

In [21]:
# Sort train_small data into subfolders according to class
data_dir = r"C:\Users\jjuus\OneDrive - purdue.edu\ECE 50024\Kaggle"

# Get ending picture directories
jtc_data_dir = join(data_dir, r"train_small_jtc_test")

data_dir = jtc_data_dir

batch_size = 32
epochs = 8
workers = 0 if os.name == 'nt' else 8

In [22]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:0


In [23]:
mtcnn = MTCNN(
    image_size=160, margin=0, min_face_size=20,
    thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
    device=device
)

In [24]:
dataset = datasets.ImageFolder(data_dir, transform=transforms.Resize((512, 512)))
dataset.samples = [
    (p, p.replace(data_dir, data_dir + '_cropped'))
        for p, _ in dataset.samples
]
        
loader = DataLoader(
    dataset,
    num_workers=workers,
    batch_size=batch_size,
    collate_fn=training.collate_pil
)

for i, (x, y) in enumerate(loader):
    mtcnn(x, save_path=y)
    print('\rBatch {} of {}'.format(i + 1, len(loader)), end='')
    
# Remove mtcnn to reduce GPU memory usage
del mtcnn

Batch 1 of 3

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.