In [1]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import scipy.io
import numpy as np
import os
import shutil
from tqdm import tqdm

In [2]:
# Stanford Cars dataset: http://ai.stanford.edu/~jkrause/cars/car_dataset.html

In [3]:
def load_dataset(path):
    """Read the ``annotations`` table of a ``.mat`` file into an object array.

    Parameters
    ----------
    path : str
        Location of the ``.mat`` file; it must contain an ``annotations``
        entry whose records have exactly six fields.

    Returns
    -------
    numpy.ndarray
        One row per annotation record, with the columns
        ``bbox_x1, bbox_y1, bbox_x2, bbox_y2, label, file_name``.
    """
    annotations = scipy.io.loadmat(path)['annotations'][0]
    rows = []
    for record in annotations:
        # Unpacking doubles as a check that the record really has six fields.
        bbox_x1, bbox_y1, bbox_x2, bbox_y2, label, file_name = record
        fields = np.array([bbox_x1, bbox_y1, bbox_x2, bbox_y2, label, file_name], dtype='object')
        # Collapse the per-record array to a flat 1-D row.
        rows.append(fields.reshape(-1))
    return np.asarray(rows)

def load_labels(path):
    """Build the ``{class_index: label}`` mapping from a ``.mat`` meta file.

    Class indices are 1-based, so numbering starts at 1. Every ``/`` in a
    class name is replaced by ``_`` so the label can be used as a directory
    name.

    Parameters
    ----------
    path : str
        Location of the ``.mat`` file holding a ``class_names`` entry.

    Returns
    -------
    dict
        Maps the 1-based class index to its sanitized label.
    """
    raw_names = scipy.io.loadmat(path)['class_names'][0, :]
    label_map = {}
    for class_idx, entry in enumerate(raw_names, start=1):
        # entry[0] is the class name itself.
        label_map[class_idx] = entry[0].replace('/', '_')
    return label_map

def lookup_class_idx_by_file_name(dataset, file_name):
    """Return the class index of the first row whose file name matches.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array with the class index in column 4 and the file name in
        column 5.
    file_name : str
        File name to search for in column 5.

    Returns
    -------
    object
        The value stored in column 4 of the first matching row.

    Raises
    ------
    IndexError
        If no row carries ``file_name``. The exception type is the one the
        bare indexing raised before; only the message is now explicit.
    """
    matches = np.flatnonzero(dataset[:, 5] == file_name)
    if matches.size == 0:
        # Typical cause: a stray non-image file in the image directory.
        raise IndexError(f"no annotation found for file name {file_name!r}")
    return dataset[matches[0], 4]

In [7]:
# Dataset root and the two folders holding the raw images.
global_path = '../data/StanfordCars'
training_imgs_path = f'{global_path}/cars_train'
testing_imgs_path = f'{global_path}/cars_test'

# Annotation tables for both splits and the class-index -> label mapping.
annotations_path = f'{global_path}/annotations'
training_dataset = load_dataset(f'{annotations_path}/cars_train_annos.mat')
testing_dataset = load_dataset(f'{annotations_path}/cars_test_annos_withlabels.mat')
label_map = load_labels(f'{annotations_path}/cars_meta.mat')

In [11]:
# Location of the restructured copy of the dataset: it gets one sub-directory
# per class inside 'train' and 'val', for loading through a PyTorch dataset class.
folder_name = 'pytorch_structured_dataset'
folder_path = os.path.join(global_path, folder_name)
def mkdir_if_not_exists(dirpath):
    """Create the directory ``dirpath`` unless it already exists.

    Missing parent directories are created as well, and an existing directory
    is left untouched, so the call is safe to repeat.

    Parameters
    ----------
    dirpath : str
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``dirpath`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a manual os.path.exists test.
    os.makedirs(dirpath, exist_ok=True)

# Lay out <folder_path>/<split>/<class label>/ for both splits.
mkdir_if_not_exists(folder_path)
for subfolder in ('train', 'val'):
    subfolder_path = os.path.join(folder_path, subfolder)
    mkdir_if_not_exists(subfolder_path)
    # One directory per class label inside the split.
    for label in label_map.values():
        class_dir = os.path.join(subfolder_path, label)
        mkdir_if_not_exists(class_dir)

In [12]:
# Copy every training image into the directory of its own class
for img_path in tqdm(os.listdir(training_imgs_path)):
    # Resolve the class label through the annotation row of this file name
    img_class_idx = lookup_class_idx_by_file_name(training_dataset, img_path)
    img_class_label = label_map[img_class_idx]

    src = os.path.join(training_imgs_path, img_path)
    # Use the label looked up for *this* image. The global `label` left over
    # from the directory-creation loop would put every file into the folder of
    # the last class.
    dst = os.path.join(global_path, folder_name, 'train', img_class_label, img_path)
    shutil.copyfile(src, dst)

100%|██████████| 8144/8144 [00:12<00:00, 646.95it/s]


In [None]:
# Copy every test image into the 'val' directory of its own class
for img_path in tqdm(os.listdir(testing_imgs_path)):
    # Resolve the class label through the annotation row of this file name
    img_class_idx = lookup_class_idx_by_file_name(testing_dataset, img_path)
    img_class_label = label_map[img_class_idx]

    src = os.path.join(testing_imgs_path, img_path)
    # Use the label looked up for *this* image. The global `label` left over
    # from the directory-creation loop would put every file into the folder of
    # the last class.
    dst = os.path.join(global_path, folder_name, 'val', img_class_label, img_path)
    shutil.copyfile(src, dst)

 78%|███████▊  | 6271/8041 [00:09<00:03, 588.27it/s]

In [None]:
# Distribution of the classes: number of training annotations per class index
unique, counts = np.unique(training_dataset[:, 4], return_counts=True)
fig, ax = plt.subplots(figsize=(10, 7), dpi=90)
# Place each bar at its real class index (column 4 values) rather than at a
# 0-based position, so the x-axis can be read as the class index directly.
ax.bar(unique.astype(int), counts)
ax.set_xlabel('Class index')
ax.set_ylabel('Number of training samples')
ax.set_title('Training samples per class')
plt.show()