In [16]:
import os
import sys
import pandas as pd
import argparse
from scipy.io import loadmat
import numpy as np
from sklearn.model_selection import train_test_split
import shutil

sys.path.append('..')
from cooked_datasets.templates.stanfordCars import final_classes, open_ai_classes, templates

In [41]:
# Dataset identifier; it is also the last component of the output folder
data_name = 'StanfordCars'
# Input: location of the raw dataset files
# NOTE(review): 'orignal' must match the directory name on disk - confirm before renaming
data_dir = '../orignal_datasets/StanfordCars'
# Output: root under which the train/val/test class folders are written
save_dir = f'../cooked_datasets/data/{data_name}'

In [42]:
# Read the class metadata from devkit/cars_meta.mat
meta_path = os.path.join(data_dir, 'devkit', 'cars_meta.mat')
class_names = loadmat(meta_path)['class_names'][0]
# Normalise every class name: spaces become underscores and slashes become
# hyphens (the names are later used as directory names)
classes = [entry[0].replace(' ', '_').replace('/', '-') for entry in class_names]

In [43]:
# Read the labelled test annotations; this .mat file is looked up directly
# under data_dir rather than in the devkit folder
test_annos_path = os.path.join(data_dir, 'cars_test_annos_withlabels.mat')
mapping = loadmat(test_annos_path)
# Stack the annotation records into a DataFrame (one row per image)
df = pd.DataFrame(np.hstack(mapping['annotations']))
df.head()

Unnamed: 0,bbox_x1,bbox_y1,bbox_x2,bbox_y2,class,fname
0,[[30]],[[52]],[[246]],[[147]],[[181]],[00001.jpg]
1,[[100]],[[19]],[[576]],[[203]],[[103]],[00002.jpg]
2,[[51]],[[105]],[[968]],[[659]],[[145]],[00003.jpg]
3,[[67]],[[84]],[[581]],[[407]],[[187]],[00004.jpg]
4,[[140]],[[151]],[[593]],[[339]],[[185]],[00005.jpg]


In [44]:
# Every cell of the annotation frame is a nested array, so unwrap it to a
# plain Python value.
# Test image file names
X_test = [name[0] for name in df['fname'].to_numpy()]
# Test class ids as ints
y_test = [int(label[0][0]) for label in df['class'].to_numpy()]

In [45]:
# Read the training annotations from devkit/cars_train_annos.mat
train_annos_path = os.path.join(data_dir, 'devkit', 'cars_train_annos.mat')
mapping = loadmat(train_annos_path)
# Stack the annotation records into a DataFrame (one row per image);
# note that this rebinds df, which previously held the test annotations
df = pd.DataFrame(np.hstack(mapping['annotations']))
df.head()

Unnamed: 0,bbox_x1,bbox_y1,bbox_x2,bbox_y2,class,fname
0,[[39]],[[116]],[[569]],[[375]],[[14]],[00001.jpg]
1,[[36]],[[116]],[[868]],[[587]],[[3]],[00002.jpg]
2,[[85]],[[109]],[[601]],[[381]],[[91]],[00003.jpg]
3,[[621]],[[393]],[[1484]],[[1096]],[[134]],[00004.jpg]
4,[[14]],[[36]],[[133]],[[99]],[[106]],[00005.jpg]


In [46]:
# Every cell of the annotation frame is a nested array, so unwrap it to a
# plain Python value.
# Training image file names
all_train_files = [name[0] for name in df['fname'].to_numpy()]
# Training class ids as ints
all_train_labels = [int(label[0][0]) for label in df['class'].to_numpy()]

In [47]:
# Hold out 20% of the labelled training images as a validation set; the fixed
# random_state keeps the split reproducible between runs
X_train, X_val, y_train, y_val = train_test_split(
    all_train_files,
    all_train_labels,
    test_size=0.2,
    random_state=10,
)

In [50]:
# Reset the output tree: remove any previously generated split directories
# under save_dir, then recreate one empty sub-directory per class per split.
for typ in ['train', 'val', 'test']:
    # Clear stale output from an earlier run so files from an old split cannot
    # linger in the new one. This must target save_dir (the generated data);
    # the previous code pointed at data_dir, i.e. the original dataset tree,
    # and therefore never cleaned the output.
    # ignore_errors=True makes a missing directory (first run) a no-op.
    shutil.rmtree(os.path.join(save_dir, typ), ignore_errors=True)
    for cls in classes:
        # One folder per class inside each split; exist_ok tolerates a folder
        # that is still present (e.g. if the removal above failed silently).
        os.makedirs(os.path.join(save_dir, typ, cls), exist_ok=True)

In [54]:
# Copy every image into save_dir/<split>/<class name>/<file name>.
# Per split: (file names, labels, image folder inside data_dir). The
# validation files come out of the training split, so they are read from
# cars_train as well.
split_sources = {
    'train': (X_train, y_train, 'cars_train'),
    'val': (X_val, y_val, 'cars_train'),
    'test': (X_test, y_test, 'cars_test'),
}

for typ, (X, y, main_path) in split_sources.items():
    source_root = os.path.join(data_dir, main_path)
    target_root = os.path.join(save_dir, typ)
    for file, label in zip(X, y):
        original_path = os.path.join(source_root, file)
        # labels are shifted by one to index the 0-based classes list
        # (presumably 1-based MATLAB class ids - confirm against the devkit)
        new_path = os.path.join(target_root, classes[label - 1], file)
        shutil.copyfile(original_path, new_path)