In [3]:
import os
import tarfile
from collections import defaultdict
from pathlib import Path

import pandas as pd
import scipy.io

## **CUB 200 2011**

In [2]:

# raw_data = Path.home() / 'fgvc' / 'data' / 'CUB_200_2011.tgz'
# tar = tarfile.open(raw_data)
# tar.extractall()
# tar.close()

WindowsPath('C:/Users/Mario')

In [9]:
# Read each CUB metadata text file. All four are parsed as space-separated
# with explicitly supplied column names.
cub_root = Path.cwd().parent / 'CUB_200_2011'
classes = pd.read_csv(cub_root / 'classes.txt', sep=' ', names=['Category', 'ClassName'])
labels = pd.read_csv(cub_root / 'image_class_labels.txt', sep=' ', names=['ImageID', 'Category'])
images = pd.read_csv(cub_root / 'images.txt', sep=' ', names=['ImageID', 'filename'])
splitting = pd.read_csv(cub_root / 'train_test_split.txt', sep=' ', names=['ImageID', 'is_training'])

# Join into one row per image. The join starts from `images` so that the
# `filename` column reaches the output; `classes` is merged exactly once so
# pandas does not emit suffixed duplicate columns (ClassName_x / ClassName_y).
data = images.merge(labels, on='ImageID')
data = data.merge(splitting, on='ImageID')
data = data.merge(classes, how='left', on='Category')
# Label is Category shifted down by one (presumably Category is 1-based — TODO confirm).
data['Label'] = data.Category - 1

# Split into train and test sets using the is_training flag.
train = data.loc[data['is_training'] == 1]
test = data.loc[data['is_training'] == 0]
print(train.shape, test.shape)

# Save both splits as CSV files (index column omitted).
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

## **FGVC Aircraft**

In [None]:
## Run this cell once to extract the tar archive

# raw_data = Path.home() / 'fgvc' / 'data' / 'fgvc-aircraft-2013b.tar.gz'
# tar = tarfile.open(raw_data)
# tar.extractall(path = path)
# tar.close()

In [None]:
# Load the variant list into a single-column frame.
Classes = pd.read_csv('fgvc-aircraft-2013b/data/variants.txt', names=['Classes'])
print(Classes.shape)

# Map every variant name to its row position in the file; this mapping is
# used as the integer label when the split files are parsed.
name = Classes['Classes'].values
names = {variant: position for position, variant in enumerate(name)}
names

In [None]:
# Every row of the split file is read as one raw string in `oneline`.
train = pd.read_csv('fgvc-aircraft-2013b/data/images_variant_train.txt', names=['oneline'])

# The first 7 characters become the image file stem and everything from
# index 8 onward is the variant name (index 7 is presumably the separating
# space — TODO confirm). Labels are looked up in the `names` mapping.
train = (
    train
    .assign(
        filename=train['oneline'].str[:7] + '.jpg',
        Classes=train['oneline'].str[8:],
    )
    .assign(Labels=lambda frame: frame['Classes'].map(names))
    .drop(columns='oneline')
)
train.head()

In [None]:
# Every row of the split file is read as one raw string in `oneline`.
val = pd.read_csv('fgvc-aircraft-2013b/data/images_variant_val.txt', names=['oneline'])

# The first 7 characters become the image file stem and everything from
# index 8 onward is the variant name (index 7 is presumably the separating
# space — TODO confirm). Labels are looked up in the `names` mapping.
val = (
    val
    .assign(
        filename=val['oneline'].str[:7] + '.jpg',
        Classes=val['oneline'].str[8:],
    )
    .assign(Labels=lambda frame: frame['Classes'].map(names))
    .drop(columns='oneline')
)
val.head()

In [None]:
# Every row of the split file is read as one raw string in `oneline`.
test = pd.read_csv('fgvc-aircraft-2013b/data/images_variant_test.txt', names=['oneline'])

# The first 7 characters become the image file stem and everything from
# index 8 onward is the variant name (index 7 is presumably the separating
# space — TODO confirm). Labels are looked up in the `names` mapping.
test = (
    test
    .assign(
        filename=test['oneline'].str[:7] + '.jpg',
        Classes=test['oneline'].str[8:],
    )
    .assign(Labels=lambda frame: frame['Classes'].map(names))
    .drop(columns='oneline')
)
test.head()

In [None]:
# Stack the train and val splits into one training frame, order the rows by
# label, and replace the leftover per-split index with a fresh 0..n-1 index.
trainset = (
    pd.concat([train, val])
    .sort_values(by=['Labels'])
    .reset_index(drop=True)
)

In [None]:
# Sanity check: (rows, columns) of the merged train+val frame and of the test frame.
print(trainset.shape, test.shape)

In [None]:
# Write the merged training frame and the test frame to CSV, omitting the index column.
# NOTE(review): 'test.csv' is also the relative output name used by the CUB and
# Stanford Cars sections of this notebook, so running them from the same working
# directory overwrites this file — confirm that is intended.
trainset.to_csv('train_v2.csv', index=False)
test.to_csv('test.csv', index=False)

## **Stanford Cars**

In [None]:
# Folder (relative to the working directory) holding the devkit files; the
# annotation-loading cell joins each file name onto this path.
path = 'devkit'
# Names of all entries in that folder; later filtered down to the .mat files.
dirs = os.listdir(path)
print(dirs)

In [None]:
# Load every MATLAB annotation file found in the devkit folder, keyed by its
# file name without the '.mat' extension (e.g. 'cars_meta.mat' -> 'cars_meta').
# A plain dict is used: the previous factory-less defaultdict() never supplied
# defaults, so a missing key raises KeyError either way.
MAT_EXT = '.mat'
annotation = {
    filename[:-len(MAT_EXT)]: scipy.io.loadmat(os.path.join(path, filename))
    for filename in dirs
    if filename.endswith(MAT_EXT)
}

In [None]:
def get_labels(cars_meta, train=True):
    """Build a label table for one split from the loaded annotation files.

    Reads the module-level ``annotation`` mapping (populated by the
    annotation-loading cell).

    Parameters
    ----------
    cars_meta : str
        Key into ``annotation`` for the entry that holds ``'class_names'``.
    train : bool, default True
        If True use ``annotation['cars_train_annos']``, otherwise
        ``annotation['cars_test_annos_withlabels']``.

    Returns
    -------
    pandas.DataFrame
        One row per annotation record with columns ``bbox_x1``, ``bbox_y1``,
        ``bbox_x2``, ``bbox_y2``, ``Category``, ``filename``, ``class_name``.
        ``class_name`` is NaN for any ``Category`` with no matching class.

    Raises
    ------
    KeyError
        If the required keys are missing from ``annotation``.
    """
    split_key = 'cars_train_annos' if train else 'cars_test_annos_withlabels'
    annotations = annotation[split_key]['annotations'][0]

    classes = annotation[cars_meta]['class_names'][0]
    # Class ids are numbered from 1. Enumerating with start=1 covers every
    # class; the previous range(1, len(classes)) stopped one short, so the
    # last class was missing and its rows received a NaN class name.
    class_names = {class_id: entry[0] for class_id, entry in enumerate(classes, start=1)}

    # In each record the first five fields are the bounding box and class id
    # (each unwrapped with [0][0]); the sixth field holds the image file name.
    rows = [
        [field[0][0] for field in list(record)[:5]] + [record[5][0]]
        for record in annotations
    ]

    labels_df = pd.DataFrame(
        rows,
        columns=['bbox_x1', 'bbox_y1', 'bbox_x2', 'bbox_y2', 'Category', 'filename'],
    )
    labels_df['class_name'] = labels_df['Category'].map(class_names)
    return labels_df

In [None]:
# Build the label table for each split, add a `Labels` column equal to
# Category minus one, and write each table to CSV without the index column.
# NOTE(review): 'train.csv' and 'test.csv' are also written by earlier
# sections of this notebook using the same relative names — confirm the
# working directory differs or that overwriting is intended.
train_df = get_labels('cars_meta', train=True).assign(
    Labels=lambda frame: frame['Category'] - 1
)
train_df.to_csv('train.csv', index=False)

test_df = get_labels('cars_meta', train=False).assign(
    Labels=lambda frame: frame['Category'] - 1
)
test_df.to_csv('test.csv', index=False)
# # Add missing class name! - 'smart fortwo Convertible 2012'
# train_df.loc[train_df['class_name'].isnull(), 'class_name'] = 'smart fortwo Convertible 2012'
# test_df.loc[test_df['class_name'].isnull(), 'class_name'] = 'smart fortwo Convertible 2012'

# frames = [train_df, test_df]
# labels_df = pd.concat(frames)
# labels_df.reset_index(inplace=True, drop=True)
# labels_df = labels_df[['filename', 'bbox_x1', 'bbox_y1','bbox_x2','bbox_y2',
#                             'class_id', 'class_name','is_test']]

# # adjust the test file names
# labels_df['filename'].loc[labels_df['is_test']==1] = 'test_' + labels_df['filename']

# # Add the cropped file names
# labels_df['filename_cropped'] = labels_df['filename'].copy()
# labels_df['filename_cropped'].loc[labels_df['is_test']==0] = 'cropped_' + labels_df['filename']

# labels_df.to_csv(path + 'labels_with_annos.csv')
# labels_df.head()

In [None]:
# Report the size of each split and preview its first rows.
# A notebook cell renders only its final bare expression, so the training
# preview has to go through display() explicitly or it is silently dropped.
print('training set has shape: ', train_df.shape)
display(train_df.head())

print('test set has shape: ', test_df.shape)
display(test_df.head())