In [9]:
import os
import random
import shutil
import tarfile
import zipfile
import numpy as np
import pandas as pd

In [10]:
# Absolute, machine-specific archive locations (Windows drive E:).
# NOTE(review): no other cell visible in this notebook reads these two names;
# the relative paths configured further down are the ones actually used - confirm
# before relying on them.
path_to_data = r'E:\Data\DalalProject\extract_zips\camera00.tar'
path_to_annotations = r'E:\Data\DalalProject\extract_zips\annotations.tar'

In [11]:

# Inputs (relative to the notebook's working directory):
# the zip holding the annotation text files and the folder holding the camera tars.
path_to_annotations_file = 'BelgiumTSD_annotations.zip'
path_to_camera_tars = 'camera_zips'

# Outputs: where archives are unpacked and where the generated CSV files go.
path_to_extract_data = 'extracted_data'
path_to_parsed_dataset = 'parsed_dataset'

In [12]:
# Fraction of each superclass's images that goes to each split.
# The splitting code in this notebook slices train and validation by these
# fractions and gives the remainder to test, so test_perc documents the intent
# rather than being read directly.
train_perc = 0.8
validation_perc = 0.1
test_perc = 0.1
assert abs(train_perc + validation_perc + test_perc - 1.0) < 1e-9, \
    'train/validation/test fractions must sum to 1'

# Superclass ids of interest: 1 through 11 inclusive.
classes_ofi = list(range(1, 12))

# The split relies on random.shuffle; seed the RNG so that
# "Restart Kernel -> Run All" reproduces the same train/validation/test split.
random_seed = 42
random.seed(random_seed)

In [13]:
# Create the output folders up front. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of `if not exists: makedirs`;
# it also fails immediately (FileExistsError) if one of the paths is an
# existing regular file instead of silently continuing.
for output_dir in (path_to_extract_data, path_to_parsed_dataset):
    os.makedirs(output_dir, exist_ok=True)


In [14]:
def extract_tar_file(path_to_tar_file, path_to_dst):
    """Extract a tar archive into ``path_to_dst``.

    The destination directory is created if it does not exist. A file that
    ``tarfile.is_tarfile`` does not recognise as a tar archive is skipped
    silently (the destination directory is still created), so the function
    can be called on every file of a folder.

    Args:
        path_to_tar_file: Path of the archive to extract.
        path_to_dst: Directory that receives the archive's contents.

    Raises:
        OSError: If ``path_to_tar_file`` cannot be opened (e.g. it does not
            exist) or the destination cannot be created.
    """
    # exist_ok avoids the race between an explicit existence check and makedirs.
    os.makedirs(path_to_dst, exist_ok=True)

    if not tarfile.is_tarfile(path_to_tar_file):
        return

    with tarfile.open(path_to_tar_file) as f:
        if hasattr(tarfile, 'data_filter'):
            # Extraction filters exist on this Python: use the 'data' filter so
            # members cannot write outside path_to_dst (absolute paths, '..',
            # escaping links) and no DeprecationWarning is raised.
            f.extractall(path=path_to_dst, filter='data')
        else:
            f.extractall(path=path_to_dst)

def extract_zip_file(path_to_zip_file, path_to_dst):
    """Extract every member of a zip archive into ``path_to_dst``.

    The destination directory is created if it does not exist.

    Args:
        path_to_zip_file: Path of the zip archive to extract.
        path_to_dst: Directory that receives the archive's contents.

    Raises:
        OSError: If the archive cannot be opened or the destination cannot be
            created.
        zipfile.BadZipFile: If ``path_to_zip_file`` is not a valid zip archive.
    """
    # exist_ok avoids the race between an explicit existence check and makedirs.
    os.makedirs(path_to_dst, exist_ok=True)

    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(path_to_dst)

# Unpack the annotations zip into <path_to_extract_data>/annotations.
extract_zip_file(path_to_annotations_file, os.path.join(path_to_extract_data, 'annotations'))


In [15]:
# Both ground-truth files sit in the same folder of the extracted annotations
# archive; only the file name differs.
path_to_train_anot_file, path_to_test_anot_file = (
    os.path.join(path_to_extract_data, 'annotations', 'BelgiumTSD_annotations', gt_file_name)
    for gt_file_name in ('BTSD_training_GT.txt', 'BTSD_testing_GT.txt')
)

In [16]:
# Names of the non-directory entries directly inside the camera archive folder.
# os.scandir reads just that one level (os.walk would traverse the whole tree
# only to keep its first result) and raises a clear FileNotFoundError when the
# folder is missing. Sorting makes the extraction order deterministic.
with os.scandir(path_to_camera_tars) as camera_dir_entries:
    camera_tars = sorted(entry.name for entry in camera_dir_entries if not entry.is_dir())

In [17]:
# Every camera archive is unpacked into the same <path_to_extract_data>/data folder.
path_to_camera_data = os.path.join(path_to_extract_data, 'data')

for tar in camera_tars:
    print('extractings ', tar, '..')
    path_to_camera_tar = os.path.join(path_to_camera_tars, tar)
    extract_tar_file(path_to_camera_tar, path_to_camera_data)
    

extractings  camera00.tar ..
extractings  camera01.tar ..
extractings  camera02.tar ..
extractings  camera03.tar ..
extractings  camera04.tar ..
extractings  camera05.tar ..
extractings  camera06.tar ..
extractings  camera07.tar ..


In [35]:
def extract_data_from_annotation_file(path_to_annotation_file, classes=None,
                                      train_fraction=None, validation_fraction=None):
    """Parse an annotation file and split its images into train/validation/test.

    Each usable line consists of exactly 13 ``;``-separated fields; any other
    line is skipped. Field 0 is the image path, fields 1-4 are the box
    coordinates, field 5 is the class id, field 6 is the superclass id and the
    second-to-last field is the label.

    For every superclass of interest, the images that contain it are shuffled
    with ``random.shuffle`` and sliced into train / validation / test (test
    receives the remainder). An image is assigned to exactly one split: the
    first one that claims it. Images whose superclasses are all outside
    ``classes`` end up in no split but are still present in the returned
    annotation dict.

    Args:
        path_to_annotation_file: Path of the annotation text file.
        classes: Iterable of superclass ids to split on. Defaults to the
            notebook-level ``classes_ofi``.
        train_fraction: Fraction of each superclass's images used for
            training. Defaults to the notebook-level ``train_perc``.
        validation_fraction: Fraction used for validation. Defaults to the
            notebook-level ``validation_perc``.

    Returns:
        Tuple ``(train_files, validation_files, test_files, data_dict)``.
        The first three are lists of image paths without duplicates.
        ``data_dict`` maps ``'file_names'``, ``'coordinates'``,
        ``'superclasses'``, ``'classes'`` and ``'labels'`` to parallel lists
        holding one entry per annotation line.

    Raises:
        OSError: If the annotation file cannot be opened.
        ValueError: If a 13-field line holds non-numeric coordinates or ids.
    """
    # Fall back to the notebook configuration when no override is given.
    if classes is None:
        classes = classes_ofi
    if train_fraction is None:
        train_fraction = train_perc
    if validation_fraction is None:
        validation_fraction = validation_perc

    data_dict = {'file_names': [],
                 'coordinates': [],
                 'superclasses': [],
                 'classes': [],
                 'labels': []}

    # image path -> set of superclass ids annotated in that image.
    # The dict keeps images in first-seen order.
    superclasses_per_file = {}

    with open(path_to_annotation_file, 'r') as f:
        for raw_line in f:
            parts = raw_line.rstrip('\n').split(';')

            if len(parts) != 13:
                continue

            path_to_image = parts[0]
            coordinates = [float(i) for i in parts[1:5]]
            tclass = int(parts[5])
            superclass = int(parts[6])
            label = parts[-2]

            data_dict['file_names'].append(path_to_image)
            data_dict['coordinates'].append(coordinates)
            data_dict['labels'].append(label)
            data_dict['superclasses'].append(superclass)
            data_dict['classes'].append(tclass)

            superclasses_per_file.setdefault(path_to_image, set()).add(superclass)

    all_train_files = []
    all_validation_files = []
    all_test_files = []

    # Every image already placed in some split. Checking against this set
    # (rather than only against the two *other* lists) is what prevents an
    # image with several superclasses of interest from being appended to the
    # same split more than once.
    assigned_files = set()

    for class_ofi in classes:
        rel_image_paths = [im_path
                           for im_path, superclasses in superclasses_per_file.items()
                           if class_ofi in superclasses]

        random.shuffle(rel_image_paths)

        amount_train = int(len(rel_image_paths) * train_fraction)
        amount_validation = int(len(rel_image_paths) * validation_fraction)
        validation_end = amount_train + amount_validation

        # Validation and test claim their files before train, so images shared
        # between superclasses go to the first split that asks for them.
        for split_files, candidates in (
                (all_validation_files, rel_image_paths[amount_train:validation_end]),
                (all_test_files, rel_image_paths[validation_end:]),
                (all_train_files, rel_image_paths[:amount_train])):
            for image_path in candidates:
                if image_path not in assigned_files:
                    assigned_files.add(image_path)
                    split_files.append(image_path)

    return all_train_files, all_validation_files, all_test_files, data_dict

In [36]:
# Parse and split the training ground-truth file ...
(train_all_train_files,
 train_all_validation_files,
 train_all_test_files,
 train_data_dict) = extract_data_from_annotation_file(path_to_train_anot_file)

# ... and, independently, the testing ground-truth file.
(test_all_train_files,
 test_all_validation_files,
 test_all_test_files,
 test_data_dict) = extract_data_from_annotation_file(path_to_test_anot_file)

In [20]:
# Pool the per-split file lists obtained from the two annotation files
# (entries from the training annotation file come first).
global_all_train_files = [*train_all_train_files, *test_all_train_files]
global_all_validation_files = [*train_all_validation_files, *test_all_validation_files]
global_all_test_files = [*train_all_test_files, *test_all_test_files]

# Concatenate the annotation columns in the same order so rows stay aligned.
global_data_dict = {
    column: train_rows + test_data_dict[column]
    for column, train_rows in train_data_dict.items()
}

In [21]:
# Display the column names held by the combined annotation dict.
global_data_dict.keys()

dict_keys(['file_names', 'coordinates', 'superclasses', 'classes', 'labels'])

In [22]:
# Columns of the per-split annotation tables, in output order.
annotation_columns = ('file_names', 'coordinates', 'superclasses', 'classes', 'labels')

final_train_data_dict = {column: [] for column in annotation_columns}
final_validation_data_dict = {column: [] for column in annotation_columns}
final_test_data_dict = {column: [] for column in annotation_columns}

# file name -> indices of ALL of its annotation rows in global_data_dict.
# list.index() would only ever find the first row, silently dropping every
# additional annotation of an image, and costs a full scan per lookup.
rows_per_file = {}
for row_index, file_name in enumerate(global_data_dict['file_names']):
    rows_per_file.setdefault(file_name, []).append(row_index)

for src_d, dst_d in zip(
    [global_all_train_files, global_all_validation_files, global_all_test_files],
    [final_train_data_dict, final_validation_data_dict, final_test_data_dict]):

    # Guard against a file being listed more than once within one split so its
    # rows are not copied twice.
    copied_files = set()

    for file in src_d:
        if file in copied_files:
            continue
        copied_files.add(file)

        for relevant_index in rows_per_file[file]:
            for column in annotation_columns:
                dst_d[column].append(global_data_dict[column][relevant_index])

In [23]:
# One DataFrame per split, built from the column dicts assembled above.
train_dataframe = pd.DataFrame(final_train_data_dict)
validation_dataframe = pd.DataFrame(final_validation_data_dict)
test_dataframe = pd.DataFrame(final_test_data_dict)

# Write each split to <path_to_parsed_dataset>/<split>_annotation.csv using
# to_csv's default options.
for csv_file_name, split_dataframe in (
        ('train_annotation.csv', train_dataframe),
        ('validation_annotation.csv', validation_dataframe),
        ('test_annotation.csv', test_dataframe)):
    split_dataframe.to_csv(os.path.join(path_to_parsed_dataset, csv_file_name))
