In [4]:
import os
import zipfile 


In [None]:
# Archive to unpack; it is extracted into the notebook's current working directory.
zip_file = 'archive (1).zip'
extract_to = os.getcwd()

# Unpack every member of the archive into the destination directory.
with zipfile.ZipFile(zip_file) as zip_ref:
    zip_ref.extractall(path=extract_to)
print("Unzipping completed!")

# NOTE: this lists everything found in the destination directory,
# i.e. pre-existing entries as well as the freshly extracted ones.
extracted_files = os.listdir(extract_to)
print("Extracted files and directories:", extracted_files)


Unzipping completed!
Extracted files and directories: ['.bash_logout', '.bash_profile', '.bashrc', '.emacs', '.local', '.ipython', '.jupyter', '.bash_history', '.nv', '.ipynb_checkpoints', '.ssh', '.lesshst', '.python_history', 'ThaiOCR-TrainigSet-Link', 'training_data.txt', 'validation_data.txt', 'test_data.txt', 'validation.py', 'traintestvalid_split.py', 'trainingtestvalidjupyter.ipynb', 'ThaiEng_model.pth', 'ThaiEnglish_model.pth', 'training.ipynb', 'args.py', 'dataset.py', 'datasplit.py', 'untitled1.py', 'test.py', 'archive (1).zip', 'modeltraining.py', 'model.py', 'train.py', '__pycache__', 'splitdata.py', 'Untitled.ipynb', 'datasplitproject.ipynb', 'dataset', 'Untitled Folder']


In [6]:
import os
import argparse
from sklearn.model_selection import train_test_split

In [3]:
import os

def _list_files(directory):
    """Return the sorted paths of the regular files located directly in ``directory``."""
    paths = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # reproducible between runs and machines.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if os.path.isfile(path):
            paths.append(path)
    return paths


def load_data(train_dir, test_dir, val_dir):
    """Collect the file paths of a pre-split dataset, grouped by class.

    The classes are the sub-directories of ``train_dir``. A class is added to
    the test / validation result only when a directory with the same name
    exists in ``test_dir`` / ``val_dir``. Only regular files directly inside a
    class directory are collected (nested directories are ignored). Classes
    and paths are returned in sorted order so the result is deterministic.

    Args:
        train_dir (str): path to the training directory
        test_dir (str): path to the test directory
        val_dir (str): path to the validation directory

    Returns:
        tuple: ``(train_files, val_files, test_files)``, three dictionaries
        mapping each class name to the list of its file paths. Note that the
        return order (train, validation, test) differs from the argument
        order (train, test, validation).

    Raises:
        OSError: if ``train_dir`` cannot be listed.
    """
    train_files = {}
    val_files = {}
    test_files = {}

    for category in sorted(os.listdir(train_dir)):
        category_train_path = os.path.join(train_dir, category)
        # Stray files next to the class directories are not classes.
        if not os.path.isdir(category_train_path):
            continue
        train_files[category] = _list_files(category_train_path)

        # The test and validation splits are optional for each class.
        for split_dir, split_files in ((test_dir, test_files), (val_dir, val_files)):
            category_split_path = os.path.join(split_dir, category)
            if os.path.isdir(category_split_path):
                split_files[category] = _list_files(category_split_path)

    return train_files, val_files, test_files

# Locations of the pre-split dataset.
train_dir = './dataset/train/'
test_dir = './dataset/test/'
val_dir = './dataset/validation/'

# load_data returns the splits in the order (train, validation, test).
train_files, val_files, test_files = load_data(train_dir, test_dir, val_dir)

# Report the number of files per class for every split.
for header, split_files in (
    ("Training files for each class:", train_files),
    ("\nValidation files for each class:", val_files),
    ("\nTest files for each class:", test_files),
):
    print(header)
    for category, files in split_files.items():
        print(f"Category: {category}, Files: {len(files)}")

def save_to_txt(file_dict, filename):
    """Write every path stored in ``file_dict`` to ``filename``, one path per line.

    The class names (the dictionary keys) are not written; only the paths are,
    in the iteration order of the dictionary and of each list. An existing
    file is overwritten.

    Args:
        file_dict (dict): maps a class name to a list of file paths.
        filename (str): path of the text file to create.
    """
    # Explicit UTF-8 so that non-ASCII paths are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        for files in file_dict.values():
            f.writelines(f"{file}\n" for file in files)

# Persist the path list of every split to its own text file.
for split_files, out_name in (
    (train_files, 'training_data.txt'),
    (val_files, 'validation_data.txt'),
    (test_files, 'testing_data.txt'),
):
    save_to_txt(split_files, out_name)

print("Files saved.")


Training files for each class:
Category: bus, Files: 1315
Category: car, Files: 810
Category: motorcycle, Files: 1069
Category: train, Files: 1319
Category: truck, Files: 908

Validation files for each class:
Category: bus, Files: 176
Category: car, Files: 106
Category: motorcycle, Files: 145
Category: train, Files: 182
Category: truck, Files: 100

Test files for each class:
Category: bus, Files: 180
Category: car, Files: 107
Category: motorcycle, Files: 139
Category: train, Files: 182
Category: truck, Files: 100
Files saved.


In [4]:
import os
from sklearn.utils import resample

def upsample_category(file_list, target_size):
    """Upsample a list of file paths so that it contains ``target_size`` entries.

    Every original path is kept and only the missing
    ``target_size - len(file_list)`` entries are drawn at random, with
    replacement, using sklearn's ``resample``. Bootstrapping the whole list
    instead would leave some of the original files out of the result.

    Args:
        file_list (list): List of image paths of a single class.
        target_size (int): Number of paths the returned list has to contain.

    Returns:
        list: A new list with ``target_size`` paths. When ``target_size`` is
        larger than ``len(file_list)`` the list starts with all original
        paths, followed by the randomly drawn duplicates. Otherwise it is a
        random draw (with replacement) of ``target_size`` paths.
    """
    missing = target_size - len(file_list)
    if missing <= 0:
        # Nothing has to be added; keep the plain random draw for this case.
        return resample(file_list, replace=True, n_samples=target_size, random_state=42)

    # Fixed random_state keeps the drawn duplicates reproducible.
    extra = resample(file_list, replace=True, n_samples=missing, random_state=42)
    return list(file_list) + list(extra)

def balance_training_data(train_files):
    """Create an upsampled training set in which all classes have the same size.

    Every class with fewer paths than the largest class is upsampled with
    ``upsample_category`` to the size of the largest class. The input
    dictionary and its lists are left untouched; a new dictionary is returned.

    Args:
        train_files (dict): maps each class name to the list of its image
        paths.

    Returns:
        dict: A new dictionary with the same keys where every list has the
        length of the largest input list. An empty input gives an empty
        dictionary.
    """
    # default=0 keeps an empty mapping valid: there is nothing to balance.
    max_train_size = max((len(files) for files in train_files.values()), default=0)

    balanced = {}
    for category, files in train_files.items():
        if len(files) < max_train_size:
            balanced[category] = upsample_category(files, max_train_size)
        else:
            # Copy so that the result never shares a list with the input.
            balanced[category] = list(files)

    return balanced

# Locations of the pre-split dataset.
train_dir = './dataset/train/'
test_dir = './dataset/test/'
val_dir = './dataset/validation/'

# Reload the splits, then upsample the training split only;
# validation and test are used as loaded.
train_files, val_files, test_files = load_data(train_dir, test_dir, val_dir)
train_files_balanced = balance_training_data(train_files)

# Report the number of files per class for every split.
for header, split_files in (
    ("Upsampled Training per class:", train_files_balanced),
    ("\nValidation files per class:", val_files),
    ("\nTest files per class:", test_files),
):
    print(header)
    for category, files in split_files.items():
        print(f"Category: {category}, Files: {len(files)}")

# Persist the path list of every split to its own text file.
for split_files, out_name in (
    (train_files_balanced, 'balanced_training_data.txt'),
    (val_files, 'validation_data.txt'),
    (test_files, 'testing_data.txt'),
):
    save_to_txt(split_files, out_name)

print("Upsampled train saved")


Upsampled Training per class:
Category: bus, Files: 1319
Category: car, Files: 1319
Category: motorcycle, Files: 1319
Category: train, Files: 1319
Category: truck, Files: 1319

Validation files per class:
Category: bus, Files: 176
Category: car, Files: 106
Category: motorcycle, Files: 145
Category: train, Files: 182
Category: truck, Files: 100

Test files per class:
Category: bus, Files: 180
Category: car, Files: 107
Category: motorcycle, Files: 139
Category: train, Files: 182
Category: truck, Files: 100
Upsampled train saved
