# Data preprocessing

## Imports

In [1]:
import os
from PIL import Image
import shutil

## Preprocessing

### Corrupte afbeeldingen verwijderen:

In [2]:
# Root folder that holds one sub-folder per painter.
images_dir = "images"
painter_dirs = os.listdir(images_dir)

# Try to open every image and delete the ones Pillow cannot read.
# The try/except sits around a single file so that one unreadable image does
# not end the scan: every file in every painter folder is checked.
for painter in painter_dirs:
    painter_dir = os.path.join(images_dir, painter)
    for image in os.listdir(painter_dir):
        image_path = os.path.join(painter_dir, image)
        try:
            # Opening is enough to make Pillow identify the file; the
            # context manager closes the file handle again right away.
            with Image.open(image_path) as img:
                pass
        except Exception as e:
            # Broad on purpose (best-effort cleanup): any file that cannot be
            # opened as an image is treated as corrupt and removed.
            os.remove(image_path)
            print(f"Removed {image_path} due to error: {e}")

### Andere formaten omzetten:

In [5]:
def get_formats():
    """Tally the image formats found under every painter folder.

    Reads the module-level ``images_dir`` and ``painter_dirs``.

    Returns:
        A tuple ``(formats_count, other_format_images)`` where
        ``formats_count`` maps a Pillow format name to the number of files
        with that format, and ``other_format_images`` maps every non-JPEG
        format name to the list of file paths that have it.
    """
    counts = {}
    non_jpeg_paths = {}
    for painter_name in painter_dirs:
        folder = os.path.join(images_dir, painter_name)
        for file_name in os.listdir(folder):
            path = os.path.join(folder, file_name)
            with Image.open(path) as opened:
                fmt = opened.format
                counts[fmt] = counts.get(fmt, 0) + 1
                if fmt == "JPEG":
                    continue
                # Group every non-JPEG file under its detected format.
                non_jpeg_paths.setdefault(fmt, []).append(path)
    return counts, non_jpeg_paths
# Scan the image folders once and keep both results for the conversion step.
formats_counts, other_format_images = get_formats()
# Bare expression so the notebook displays the counts and the non-JPEG paths.
formats_counts, other_format_images

({'JPEG': 2792, 'PNG': 2, 'MPO': 4, 'GIF': 2},
 {'PNG': ['images/Mondriaan/tableay-no-iv-lozenge-composition.png',
   'images/Picasso/picasso_meisje_met_duif.png'],
  'MPO': ['images/Picasso/0-picassofull-1.JPG',
   'images/Picasso/0-picassozoomhires.JPG',
   'images/Picasso/Kopie van 0-picassofull-1.JPG',
   'images/Picasso/Kopie van 0-picassozoomhires.JPG'],
  'GIF': ['images/Picasso/bust-of-woman-with-yellow-ribbon-jacqueline-1962.jpg',
   'images/Picasso/woman-with-hat-1962-1.jpg']})

In [6]:
# Re-encode every non-JPEG image as JPEG, writing back to the same path.
for image_list in other_format_images.values():
    for image in image_list:
        # convert() decodes the pixel data into a new in-memory image, so the
        # source file can be closed before the same path is overwritten.
        with Image.open(image) as img:
            rgb_img = img.convert("RGB")
        rgb_img.save(image, "JPEG")
        print(f"Converted {image} to JPEG")
            

Converted images/Mondriaan/tableay-no-iv-lozenge-composition.png to JPEG
Converted images/Picasso/picasso_meisje_met_duif.png to JPEG
Converted images/Picasso/0-picassofull-1.JPG to JPEG
Converted images/Picasso/0-picassozoomhires.JPG to JPEG
Converted images/Picasso/Kopie van 0-picassofull-1.JPG to JPEG
Converted images/Picasso/Kopie van 0-picassozoomhires.JPG to JPEG
Converted images/Picasso/bust-of-woman-with-yellow-ribbon-jacqueline-1962.jpg to JPEG
Converted images/Picasso/woman-with-hat-1962-1.jpg to JPEG


In [7]:
# Re-run the scan and display the result to check the formats after conversion.
get_formats()

({'JPEG': 2800}, {})

### Images hernoemen

In [8]:
# Give every image in a painter folder a sequential name: 0.jpg, 1.jpg, ...
#
# The rename runs in two passes. Renaming straight to "<i>.jpg" can hit a file
# that already carries that name but has not been processed yet (for example
# when this cell is re-run), and os.rename would then replace it, losing an
# image. Moving everything to temporary names first rules out such collisions.
for painter in painter_dirs:
    painter_dir = os.path.join(images_dir, painter)
    # Sorted so the numbering does not depend on the file system's listing order.
    images = sorted(os.listdir(painter_dir))

    # Pass 1: park every file under a temporary name that no final name uses.
    staged_paths = []
    for i, image in enumerate(images):
        image_path = os.path.join(painter_dir, image)
        staged_path = os.path.join(painter_dir, f".renaming_{i}.tmp")
        os.rename(image_path, staged_path)
        staged_paths.append(staged_path)

    # Pass 2: hand out the final sequential names.
    for i, staged_path in enumerate(staged_paths):
        new_image_path = os.path.join(painter_dir, f"{i}.jpg")
        os.rename(staged_path, new_image_path)

### Opslaan

In [10]:
# Zip the contents of the image folder into "images.zip"; the notebook shows
# the path of the created archive.
shutil.make_archive("images", "zip", root_dir=images_dir)

'/mnt/c/Users/timmo/OneDrive - Hogeschool VIVES/Documenten/School 2024-2025/Deep Learning/Schilderijen classificeren/Schilderijen-Classificatie/images.zip'

## Dataset creatie

In [11]:
# Fraction of the images that goes to the train, validation and test split.
train, validation, test = 0.6, 0.2, 0.2

In [12]:
# Number of image files per painter folder, keyed by folder name.
painter_image_count = {
    painter_name: len(os.listdir(os.path.join(images_dir, painter_name)))
    for painter_name in painter_dirs
}
painter_image_count

{'Mondriaan': 330, 'Picasso': 1529, 'rembrandt': 259, 'Rubens': 682}

In [26]:
def get_dataset_counts(dataset_dir):
    """Count the entries in each sub-folder of ``dataset_dir``.

    Args:
        dataset_dir: Folder whose direct children are folders (one per painter).

    Returns:
        A dict mapping each sub-folder name to the number of entries in it.
    """
    return {
        entry: len(os.listdir(os.path.join(dataset_dir, entry)))
        for entry in os.listdir(dataset_dir)
    }

### Imbalanced

In [None]:
# Build the imbalanced dataset: every painter keeps all of its images, split
# into train / validation / test by the configured fractions.
dataset_name = "imbalanced_dataset"
os.makedirs(dataset_name)

for painter in painter_dirs:
    # One output folder per split for this painter.
    train_dir, validation_dir, test_dir = (
        os.path.join(dataset_name, split_name, painter)
        for split_name in ("train", "validation", "test")
    )
    for split_dir in (train_dir, validation_dir, test_dir):
        os.makedirs(split_dir)

    painter_dir = os.path.join(images_dir, painter)
    images = os.listdir(painter_dir)
    total_images = painter_image_count[painter]
    train_count = int(train * total_images)
    validation_count = int(validation * total_images)
    validation_end = train_count + validation_count

    for position, file_name in enumerate(images):
        # The first train_count files go to train, the next validation_count
        # to validation, and whatever is left to test.
        if position < train_count:
            target_dir = train_dir
        elif position < validation_end:
            target_dir = validation_dir
        else:
            target_dir = test_dir
        shutil.copy(
            os.path.join(painter_dir, file_name),
            os.path.join(target_dir, file_name),
        )

In [28]:
# Print the number of images per painter for each split of the current dataset.
train_dir, validation_dir, test_dir = (
    os.path.join(dataset_name, split_name)
    for split_name in ("train", "validation", "test")
)

for label, split_dir in (
    ("train", train_dir),
    ("val", validation_dir),
    ("test", test_dir),
):
    print(f"{label}: {get_dataset_counts(split_dir)}")

train: {'Mondriaan': 198, 'Picasso': 917, 'rembrandt': 155, 'Rubens': 409}
val: {'Mondriaan': 66, 'Picasso': 305, 'rembrandt': 51, 'Rubens': 136}
test: {'Mondriaan': 66, 'Picasso': 307, 'rembrandt': 53, 'Rubens': 137}


### Undersampled

In [31]:
# Build the undersampled dataset: every painter contributes the same number
# of images, sized from the painter with the fewest images.
dataset_name = "undersampled_dataset"
min_images = min(painter_image_count.values())

# Split sizes are the same for every painter, so compute them once.
train_count = int(train * min_images)
validation_count = int(validation * min_images)
test_count = int(test * min_images)
total_count = train_count + validation_count + test_count

for painter in painter_dirs:
    train_dir = os.path.join(dataset_name, "train", painter)
    os.makedirs(train_dir)
    validation_dir = os.path.join(dataset_name, "validation", painter)
    os.makedirs(validation_dir)
    test_dir = os.path.join(dataset_name, "test", painter)
    os.makedirs(test_dir)

    painter_dir = os.path.join(images_dir, painter)
    images = os.listdir(painter_dir)

    # Only the first total_count images are used. Without this limit, every
    # surplus image would be copied to the destination path left over from
    # the previous iteration, overwriting the last test file again and again.
    for i, image in enumerate(images[:total_count]):
        image_path = os.path.join(painter_dir, image)
        if i < train_count:
            new_image_path = os.path.join(train_dir, image)
        elif i < train_count + validation_count:
            new_image_path = os.path.join(validation_dir, image)
        else:
            new_image_path = os.path.join(test_dir, image)
        shutil.copy(image_path, new_image_path)

In [32]:
# Print the number of images per painter for each split of the current dataset.
train_dir, validation_dir, test_dir = (
    os.path.join(dataset_name, split_name)
    for split_name in ("train", "validation", "test")
)

for label, split_dir in (
    ("train", train_dir),
    ("val", validation_dir),
    ("test", test_dir),
):
    print(f"{label}: {get_dataset_counts(split_dir)}")

train: {'Mondriaan': 155, 'Picasso': 155, 'rembrandt': 155, 'Rubens': 155}
val: {'Mondriaan': 51, 'Picasso': 51, 'rembrandt': 51, 'Rubens': 51}
test: {'Mondriaan': 51, 'Picasso': 51, 'rembrandt': 51, 'Rubens': 51}


### Oversampled

In [35]:
# Build the oversampled dataset: every painter ends up with the same number
# of files per split, sized from the painter with the most images. Painters
# with fewer images are padded by repeating their images.
dataset_name = "oversampled_dataset"
max_images = max(painter_image_count.values())

# Target number of files per split, identical for every painter.
train_count = int(train * max_images)
validation_count = int(validation * max_images)
test_count = int(test * max_images)

for painter in painter_dirs:
    train_dir = os.path.join(dataset_name, "train", painter)
    os.makedirs(train_dir)
    validation_dir = os.path.join(dataset_name, "validation", painter)
    os.makedirs(validation_dir)
    test_dir = os.path.join(dataset_name, "test", painter)
    os.makedirs(test_dir)

    painter_dir = os.path.join(images_dir, painter)
    images = os.listdir(painter_dir)

    # Split the unique images first and oversample inside each split.
    # Duplicating before splitting would place copies of the same image in
    # train, validation and test, which leaks training data into evaluation.
    train_end = int(train * len(images))
    validation_end = train_end + int(validation * len(images))
    splits = (
        (train_dir, images[:train_end], train_count),
        (validation_dir, images[train_end:validation_end], validation_count),
        (test_dir, images[validation_end:], test_count),
    )

    # Running index keeps the output file names unique across the splits.
    file_index = 0
    for split_dir, unique_images, target_count in splits:
        if not unique_images:
            # Nothing to repeat for this split (painter has too few images).
            continue
        # Exactly target_count files are written, cycling through the unique
        # images; no surplus image is ever copied onto an earlier file.
        for k in range(target_count):
            image = unique_images[k % len(unique_images)]
            image_path = os.path.join(painter_dir, image)
            new_image_path = os.path.join(split_dir, f"{file_index}.jpg")
            shutil.copy(image_path, new_image_path)
            file_index += 1

In [36]:
# Print the number of images per painter for each split of the current dataset.
train_dir, validation_dir, test_dir = (
    os.path.join(dataset_name, split_name)
    for split_name in ("train", "validation", "test")
)

for label, split_dir in (
    ("train", train_dir),
    ("val", validation_dir),
    ("test", test_dir),
):
    print(f"{label}: {get_dataset_counts(split_dir)}")

train: {'Mondriaan': 917, 'Picasso': 917, 'rembrandt': 917, 'Rubens': 917}
val: {'Mondriaan': 305, 'Picasso': 305, 'rembrandt': 305, 'Rubens': 305}
test: {'Mondriaan': 305, 'Picasso': 305, 'rembrandt': 305, 'Rubens': 305}
