## Image preparation

The original image size is 55x90 pixels with a color depth of 3 (RGB).
The code below resizes the images from an input directory (Input_dir) to the target size (32x32 pixels) and writes them to an output directory (Output_dir). Inside the directory the pictures are stored in subdirectories according to their labels (0 ... 9 + NaN).
Any other image converter can be used as well.

### Prerequisite
The OpenCV library (opencv) and Pillow (PIL) must be installed in the Python environment.

In [1]:
import glob
import os
from PIL import Image 

# Directory that is searched for the raw '*.jpg' images.
Input_dir = 'data_raw_all'
# Directory that receives the resized images (raw and negated alike).
Output_dir = 'data_resize_all'
# Directory that receives the negated, not yet resized, copies of the raw images.
Output_dir_neg = 'data_raw_all_neg'

# Width and height, in pixels, passed to Image.resize.
target_size_x = 32
target_size_y = 32

In [2]:
# Delete every .jpg that currently sits directly inside Output_dir and
# report how many were removed (presumably so a re-run starts clean).
files = glob.glob(f"{Output_dir}/*.jpg")
for stale_path in files:
    os.remove(stale_path)
print(f"{len(files)} files have been deleted.")

0 files have been deleted.


In [3]:
# Delete every .jpg that currently sits directly inside Output_dir_neg and
# report how many were removed (presumably so a re-run starts clean).
files = glob.glob(f"{Output_dir_neg}/*.jpg")
for stale_path in files:
    os.remove(stale_path)
print(f"{len(files)} files have been deleted.")

0 files have been deleted.


In [4]:
import cv2;
import numpy as np;
import hashlib

# Write a colour-inverted copy of every '*.jpg' in Input_dir to Output_dir_neg,
# using the original file name with "_neg" inserted before the extension.
files = glob.glob(Input_dir + '/*.jpg')

# cv2.imwrite does not create missing directories; it just returns False.
os.makedirs(Output_dir_neg, exist_ok=True)

for i, aktfile in enumerate(files):
    img_bgr = cv2.imread(aktfile)
    if img_bgr is None:
        # cv2.imread reports an unreadable or corrupt file by returning None.
        print("Skipping unreadable image:", aktfile)
        continue

    # Negate the original image. imread delivers 8-bit channels, so the
    # inverse of a value v is 255 - v. The former `1 - img_bgr` wrapped
    # around in unsigned 8-bit arithmetic (0 -> 1, 1 -> 0, 2 -> 255, ...).
    img_neg = 255 - img_bgr

    base = os.path.basename(aktfile)
    # Insert "_neg" in front of the extension only; str.replace would also
    # rewrite a ".jpg" that happens to occur in the middle of the name.
    stem, ext = os.path.splitext(base)
    name_neg = stem + "_neg" + ext
    save_name_neg = Output_dir_neg + '/' + name_neg

    if not cv2.imwrite(save_name_neg, img_neg):
        # imwrite signals failure through its return value, not an exception.
        print("Could not write:", save_name_neg)

    # Progress output every 500 images.
    if i % 500 == 0:
        print(i, save_name_neg)
    

0 data_raw_all_neg/0.0_0.0_neg.jpg
500 data_raw_all_neg/2.6_35a8c7850fdd0293ac7a2b8e7fa354b9_neg.jpg
1000 data_raw_all_neg/5.4_main_ana3_20221213-134708_neg.jpg
1500 data_raw_all_neg/8.5_4211_analog1_20200816-075704_neg.jpg


In [5]:
import hashlib

# For every negated image in Output_dir_neg:
#   * record a SHA-256 digest of its decoded pixel data in `hashes`
#     (digest -> list of file paths sharing that pixel data), and
#   * save a copy resized to target_size_x x target_size_y into Output_dir.
files = glob.glob(Output_dir_neg + '/*.jpg')

# Image.save raises if the destination directory is missing.
os.makedirs(Output_dir, exist_ok=True)

hashes = {}
for i, aktfile in enumerate(files):
    # Progress output every 500 images.
    if i % 500 == 0:
        print(i, aktfile)

    # The context manager closes the underlying file handle deterministically.
    with Image.open(aktfile) as test_image:
        # Named `digest` rather than `hash` so the builtin stays usable.
        digest = hashlib.sha256(test_image.tobytes()).hexdigest()
        hashes.setdefault(digest, []).append(aktfile)
        resized = test_image.resize((target_size_x, target_size_y), Image.NEAREST)

    base = os.path.basename(aktfile)
    save_name = Output_dir + '/' + base
    resized.save(save_name, "JPEG", quality=100)

0 data_raw_all_neg\0.0_0.0_neg.jpg
500 data_raw_all_neg\2.6_35a8c7850fdd0293ac7a2b8e7fa354b9_neg.jpg
1000 data_raw_all_neg\5.4_neg.jpg
1500 data_raw_all_neg\8.5_4211_analog1_20200816-075704_neg.jpg


In [6]:
import hashlib

# For every raw image in Input_dir:
#   * record a SHA-256 digest of its decoded pixel data in `hashes`
#     (digest -> list of file paths sharing that pixel data), and
#   * save a copy resized to target_size_x x target_size_y into Output_dir.
# `hashes` is rebuilt from scratch here, so afterwards it only describes
# the files found in Input_dir.
files = glob.glob(Input_dir + '/*.jpg')

# Image.save raises if the destination directory is missing.
os.makedirs(Output_dir, exist_ok=True)

hashes = {}
for i, aktfile in enumerate(files):
    # Progress output every 500 images.
    if i % 500 == 0:
        print(i, aktfile)

    # The context manager closes the underlying file handle deterministically.
    with Image.open(aktfile) as test_image:
        # Named `digest` rather than `hash` so the builtin stays usable.
        digest = hashlib.sha256(test_image.tobytes()).hexdigest()
        hashes.setdefault(digest, []).append(aktfile)
        resized = test_image.resize((target_size_x, target_size_y), Image.NEAREST)

    base = os.path.basename(aktfile)
    save_name = Output_dir + '/' + base
    resized.save(save_name, "JPEG", quality=100)

0 data_raw_all\0.0_0.0.jpg
500 data_raw_all\2.6_35a8c7850fdd0293ac7a2b8e7fa354b9.jpg
1000 data_raw_all\5.4_main_ana3_20221213-134708.jpg
1500 data_raw_all\8.5_4211_analog1_20200816-075704.jpg


# Removing duplicate files

In [7]:
# duplicate files are a risk to the metrics, they pollute the validation dataset
# NOTE: `hashes` holds the paths of the files that were hashed, so the files
# deleted here are those source files; resized copies that were already
# written to Output_dir are not touched by this cell.
for duplicates in hashes.values():
    if len(duplicates) > 1:
        print(duplicates)
        # remove all except the first
        for duplicate in duplicates[1:]:
            try:
                os.remove(duplicate)
            except FileNotFoundError:
                # Happens when this cell is run again without rebuilding `hashes`.
                print("Already removed:", duplicate)