# **Funzioni utili**
Definiamo alcune funzioni per la suddivisione ed il calcolo di alcune statistiche dei dataset


In [1]:
import os
from PIL import Image
import random
from math import floor
%run path.ipynb


### **1. Statistiche IAM**
Questa funzione scorre tutto il dataset IAM (cartella `sentences`) e restituisce la larghezza e l'altezza massime raggiunte dalle immagini, in quest'ordine.

In [5]:
def get_IAM_statistics():
    """Return the largest width and height found among the IAM sentence images.

    Walks ``IAM + '/sentences'`` expecting two nested levels of
    sub-directories with the image files at the third level.
    ``IAM`` is not defined in this cell; presumably it is provided by
    ``%run path.ipynb`` -- confirm.

    Returns:
        tuple: ``(max_width, max_height)`` in pixels. The two maxima are
        computed independently and may come from different images.

    Raises:
        ValueError: if no image is found under the dataset directory.
    """
    main_dir = IAM + '/sentences'
    max_w = None
    max_h = None

    for group in os.listdir(main_dir):
        group_path = os.path.join(main_dir, group)
        if not os.path.isdir(group_path):
            # Stray files (e.g. OS metadata) are not part of the dataset tree.
            continue
        for subgroup in os.listdir(group_path):
            subgroup_path = os.path.join(group_path, subgroup)
            if not os.path.isdir(subgroup_path):
                continue
            for image in os.listdir(subgroup_path):
                img_path = os.path.join(subgroup_path, image)
                # The size is available from the image header, so the pixel
                # data is never decoded; the context manager closes the file.
                with Image.open(img_path) as img:
                    width, height = img.size
                max_w = width if max_w is None else max(max_w, width)
                max_h = height if max_h is None else max(max_h, height)

    if max_w is None:
        raise ValueError(f"No images found under {main_dir}")

    return max_w, max_h

### **2. Statistiche Dysgraphia**
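Questa funzione scorre tutto il dataset Dysgraphia (la versione aumentata se `aug == 'aug'`, altrimenti quella originale) e restituisce la larghezza e l'altezza massime raggiunte dalle immagini, in quest'ordine.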

In [6]:
def get_base_statistics(aug: str = 'not_aug'):
    """Return the largest width and height found among the Dysgraphia images.

    The dataset root is ``ADYSG`` when ``aug == 'aug'`` and ``DYSG`` for any
    other value. Both names are not defined in this cell; presumably they are
    provided by ``%run path.ipynb`` -- confirm. The root is expected to
    contain one level of sub-directories holding the image files.

    Args:
        aug: ``'aug'`` selects the ``ADYSG`` root; anything else selects
            ``DYSG``.

    Returns:
        tuple: ``(max_width, max_height)`` in pixels. The two maxima are
        computed independently and may come from different images.

    Raises:
        ValueError: if no image is found under the dataset directory.
    """
    main_dir = ADYSG if aug == 'aug' else DYSG
    max_w = None
    max_h = None

    for class_dir in os.listdir(main_dir):
        class_path = os.path.join(main_dir, class_dir)
        if not os.path.isdir(class_path):
            # Stray files (e.g. OS metadata) are not part of the dataset tree.
            continue
        for image in os.listdir(class_path):
            img_path = os.path.join(class_path, image)
            # The size is available from the image header, so the pixel data
            # is never decoded; the context manager closes the file.
            with Image.open(img_path) as img:
                width, height = img.size
            max_w = width if max_w is None else max(max_w, width)
            max_h = height if max_h is None else max(max_h, height)

    if max_w is None:
        raise ValueError(f"No images found under {main_dir}")

    return max_w, max_h

### **3. Suddivisione Dysgraphia**
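Questa funzione suddivide il dataset etichettato Dysgraphia in train, validation e test set (di default 70% / 15% / 15%, separatamente per ciascuna classe) e salva gli elenchi dei percorsi delle immagini in file di testo, successivamente utilizzati dall'architettura.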

In [3]:
def create_simple_splits(data_path : str, aug : str, train_split = 0.7, val_split = 0.15):
    """Split the labelled dataset into train/val/test lists and save them as text files.

    Every sub-directory of ``data_path`` whose name contains
    ``"No_Dysgraphic"`` contributes to the non-dysgraphic class; every other
    sub-directory contributes to the dysgraphic class. Each class is shuffled
    and split on its own, so the class ratio is roughly the same in all three
    sets; the per-class parts are then merged and shuffled again.

    One path per line is written to ``train_{aug}.txt``, ``val_{aug}.txt`` and
    ``test_{aug}.txt`` inside ``SPLIT`` (not defined in this cell; presumably
    provided by ``%run path.ipynb`` -- confirm). If all three files already
    exist the function returns without touching them.

    Shuffling uses the global ``random`` module, so call ``random.seed(...)``
    beforehand for a reproducible split.

    Args:
        data_path: root directory of the dataset (one sub-directory per group).
        aug: tag embedded in the output file names.
        train_split: fraction of each class assigned to the training set.
        val_split: fraction of each class assigned to the validation set;
            whatever remains goes to the test set.
    """
    split_files = {
        name: os.path.join(SPLIT, f'{name}_{aug}.txt')
        for name in ('train', 'val', 'test')
    }
    # The outputs are regular files, so they must be tested with isfile();
    # isdir() on a .txt path never matches and the splits would be
    # regenerated (and overwritten) on every call.
    if all(os.path.isfile(path) for path in split_files.values()):
        return
    print("Creating Simple splits.")

    dys = []
    not_dys = []
    # Sorted listings make the pre-shuffle order deterministic, so a seeded
    # RNG yields the same split regardless of filesystem ordering.
    for class_dir in sorted(os.listdir(data_path)):
        class_path = data_path + "/" + class_dir
        if not os.path.isdir(class_path):
            # Skip stray files sitting next to the class folders.
            continue
        files = [class_path + "/" + filename for filename in sorted(os.listdir(class_path))]
        if "No_Dysgraphic" in class_dir:
            not_dys.extend(files)
        else:
            dys.extend(files)

    def _partition(items):
        """Shuffle ``items`` in place and cut them into (train, val, test)."""
        random.shuffle(items)
        train_size = floor(train_split * len(items))
        val_size = floor(val_split * len(items))
        return (
            items[:train_size],
            items[train_size:train_size + val_size],
            items[train_size + val_size:],
        )

    dys_train, dys_val, dys_test = _partition(dys)
    not_dys_train, not_dys_val, not_dys_test = _partition(not_dys)

    # Merge the two classes, then shuffle so they are interleaved.
    splits = {
        'train': dys_train + not_dys_train,
        'val': dys_val + not_dys_val,
        'test': dys_test + not_dys_test,
    }
    for items in splits.values():
        random.shuffle(items)

    # Make sure the destination folder exists before writing.
    os.makedirs(SPLIT, exist_ok=True)
    for name, items in splits.items():
        with open(split_files[name], 'w') as f:
            f.writelines(f"{item}\n" for item in items)

# dataset_path = DYSG
# create_simple_splits(dataset_path, 'no_aug')

Creating Simple splits.
