# Récupération et Préparation des données

|Auteur|Centre|
|---|---|
|ACQUART Quentin|Aix-en-Provence|
|DIMEGLIO Nicolas|Aix-en-Provence|
|SIXDENIER Alexandre|Aix-en-Provence|
|VESSERON Alexandre|Aix-en-Provence|
|ROMANO Sébastien|Aix-en-Provence|

## Import des différentes bibliothèques

In [1]:
import zipfile
import os
import shutil
import random
from PIL import Image as Image

## Téléchargement des données et décompression


1. Téléchargez les fichiers zip [Datasets Livrable 1](https://cesifr-my.sharepoint.com/personal/bcohen_cesi_fr/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fbcohen%5Fcesi%5Ffr%2FDocuments%2FOption%20Data%20Science%2FDataset%20projet&originalPath=aHR0cHM6Ly9jZXNpZnItbXkuc2hhcmVwb2ludC5jb20vOmY6L2cvcGVyc29uYWwvYmNvaGVuX2Nlc2lfZnIvRW1na3k5Sm4xQnhHbE84TzZVMDVpYThCSEhkd2JfR0hFd1E3MVNkZTBqbjZDQT9ydGltZT1MS2hHamJ5QjJVZw)
Veuillez déposer ces fichiers Zip dans le répertoire `../Dataset/Project_Dataset_Zip`
<a id='section_1'></a>
### Architecture des dossiers :
- >../Dataset
     - >/Project_Dataset_Zip


2. Respectez l'architecture ci-dessus puis lancez le notebook Jupyter
<br><br>
<div style="color:red">Attention cette action peut être longue <b>(+1h)</b></div>

In [2]:
# Relative paths of the working directories used by the cells below
dataset_dir_path = "../Dataset"  # root folder holding the dataset variants
zip_dataset_path = "../Dataset/Project_Dataset_Zip"  # downloaded zip archives, input of extract_zip
extracted_dataset_path = "../Dataset/Project_Dataset_Unzip"  # where extract_zip unpacks the archives
clean_dataset_path = "../Dataset/Project_Dataset_Clean"  # where clean_dataset writes the metadata-free images
light_dataset_path = "../Dataset/Project_Dataset_Test"  # reduced dataset written by dataset_test
dataset_size = 0.2  # threshold compared to a random draw for each image when building the light dataset

In [3]:
# Extraction of the zip archives (into ../Dataset/Project_Dataset_Unzip for the first deliverable)
def extract_zip(zip_path, extract_path):
    """Unpack every archive stored in ``zip_path`` into ``extract_path``.

    ``extract_path`` is created when it does not exist yet (its parent must
    already exist). Each entry of ``zip_path`` is printed, then opened as a
    zip archive and fully extracted; all archives share the same destination.

    Args:
        zip_path: Directory containing the zip archives.
        extract_path: Directory receiving the extracted content.
    """
    if os.path.exists(extract_path) is False:
        os.mkdir(extract_path)

    archive_names = os.listdir(zip_path)
    for archive_name in archive_names:
        print(archive_name)
        archive_file = zip_path + "/" + archive_name
        with zipfile.ZipFile(archive_file, 'r') as archive:
            archive.extractall(extract_path)

## Première préparation des données afin de les rendre exploitables
Nettoyage du jeu de données en ouvrant chaque image afin de la copier sans métadonnées dans `../Dataset/Project_Dataset_Clean`
<br>Les images corrompues/invalides ne seront pas copiées et un fichier de log `Error_file.log` sera généré dans le dossier `../Dataset`

In [4]:
def clean_dataset(dataset_path,extract_path):
    """Copy every readable image of ``extract_path`` to ``dataset_path`` without metadata.

    ``extract_path`` is expected to contain one sub-directory per class. Each
    image is opened with PIL, its pixel data is copied into a brand new image
    (so nothing but the pixels is carried over) and the result is saved under
    ``dataset_path/<class>/<file>``. Files already present in the destination
    are skipped, which allows the function to be re-run after an interruption.

    Files that cannot be processed are not copied:
      * ``desktop.ini`` is deleted from the source directory;
      * any other file is printed and appended to ``../Dataset/Error_file.log``
        (path relative to the current working directory), and a partially
        written destination file is removed.

    Args:
        dataset_path: Destination directory (created when missing).
        extract_path: Source directory holding one folder per class.
    """
    error_log_path = '../Dataset/Error_file.log'

    directories = os.listdir(extract_path)
    if not os.path.exists(dataset_path):
        os.mkdir(dataset_path)

    for directory in directories:
        source_dir = extract_path + "/" + directory
        target_dir = dataset_path + "/" + directory
        print(source_dir)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        for file in os.listdir(source_dir):
            source_file = source_dir + "/" + file
            target_file = target_dir + "/" + file
            if os.path.exists(target_file):
                continue

            image = None
            try:
                image = Image.open(source_file)
                data = list(image.getdata())
                image_without_exif = Image.new(image.mode, image.size)
                image_without_exif.putdata(data)
                # Save next to the other cleaned images of the *requested*
                # destination, not into a module-level path.
                image_without_exif.save(target_file)
            except Exception:
                # Best-effort cleaning: an unreadable file must not stop the run.
                if file == "desktop.ini":
                    os.remove(source_file)
                else:
                    print(source_file)
                    print(image)
                    # Mode 'a' creates the log on first use.
                    with open(error_log_path, 'a') as log_file:
                        log_file.write('\n' + source_file)
                        log_file.write('\n' + str(image))
                    # Drop a half-written output so that it is not mistaken
                    # for a valid image on the next run.
                    try:
                        os.remove(target_file)
                    except FileNotFoundError:
                        pass
                    except OSError:
                        print("Failed to erase image"+"/"+directory+"/"+file)
            finally:
                # image stays None when Image.open itself failed.
                if image is not None:
                    image.close()

## Data Set de test Intermédiaire
Création d'un dataset plus léger, en récupérant aléatoirement des images dans le dataset
<br> Par défaut la valeur est de 20%.
<br> Pour changer la taille du dataset veuillez éditer la variable `dataset_size`

In [5]:
def dataset_test(light_dataset_path,dataset_path,dataset_size = 0.2):
    """Build a lighter dataset by randomly sampling the images of ``dataset_path``.

    The class sub-directories of ``dataset_path`` are mirrored under
    ``light_dataset_path``. Every image is copied independently when a
    uniform draw in [0, 1) is below ``dataset_size``, so the result holds
    *about* that fraction of the images, not an exact count.

    Args:
        light_dataset_path: Destination directory (created when missing).
        dataset_path: Source directory holding one folder per class.
        dataset_size: Probability for each image to be kept (default 0.2).
    """
    if not os.path.exists(light_dataset_path):
        os.mkdir(light_dataset_path)

    for directory in os.listdir(dataset_path):
        source_dir = dataset_path + "/" + directory
        target_dir = light_dataset_path + "/" + directory
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        for file in os.listdir(source_dir):
            # `random` is the module: the draw has to come from random.random().
            if random.random() < dataset_size:
                shutil.copy2(source_dir + "/" + file, target_dir)

Code pour supprimer le contenu du `light_dataset` afin de le régénérer

In [6]:
# Helper emptying a generated dataset (e.g. the light dataset) before rebuilding it
def remove_data_set(dataset_path):
    """Delete every file stored in the sub-directories of ``dataset_path``.

    Only the files are removed: ``dataset_path`` and its sub-directories are
    left in place.

    Args:
        dataset_path: Directory holding one folder per class.
    """
    for class_name in os.listdir(dataset_path):
        class_dir = dataset_path + "/" + class_name
        doomed_files = [class_dir + "/" + entry for entry in os.listdir(class_dir)]
        for doomed_file in doomed_files:
            os.remove(doomed_file)

In [7]:
# Settings of the binary-classifier datasets built by the cells below
binary_dataset_dir_path = "../Dataset_Binary_Project"  # output root passed to create_binary_dataset
dataset_to_extract_path = "../Dataset/Project_Dataset_Clean"  # source images (one folder per class)
binary_dataset_test_path = "../Dataset_Binary_Project_test"  # output root passed to create_binary_dataset_test
class_to_compare = "Photo"  # class that every other class is paired with

In [8]:
def create_binary_dataset(dataset_to_extract,binary_dataset_dir_path, class_to_compare):
    """Build one two-class dataset per class found in ``dataset_to_extract``.

    For every class ``X`` other than ``class_to_compare``, the folder
    ``<binary_dataset_dir_path>/X_<class_to_compare>`` is created with two
    sub-folders, ``X`` and ``class_to_compare``, each receiving a copy of the
    matching source images. Files already present in a destination are not
    copied again. Progress is reported with ``print``.

    Args:
        dataset_to_extract: Source directory holding one folder per class.
        binary_dataset_dir_path: Output root (created when missing).
        class_to_compare: Name of the class paired with every other class.
    """
    if not os.path.exists(binary_dataset_dir_path):
        os.mkdir(binary_dataset_dir_path)

    reference_source = dataset_to_extract + "/" + class_to_compare

    for directory in os.listdir(dataset_to_extract):
        class_source = dataset_to_extract + "/" + directory
        print(class_source)
        if directory == class_to_compare:
            continue

        pair_root = binary_dataset_dir_path + "/" + directory + "_" + class_to_compare
        class_target = pair_root + "/" + directory
        reference_target = pair_root + "/" + class_to_compare

        # Parent first, then the two class folders.
        for folder in (pair_root, class_target, reference_target):
            print("Check directory " + folder)
            if not os.path.exists(folder):
                os.mkdir(folder)
                print("Create directory " + folder)

        # Same copy routine for the current class and for the reference class.
        for origin, destination in ((class_source, class_target), (reference_source, reference_target)):
            print("Copy file in " + destination)
            for file in os.listdir(origin):
                if not os.path.exists(destination + "/" + file):
                    shutil.copy2(origin + "/" + file, destination)
            print("Finished copy")

In [9]:
def _copy_missing_files(source_dir, target_dir, file_names):
    """Copy ``file_names`` from ``source_dir`` to ``target_dir``; return how many were copied.

    A file whose name already exists in ``target_dir`` is skipped.
    """
    copied = 0
    for file in file_names:
        if not os.path.exists(target_dir + "/" + file):
            shutil.copy2(source_dir + "/" + file, target_dir)
            copied += 1
    return copied


def create_binary_dataset_test(dataset_to_extract,binary_dataset_test, class_to_compare, balanced=False):
    """Build a "class_to_compare versus everything else" test dataset.

    Two folders are filled under ``binary_dataset_test``:
      * ``<class_to_compare>`` receives every image of that class;
      * ``all_pictures`` receives the images of all the other classes.

    With ``balanced=True`` each other class contributes at most
    ``len(class_to_compare images) // (number of other classes)`` files, taken
    from the front of the list returned by ``shuffle_dataset``, so both folders
    end up with a comparable size. With ``balanced=False`` every image of the
    other classes is copied.

    Files whose name already exists in a destination are skipped, hence two
    images sharing a name in different classes yield a single copy in
    ``all_pictures``. Progress is reported with ``print``.

    Args:
        dataset_to_extract: Source directory holding one folder per class.
        binary_dataset_test: Output root (created when missing).
        class_to_compare: Name of the class isolated in its own folder.
        balanced: Whether to cap the number of files taken from each other class.
    """
    all_pictures_dir = binary_dataset_test + "/all_pictures"
    reference_target = binary_dataset_test + "/" + class_to_compare
    reference_source = dataset_to_extract + "/" + class_to_compare

    for folder in (binary_dataset_test, reference_target, all_pictures_dir):
        if not os.path.exists(folder):
            os.mkdir(folder)

    nb_classes = len(os.listdir(dataset_to_extract))
    half_dataset = len(os.listdir(reference_source))
    nb_other_classes = nb_classes - 1
    # No other class means nothing to balance; avoid dividing by zero.
    nb_files_by_directory = half_dataset // nb_other_classes if nb_other_classes > 0 else 0

    if balanced:
        print("Creating Balanced dataset with " + str(nb_files_by_directory) + " files per class")
    else:
        print("Creating Unbalanced dataset")

    for directory in os.listdir(dataset_to_extract):
        class_source = dataset_to_extract + "/" + directory
        print(class_source)

        if directory == class_to_compare:
            print("Skipping class : ",class_to_compare)
            continue

        file_names = os.listdir(class_source)
        if balanced:
            # Keep exactly the announced quota. Slicing with quota - 1 dropped
            # one file per class and turned into [:-1] when the quota was 0.
            file_names = shuffle_dataset(file_names)[:nb_files_by_directory]

        print("Copy file in " + all_pictures_dir)
        # The counter is computed per class in both modes.
        compteur = _copy_missing_files(class_source, all_pictures_dir, file_names)
        print("Finished copy of : " + str(compteur) + " files")

    print("Copy file in " + reference_target)
    compteur = _copy_missing_files(reference_source, reference_target, os.listdir(reference_source))
    print("Finished copy of : " + str(compteur) + " files")

def get_random():
    """Return the constant 0.1 used as the "random" source of ``shuffle_dataset``.

    A constant makes the shuffle reproducible from one run to the next.
    """
    return 0.1

def shuffle_dataset(directory):
    """Reorder ``directory`` in place with a reproducible permutation and return it.

    The list is walked from its last index down to 1 and each element is
    swapped with the one at index ``int(get_random() * (i + 1))``. This is
    what ``random.shuffle(directory, get_random)`` computed; the second
    argument of ``random.shuffle`` was removed in Python 3.11, so the swaps
    are done here to keep the same ordering on every Python version.

    Args:
        directory: List of file names; it is modified in place.

    Returns:
        The same list object, reordered.
    """
    for i in reversed(range(1, len(directory))):
        j = int(get_random() * (i + 1))
        directory[i], directory[j] = directory[j], directory[i]
    return directory

#### Livrable 1

Les fonctions ci-dessous nous permettent d'extraire et de nettoyer les données du premier jeu de données pour le livrable 1.

In [10]:
# Unpack every archive found in zip_dataset_path into extracted_dataset_path
extract_zip(zip_dataset_path, extracted_dataset_path)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../Dataset/Project_Dataset_Zip'

In [None]:
# Copy the extracted images, stripped of their metadata, into clean_dataset_path
clean_dataset(clean_dataset_path,extracted_dataset_path)

Création du dataset de test

In [None]:
# Sample the cleaned dataset into light_dataset_path using the dataset_size threshold
dataset_test(light_dataset_path,clean_dataset_path,dataset_size)

Création des datasets pour les classifieurs binaires ainsi que du dataset de test.

In [None]:
# One "<class>_<class_to_compare>" dataset per class, built from the cleaned images
create_binary_dataset(dataset_to_extract_path,binary_dataset_dir_path,class_to_compare)
# Test dataset opposing class_to_compare to a size-capped mix of the other classes
create_binary_dataset_test(dataset_to_extract_path,binary_dataset_test_path,class_to_compare, balanced=True)

#### Livrable 2
La fonction ci-dessous nous permet de nettoyer et extraire les données du second jeu de données pour le livrable 2.

In [None]:
# Unpack the archives of the second deliverable into ../DatasetL2/Project_Dataset_Unzip
extract_zip('../DatasetL2/Project_Dataset_Zip/', '../DatasetL2/Project_Dataset_Unzip')

In [None]:
#clean_dataset('../DatasetL2/Project_Dataset_Clean','../DatasetL2/Project_Dataset_Unzip')