## Imports

### Bibliothèques

In [12]:
import os
import random as r

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
import tensorflow.keras.callbacks as callbacks
import tensorflow.keras.layers as layers
from google.colab import drive
from PIL import Image
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import load_model

### Dataset depuis Google Drive

In [None]:
# Step 1: mount Google Drive
drive.mount('/content/drive')

# Step 2: install unrar (if needed; depends on the current Colab environment)
!apt-get install unrar

# Step 3: extract the .rar archive
# NOTE(review): the path constants defined later are relative ('Dataset_cesi/...');
# this presumably relies on the working directory being /content — confirm.
!unrar x "/content/drive/My Drive/Dataset_cesi.rar" "/content/Dataset_cesi/"

### Constantes

In [18]:
# Alternative setting that lists four non-photo folders (currently disabled):
#PATH_NO_PHOTO_FOLDERS = ['Dataset_cesi/Painting', 'Dataset_cesi/Schematics', 'Dataset_cesi/Text', 'Dataset_cesi/Sketch']
# Folders whose images are loaded with label 0 ("no photo").
PATH_NO_PHOTO_FOLDERS = ['Dataset_cesi/Schematics']
# Folder whose images are loaded with label 1 ("photo").
PATH_PHOTO_FOLDER = 'Dataset_cesi/Photo'
# Size every image is resized to when loaded; also the model's input shape.
HEIGHT = 256
WIDTH = 256
CHANNELS = 3
# Dataset split fractions; split_dataset checks that they sum to 1.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.15
TEST_RATIO = 0.05
# Training settings passed to model.fit.
EPOCHS = 100  # upper bound: the EarlyStopping callback may end training sooner
BATCH_SIZE = 32

## Dataset

### Métriques du dataset

#### Distribution de la taille des images ( 80s )

In [None]:
def analyze_image_sizes(folder_path, bins=20):
    """Plot the distributions of image widths and heights found in a folder.

    Only entries whose name ends in .png, .jpg or .jpeg (case-insensitive) are
    considered. Each one is opened with PIL solely to read its size.

    Args:
        folder_path: Directory containing the images to inspect.
        bins: Number of histogram bins used for both plots.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    widths = []
    heights = []

    # Walk the folder and record the size of every image file
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(folder_path, filename)
            with Image.open(img_path) as img:
                width, height = img.size
            widths.append(width)
            heights.append(height)

    # One histogram per dimension, side by side, both with labelled axes
    _, (ax_width, ax_height) = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        (ax_width, widths, 'blue', 'Distribution des largeurs', 'Largeur'),
        (ax_height, heights, 'green', 'Distribution des hauteurs', 'Hauteur'),
    )
    for ax, values, color, title, xlabel in panels:
        ax.hist(values, bins=bins, color=color, alpha=0.7)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Nombre d\'images')

    plt.tight_layout()
    plt.show()

analyze_image_sizes(PATH_PHOTO_FOLDER)


#### Informations sur les images

In [None]:
def get_folder_info(folder):
    """Count the image files located directly inside ``folder``.

    An entry counts as an image when its name ends in .png, .jpg or .jpeg
    (case-insensitive), the same filter the image loaders in this notebook
    apply, so the reported number matches what actually gets loaded.

    Args:
        folder: Path of the directory to inspect.

    Returns:
        The number of matching entries as an int.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    return sum(
        1
        for entry in os.listdir(folder)
        if entry.lower().endswith(image_extensions)
    )
# Photo folder
photo_length = get_folder_info(PATH_PHOTO_FOLDER)
print(f'Nombre d\'images dans le dossier photo: {photo_length}')

# Non-photo folders: one count per folder, then the overall total
no_photo_folders_len = [get_folder_info(folder) for folder in PATH_NO_PHOTO_FOLDERS]
no_photo_len = sum(no_photo_folders_len)
print(f'Nombre d\'images dans les dossiers sans photo: {no_photo_len}')
for folder_path, folder_count in zip(PATH_NO_PHOTO_FOLDERS, no_photo_folders_len):
    print(f'Nombre d\'images dans le dossier {folder_path}: {folder_count}')

### Chargement des données

In [None]:

def load_dataset(list_path_nophoto, path_photo):
    """Load the photo and non-photo folders into image and label arrays.

    Every file whose name ends in .png, .jpg or .jpeg (case-insensitive) is
    resized to (WIDTH, HEIGHT), converted to RGB and stored as a uint8 array.
    Photos receive label 1; images from the non-photo folders receive label 0.

    Args:
        list_path_nophoto: Iterable of folder paths holding non-photo images.
        path_photo: Folder path holding the photos.

    Returns:
        Tuple ``(x, y)``. ``x`` has shape (n_images, HEIGHT, WIDTH, 3) and
        dtype uint8; ``y`` has shape (n_images,) and holds the 0/1 labels.
        Photos come first, followed by each non-photo folder in order.
    """
    def load_folder(path_folder, label):
        """Return the images of one folder and an equally long array of ``label``."""
        images = []
        # Sorted so the loading order does not depend on the filesystem.
        for filename in sorted(os.listdir(path_folder)):
            if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            img_path = os.path.join(path_folder, filename)
            with Image.open(img_path) as img:
                # Same size for every image, then exactly 3 channels.
                resized = img.resize((WIDTH, HEIGHT)).convert('RGB')
                # Explicit conversion: store pixel data, not PIL objects.
                images.append(np.asarray(resized, dtype=np.uint8))
        if images:
            data_folder = np.stack(images)
        else:
            # Keep the 4-D shape so an empty folder still concatenates cleanly.
            data_folder = np.empty((0, HEIGHT, WIDTH, 3), dtype=np.uint8)
        label_folder = np.full(len(images), label)
        return data_folder, label_folder

    x = []
    y = []

    # Photos first (label 1)
    data_folder, label_folder = load_folder(path_photo, 1)
    x.append(data_folder)
    y.append(label_folder)

    print(f'chargement des images avec photo: {data_folder.shape}')
    print(f'chargement des labels avec photo: {label_folder.shape}')

    # Then every non-photo folder (label 0)
    for path_folder in list_path_nophoto:
        data_folder, label_folder = load_folder(path_folder, 0)
        x.append(data_folder)
        y.append(label_folder)
        print(f' Dataset : {path_folder} chargé avec {data_folder.shape} images')
        print(f' Labels : {path_folder} chargé avec {label_folder.shape} labels')

    x = np.concatenate(x)
    y = np.concatenate(y)
    return x, y
    
# Load the dataset
x, y = load_dataset(PATH_NO_PHOTO_FOLDERS, PATH_PHOTO_FOLDER)

print('Charge du dataset terminée')

# Shuffle images and labels with the same permutation. The generator is seeded
# so that, as long as the files are loaded in the same order, every run yields
# the same train/validation/test split; otherwise a model reloaded from disk
# could be evaluated on images it was trained on.
SHUFFLE_SEED = 42
indices = np.random.default_rng(SHUFFLE_SEED).permutation(x.shape[0])
x = x[indices]
y = y[indices]

print('Mélange des données terminé')

# Split the dataset into train, validation and test subsets
def split_dataset(x, y, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO, test_ratio=TEST_RATIO):
    """Slice ``x`` and ``y`` into consecutive train, validation and test parts.

    The data is not shuffled here. The train and validation sizes are
    ``int(n * ratio)``; the test part receives every remaining sample, so
    ``test_ratio`` is only used to validate that the ratios sum to 1.

    Args:
        x: Array of samples, indexed along its first axis.
        y: Array of labels with the same length as ``x``.
        train_ratio: Fraction of samples assigned to the training part.
        val_ratio: Fraction of samples assigned to the validation part.
        test_ratio: Fraction of samples nominally assigned to the test part.

    Returns:
        ``((x_train, y_train), (x_val, y_val), (x_test, y_test))``.

    Raises:
        AssertionError: If the ratios do not sum to 1 (within floating-point
            tolerance) or if ``x`` and ``y`` have different lengths.
    """
    total_ratio = train_ratio + val_ratio + test_ratio
    # Tolerant comparison: e.g. 0.7 + 0.2 + 0.1 is 0.9999999999999999, not 1.
    assert np.isclose(total_ratio, 1.0), f'ratios must sum to 1, got {total_ratio}'

    n_samples = x.shape[0]
    assert len(y) == n_samples, 'x and y must contain the same number of samples'

    train_end = int(n_samples * train_ratio)
    val_end = train_end + int(n_samples * val_ratio)

    x_train, y_train = x[:train_end], y[:train_end]
    x_val, y_val = x[train_end:val_end], y[train_end:val_end]
    x_test, y_test = x[val_end:], y[val_end:]
    return (x_train, y_train), (x_val, y_val), (x_test, y_test)

data_train, data_val, data_test = split_dataset(x, y)

# Report the shape of the image array of each subset
for subset_label, subset in (
    ('Données d\'entrainement', data_train),
    ('Données de validation', data_val),
    ('Données de test', data_test),
):
    print(f'{subset_label}: {subset[0].shape}')

### Visualisation des données

In [None]:
# Pick a random position in the dataset
random_index = r.randrange(len(x))

# Image stored at that position and its human-readable label
random_image = x[random_index]
if y[random_index] == 1:
    random_label = 'Photo'
else:
    random_label = 'No photo'

# Show the image with its label as the title
plt.imshow(random_image)
plt.title(f'Label: {random_label}')
plt.axis('off')
plt.show()

## Modèle

### Chargement d'un modèle préexistant

In [None]:
# Reload a previously saved model from the path "Librable1".
# NOTE(review): the model-creation cell that follows rebinds `model`, so when the
# notebook is run top to bottom this loaded model is replaced — confirm intent.
model = load_model("Librable1")

### Création du modèle

In [None]:
# Binary classifier assembled layer by layer
model = tf.keras.models.Sequential()

# Input normalisation
model.add(layers.BatchNormalization(input_shape=(HEIGHT, WIDTH, CHANNELS)))

# Four stride-2 convolutions with a growing number of filters
for filters in (32, 64, 128, 256):
    model.add(layers.Conv2D(filters, (3, 3), activation='leaky_relu', padding='same', strides=2))

# Dense head ending in a single sigmoid unit
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='leaky_relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### Entrainement du modèle

In [None]:
# Early stopping: watch val_loss with a patience of 5 epochs and restore the best weights
early_callback = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Training run
x_train, y_train = data_train
x_val, y_val = data_val
history = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
    callbacks=[early_callback],
)

### Courbe d'entrainement

In [None]:
# Side-by-side curves: loss on the left, accuracy on the right
plt.figure(figsize=(12, 5))

curves = (
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
)
for position, (metric, name) in enumerate(curves, start=1):
    plt.subplot(1, 2, position)
    # Training curve first, then its validation counterpart
    plt.plot(history.history[metric], label=f'Training {name}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {name}')
    plt.title(f'Training and Validation {name}')
    plt.xlabel('Epochs')
    plt.ylabel(name)
    plt.legend()

plt.tight_layout()
plt.show()

### Sauvegarde du modèle

In [None]:
# Save the model under "Librable1", the same path the loading cell reads.
# NOTE(review): the path has no extension; Keras 3 reportedly accepts only .keras
# or .h5 in model.save — confirm against the installed TensorFlow/Keras version.
model.save("Librable1")

## Evaluation

### Matrice de confusion

In [None]:
# Predict on the test set and threshold the sigmoid output at 0.5
y_pred = model.predict(data_test[0])
y_pred_classes = (y_pred > 0.5).astype("int32")

# Confusion matrix. labels=[0, 1] forces a 2x2 result even when one class is
# absent from the test split and the predictions, so the fixed tick labels
# below always line up with the matrix.
cm = confusion_matrix(data_test[1], y_pred_classes, labels=[0, 1])

# Display the matrix (rows: true class, columns: predicted class)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No photo', 'Photo'], yticklabels=['No photo', 'Photo'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

### Test unitaire sur l'ensemble de test

In [None]:
# Draw a random position inside the test split
test_images, test_labels = data_test
random_index_test = r.randrange(len(test_images))

# Image at that position and its ground-truth label
random_image_test = test_images[random_index_test]
if test_labels[random_index_test] == 1:
    true_label = 'Photo'
else:
    true_label = 'No photo'

# Run inference on a batch holding only that image
single_batch = np.expand_dims(random_image_test, axis=0)
probability = model.predict(single_batch)[0][0]
predicted_label = 'Photo' if probability > 0.5 else 'No photo'

# Show the image with both the true and the predicted label
plt.imshow(random_image_test)
plt.title(f'True Label: {true_label}\nPredicted Label: {predicted_label}')
plt.axis('off')
plt.show()