## Imports

### Bibliothèques

In [None]:
#from google.colab import drive
from PIL import Image
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random as r
import tensorflow.keras.layers as layers
import tensorflow.keras.callbacks as callbacks
from tensorflow.keras.models import load_model
from sklearn.metrics import confusion_matrix
import seaborn as sns
from tensorflow.keras.utils import Sequence, plot_model

# Show the current working directory (the dataset folders used in this
# notebook are given as relative paths).
current_dir = os.getcwd()
print("Répertoire de travail actuel :", current_dir)

### Dataset depuis Google Drive

In [None]:
# Étape 1: Monter Google Drive
drive.mount('/content/drive')

# Étape 2: Installer unrar (si nécessaire, dépend de l'environnement Colab actuel)
!apt-get install unrar

# Étape 3: Décompresser le fichier .rar
!unrar x "/content/drive/My Drive/Dataset_cesi.rar" "/content/"

### Constantes et variables globales

In [None]:
# Folders holding the negative class (label 0) and the positive class (label 1).
PATH_NO_PHOTO_FOLDERS = ['Dataset_cesi/Painting', 'Dataset_cesi/Schematics', 'Dataset_cesi/Text', 'Dataset_cesi/Sketch']
#PATH_NO_PHOTO_FOLDERS = ['Dataset_cesi/Schematics']
PATH_PHOTO_FOLDER = 'Dataset_cesi/Photo'

# Size every image is resized to before being fed to the model.
HEIGHT = 256
WIDTH = 256
CHANNELS = 3

# Seed for the NumPy and `random` generators, so that the shuffle that defines
# the train/val/test split and the random sample picks are repeatable from
# one kernel restart to the next.
SEED = 42
np.random.seed(SEED)
r.seed(SEED)

# The three ratios must sum to 1
TRAIN_RATIO = 0.8
VAL_RATIO = 0.15
TEST_RATIO = 0.05
# Compared with a tolerance: a sum of binary floats is not guaranteed to be
# exactly 1.0 even when the decimal ratios add up to 1.
assert np.isclose(TRAIN_RATIO + VAL_RATIO + TEST_RATIO, 1.0), \
    'TRAIN_RATIO + VAL_RATIO + TEST_RATIO must be equal to 1'

# Hyperparameters
EPOCHS = 100
BATCH_SIZE = 32
PATIENCE = 2

# Fractional [start, stop) bounds of each split inside the shuffled dataset.
# The splits are contiguous: each one starts where the previous one stops.
START_TRAIN = 0
STOP_TRAIN = TRAIN_RATIO
START_VAL = TRAIN_RATIO
STOP_VAL = START_VAL + VAL_RATIO
START_TEST = STOP_VAL
STOP_TEST = START_TEST + TEST_RATIO

print(f'distribution => train : [{START_TRAIN}:{STOP_TRAIN}] val : [{START_VAL}:{STOP_VAL}]  test : [{START_TEST}:{STOP_TEST}]')

# Shared permutation of the dataset indices; left empty here and filled in
# lazily by DatasetGenerator._getshuffle.
indices = []


## Dataset

### Métriques du dataset

#### Distribution de la taille des images ( 80s )

In [None]:
def analyze_image_sizes(folder_path):
    """Plot histograms of the widths and heights of the images in a folder.

    Only files whose name ends with .png, .jpg or .jpeg (case-insensitive)
    are opened. Two side-by-side histograms are shown: widths on the left,
    heights on the right.

    Args:
        folder_path: directory whose image files are measured.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Collect the (width, height) pair of every image in the folder
    sizes = []
    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(image_extensions):
            continue
        with Image.open(os.path.join(folder_path, entry)) as picture:
            sizes.append(picture.size)

    widths = [size[0] for size in sizes]
    heights = [size[1] for size in sizes]

    # Draw both distributions on one figure
    fig, (ax_width, ax_height) = plt.subplots(1, 2, figsize=(12, 6))

    ax_width.hist(widths, bins=20, color='blue', alpha=0.7)
    ax_width.set_title('Distribution des largeurs')
    ax_width.set_xlabel('Largeur')
    ax_width.set_ylabel('Nombre d\'images')

    ax_height.hist(heights, bins=20, color='green', alpha=0.7)
    ax_height.set_title('Distribution des hauteurs')
    ax_height.set_xlabel('Hauteur')

    fig.tight_layout()
    plt.show()

analyze_image_sizes(PATH_PHOTO_FOLDER)

#### Informations sur les images

In [None]:
def get_folder_info(folder):
    """Return the number of image files in `folder`.

    Only entries whose name ends with .png, .jpg or .jpeg (case-insensitive)
    are counted. This is the same extension filter the rest of the notebook
    uses when it reads images, so the reported counts match what is actually
    loaded instead of including unrelated files found in the directory.

    Args:
        folder: path of the directory to inspect.

    Returns:
        The number of matching entries, as an int.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    return sum(
        1
        for filename in os.listdir(folder)
        if filename.lower().endswith(image_extensions)
    )
# Photo folder
photo_length = get_folder_info(PATH_PHOTO_FOLDER)
print(f'Nombre d\'images dans le dossier photo: {photo_length}')

# Non-photo folders: one count per folder, then their total
no_photo_folders_len = [get_folder_info(folder_path) for folder_path in PATH_NO_PHOTO_FOLDERS]
no_photo_len = sum(no_photo_folders_len)
print(f'Nombre d\'images dans les dossiers sans photo: {no_photo_len}')
for folder_path, folder_len in zip(PATH_NO_PHOTO_FOLDERS, no_photo_folders_len):
    print(f'Nombre d\'images dans le dossier {folder_path}: {folder_len}')

### Générateur du dataset

In [None]:
class DatasetGenerator(Sequence):
    """Keras `Sequence` yielding (images, labels) batches for one data split.

    Images found in PATH_PHOTO_FOLDER get label 1 and images found in
    PATH_NO_PHOTO_FOLDERS get label 0. Every instance lists the same files and
    takes its own slice of one shared random permutation (the module-level
    `indices`), so the 'train', 'val' and 'test' instances do not overlap.

    Args:
        ensemble: which split to serve: 'train', 'val' or 'test'.
        **kwargs: forwarded unchanged to `Sequence.__init__`.

    Raises:
        ValueError: if `ensemble` is not one of the three supported names.
    """

    def _getshuffle(self, lenght, start, stop):
        """Return a copy of the slice [start:stop] of the shared permutation.

        The permutation is stored in the module-level `indices` so that all
        instances slice the same ordering.
        """
        global indices
        # Build the permutation on first use, and rebuild it when the dataset
        # size no longer matches (e.g. the folder list was edited and the
        # generators re-created): a stale permutation would index the wrong
        # files or go out of bounds.
        if len(indices) != lenght:
            indices = np.arange(lenght)
            np.random.shuffle(indices)
        # Copy, so the per-instance shuffle done in on_epoch_end never
        # reorders the shared permutation. The integer dtype keeps the result
        # usable as an index array even when the slice is empty.
        return np.array(indices[start:stop], dtype=int)

    def __init__(self, ensemble, **kwargs):
        super().__init__(**kwargs)

        # Fractional [start, stop) bounds of each split
        split_bounds = {
            'train': (START_TRAIN, STOP_TRAIN),
            'val': (START_VAL, STOP_VAL),
            'test': (START_TEST, STOP_TEST),
        }
        if ensemble not in split_bounds:
            # Previously an unknown name left `start`/`stop` unbound and
            # failed later with an UnboundLocalError.
            raise ValueError(
                f"ensemble doit être 'train', 'val' ou 'test', reçu : {ensemble!r}"
            )

        # Collect the path of every image in a folder, with its label
        def find_paths(folder_path, label):
            # Sorted listing: os.listdir returns entries in arbitrary order,
            # and every instance must build the same path list for the shared
            # permutation to designate the same files in each split.
            paths = [
                os.path.join(folder_path, filename)
                for filename in sorted(os.listdir(folder_path))
                if filename.lower().endswith(('.png', '.jpg', '.jpeg'))
            ]
            labels = [label] * len(paths)
            return paths, labels

        x_path, y = [], []

        # Paths of the photo folder (label 1)
        temp_x_path, temp_y = find_paths(PATH_PHOTO_FOLDER, 1)
        x_path += temp_x_path
        y += temp_y

        # Paths of the non-photo folders (label 0)
        for path in PATH_NO_PHOTO_FOLDERS:
            temp_x_path, temp_y = find_paths(path, 0)
            x_path += temp_x_path
            y += temp_y

        # Turn the fractional bounds of the requested split into positions
        lenght_dataset = len(y)
        start_ratio, stop_ratio = split_bounds[ensemble]
        start = int(start_ratio * lenght_dataset)
        stop = int(stop_ratio * lenght_dataset)

        # Shuffle the data through the indices
        self.indices = self._getshuffle(lenght_dataset, start, stop)
        self.x_path, self.y = x_path, np.array(y)

        # Display a summary of the split
        print(f'Taille du générateur de l\'ensemble {ensemble} = {len(self)}')
        print(f'Nombre d\'images dans le générateur = {len(self.indices)}')
        count_photo_generator = np.sum(self.y[self.indices] == 1)
        count_no_photo_generator = np.sum(self.y[self.indices] == 0)
        print(f'Nombre de photos dans le générateur = {count_photo_generator}')
        print(f'Nombre de non photos dans le générateur = {count_no_photo_generator}')

    def __getitem__(self, index):
        """Load and return batch number `index` as a tuple (x, y).

        `x` stacks the images of the batch, each resized to (WIDTH, HEIGHT)
        and converted to RGB; `y` holds the matching labels.

        Raises:
            IndexError: if `index` is not in [0, len(self)).
        """
        # Out-of-range indices used to return empty arrays silently. Raising
        # IndexError follows the Python sequence protocol, so a loop that
        # iterates through __getitem__ ends after the last batch.
        if not 0 <= index < len(self):
            raise IndexError(f'batch index {index} out of range [0, {len(self)})')

        start_index = index * BATCH_SIZE
        stop_index = (index + 1) * BATCH_SIZE
        chosen_indices = self.indices[start_index:stop_index]

        x, y = [], []

        # Load the images of the batch
        for indice in chosen_indices:
            with Image.open(self.x_path[indice]) as img:
                img = img.resize((WIDTH, HEIGHT))  # Same size for every image
                img = img.convert('RGB')  # Always 3 channels
                x.append(np.asarray(img))
            y.append(self.y[indice])

        return np.array(x), np.array(y)

    def __len__(self):
        # Number of batches per epoch (the last batch may be smaller)
        return int(np.ceil(len(self.indices) / BATCH_SIZE))

    def on_epoch_end(self):
        # Reshuffle this split's own indices between epochs
        np.random.shuffle(self.indices)

# One generator per split, all created with the same loading options.
generator_options = dict(use_multiprocessing=True, workers=6)
separator = '---------------------------------'

train_generator = DatasetGenerator('train', **generator_options)
print(separator)
val_generator = DatasetGenerator('val', **generator_options)
print(separator)
test_generator = DatasetGenerator('test', **generator_options)

### Visualisation des données

In [None]:
# Pick the generator to inspect
generator = train_generator

# Draw a random batch, then a random image inside that batch
batch_index = r.randint(0, len(generator) - 1)
x, y = generator[batch_index]
print(f'x shape: {x.shape}, y shape: {y.shape}')
sample_index = r.randint(0, x.shape[0] - 1)

if y[sample_index] == 1:
    label = 'Photo'
else:
    label = 'No photo'

fig, ax = plt.subplots()
ax.imshow(x[sample_index])
ax.set_title(label)
ax.axis('off')
plt.show()

## Modèle

### Chargement d'un modèle préexistant

In [None]:
# Load a previously saved model from Drive.
# NOTE(review): the model-creation cell that follows rebinds `model`, so
# running both cells in order discards the model loaded here.
pretrained_model_path = "drive/My Drive/Livrable1.h5"
model = load_model(pretrained_model_path)
model.summary()

### Création du modèle

In [None]:
# Input stage: fixed-size RGB image, pixel values rescaled by 1/255.
model_layers = [
    layers.InputLayer(shape=(HEIGHT, WIDTH, CHANNELS)),
    layers.Rescaling(1./255),
]

# Four convolution stages with stride 2, each followed by batch
# normalisation and a leaky ReLU activation.
for n_filters in (32, 64, 128, 256):
    model_layers.append(layers.Conv2D(n_filters, (3, 3), padding='same', strides=2))
    model_layers.append(layers.BatchNormalization())
    model_layers.append(layers.Activation('leaky_relu'))

# Classification head: dense layer with dropout, then one sigmoid unit.
model_layers.extend([
    layers.Flatten(),
    layers.Dense(128),
    layers.BatchNormalization(),
    layers.Activation('leaky_relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])

model = tf.keras.models.Sequential(model_layers, name='photo_classifier')

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Render the model architecture to a PNG file named after the model.
# The call stays the last expression of the cell so its result is displayed.
architecture_file = f'Livrable1_{model.name}.png'
plot_model(model, to_file=architecture_file, show_shapes=True, show_layer_names=True)


### Entrainement du modèle

In [None]:
# Early stopping on the validation loss; the best weights are restored
# when training stops.
early_callback = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
)

# Training run
fit_options = dict(
    validation_data=val_generator,
    epochs=EPOCHS,
    callbacks=[early_callback],
)
history = model.fit(train_generator, **fit_options)

### Courbe d'entrainement

In [None]:
# Training and validation curves: loss on the left, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

curves = [('loss', 'Loss'), ('accuracy', 'Accuracy')]
for ax, (metric_key, metric_name) in zip(axes, curves):
    ax.plot(history.history[metric_key], label=f'Training {metric_name}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    ax.set_title(f'Training and Validation {metric_name}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_name)
    ax.legend()

fig.tight_layout()
plt.show()

### Sauvegarde du modèle

In [None]:
# Save the model next to the notebook, in a file named after the model.
saved_model_path = f'Livrable1_{model.name}.keras'
model.save(saved_model_path)

## Evaluation

### Matrice de confusion

In [None]:
# Predictions
generators_by_name = {
    'train': train_generator,
    'val': val_generator,
    'test': test_generator,
}
generator_type = input('Entrer l\'ensemble de données à tester (train, val, test) : ').strip()
# Reject unknown names: otherwise `generator` would silently keep whatever an
# earlier cell assigned to it and the plot title would name the wrong split.
if generator_type not in generators_by_name:
    raise ValueError(f"Ensemble inconnu : {generator_type!r} (attendu : train, val ou test)")
generator = generators_by_name[generator_type]

res_pred = []
res_true = []
n_batches = len(generator)
# Predict batch by batch. Batches are fetched by index over exactly
# len(generator) steps, so the loop ends without needing a manual break.
for batch_index in range(n_batches):
    x, y = generator[batch_index]
    y_pred = model.predict(x, verbose=0)
    # Round the sigmoid outputs to hard 0/1 labels
    res_pred += np.round(y_pred).astype(int).flatten().tolist()
    res_true += y.tolist()
    print(f'Batch {batch_index + 1} / {n_batches}')
res_pred = np.array(res_pred)
res_true = np.array(res_true)

# Confusion matrix. `labels=[0, 1]` forces a 2x2 matrix even when only one
# class appears, so unpacking the four cells below always works.
cm = confusion_matrix(res_true, res_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# Each score falls back to 0 when its denominator is 0 (no predicted or no
# actual positives), instead of a division by zero producing NaN.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0


# Display the confusion matrix; the title names the evaluated split.
# The first score is the precision, so it is labelled 'Prec' (it was
# previously shown as 'Acc').
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No photo', 'Photo'], yticklabels=['No photo', 'Photo'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix - {generator_type} Prec: {precision:.2f}, Rec: {recall:.2f}, F1: {f1:.2f}')
plt.show()

### Test unitaire

In [None]:
generator_type = 'val'
available_generators = {
    'train': train_generator,
    'val': val_generator,
    'test': test_generator,
}
if generator_type in available_generators:
    generator = available_generators[generator_type]

# Pick one image at random from the generator
sample_position = r.randint(0, len(generator.indices) - 1)
dataset_index = generator.indices[sample_position]
img_path = generator.x_path[dataset_index]
label = 'Photo' if generator.y[dataset_index] == 1 else 'No photo'

# Load the image with the same preprocessing as the generator
with Image.open(img_path) as source_img:
    img = source_img.resize((WIDTH, HEIGHT))  # Same size for every image
    img = img.convert('RGB')  # Always 3 channels

# Run the prediction on a batch holding that single image
img = np.expand_dims(np.array(img), axis=0)
prediction = model.predict(img)
if prediction[0][0] > 0.5:
    pred_label = 'Photo'
else:
    pred_label = 'No photo'

# Show the image with its true and predicted labels
fig, ax = plt.subplots()
ax.imshow(img[0])
ax.set_title(f'Label: {label}, Prediction: {pred_label}')
ax.axis('off')
plt.show()