## Imports

In [23]:
import cv2
from skimage import io
import numpy as np
from matplotlib import pyplot as plt
import math
import time
import glob
from tqdm import tqdm

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

from math import factorial

from google_images_download import google_images_download

## Variables

In [5]:
basedir = "/home/romain/Documents/SI3/SSII/classification-images/raw-img"
savedir = "/home/romain/Documents/SI3/SSII/classification-images/saves"

classes = ["pecora", "ragno"]
train_percentage = 0.8

train_amount = []
validation_amount = []
test_amount = []
for cl in classes:
    total = len(glob.glob(f"{basedir}/train/{cl}/*"))
    train_amount.append(int(train_percentage * total))
    validation_amount.append(int((1-train_percentage) * total))
    test_total = len(glob.glob(f"{basedir}/test/{cl}/*"))
    test_amount.append(int(test_total))
    
print(train_amount)
print(test_amount)
print(validation_amount)

[1164, 3084]
[364, 965]
[291, 771]


## Chargement des images

In [4]:
"""Définition des méthodes"""

def load_images(cls, amount=-1, reverse="false", mode="train"):
    
    """
    Renvoie un tableau des amount premières images d'une classe.
    
    Arguments:
        cls: une chaine de caractère pour la classe désirée.
        amount: le nombre d'images à charger.
        reverse: true si vous souhaitez partir de la fin.
        mode: train pour les images d'entrainement, test pour le set de tests
    """
    
    images = []
    list_images = glob.glob(basedir + "/" + mode + "/" + cls + "/*")
    total_range = []
    
    if amount == -1:
        if reverse:
            total_range = range(len(list_images)-1, -1, -1)
        else:
            total_range = range(len(list_images))
    else:
        if reverse:
            total_range = range(len(list_images)-1, len(list_images)-1-amount, -1)
        else:
            total_range = range(amount)
            
    for i in tqdm(total_range, desc="lecture des images de la classe " + cls):
        images.append(io.imread(list_images[i]))
        
    return images

In [83]:
"""Lecture des images en mémoire"""

images = {}

i = 0
for cl in classes:
    images[cl] = load_images(cl, train_amount[i])
    i += 1

lecture des images de la classe pecora: 100%|██████████| 1164/1164 [00:03<00:00, 332.44it/s]
lecture des images de la classe ragno: 100%|██████████| 3084/3084 [00:07<00:00, 440.05it/s]


In [84]:
"""Tests"""

for cl in classes:
    print(len(images[cl]))

1164
3084


## Création des SIFTs

In [85]:
"""Définition des méthodes"""

def build_sifts_from_image_list(images, sifts, sifts_per_image=[]):
    
    """
    Remplit le numpy array sifts passé en paramètres avec les SIFTs de toutes les images.
    
    Arguments:
         images: une liste d'images, dont on souhaite calculer les SIFTs.
         sifts: le numpy array à remplir
         sifts_per_image: le tableau auquel on ajoute le nombre de SIFTs de chaque image.
    """
    
    for image in tqdm(images, desc="construction des SIFTs des images"):
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        sift = cv2.SIFT_create()
        kp, des = sift.detectAndCompute(gray_image, None)
        sifts = np.append(sifts, des, axis=0)
        sifts_per_image.append(len(des))
        
    return sifts

In [86]:
"""Création des SIFTs à partir des images en mémoire"""

sifts = np.empty(shape = (0, 128))
sifts_per_image = []

for cl in classes:
    sifts = build_sifts_from_image_list(images[cl], sifts, sifts_per_image)

construction des SIFTs des images: 100%|██████████| 1164/1164 [04:36<00:00,  4.21it/s]
construction des SIFTs des images: 100%|██████████| 3084/3084 [32:24<00:00,  1.59it/s]


In [87]:
"""Tests"""

print(sifts.shape)
print(sifts_per_image)
print(sum(sifts_per_image))

(2715267, 128)
[3038, 430, 449, 325, 3037, 2147, 2059, 358, 375, 534, 825, 395, 577, 782, 702, 764, 775, 831, 793, 554, 615, 1067, 257, 410, 583, 914, 659, 3444, 323, 440, 311, 582, 735, 523, 290, 635, 480, 756, 670, 1988, 662, 832, 350, 203, 570, 728, 552, 237, 433, 412, 401, 202, 1308, 464, 3634, 716, 449, 305, 388, 1054, 765, 548, 257, 718, 409, 456, 484, 316, 3156, 638, 1028, 1437, 4211, 1429, 604, 746, 505, 294, 438, 546, 1210, 1957, 1548, 1429, 3622, 379, 403, 970, 498, 912, 471, 233, 535, 3197, 641, 434, 592, 654, 2535, 377, 712, 673, 615, 451, 935, 1761, 662, 2666, 3348, 2704, 616, 763, 986, 1302, 286, 1164, 2560, 497, 625, 1280, 436, 580, 248, 454, 564, 535, 526, 234, 1856, 643, 508, 608, 521, 1175, 616, 238, 1344, 362, 713, 486, 510, 256, 337, 482, 692, 367, 602, 1416, 449, 486, 282, 586, 746, 474, 630, 722, 942, 236, 1199, 943, 750, 673, 654, 610, 467, 2361, 518, 529, 412, 410, 613, 654, 534, 573, 537, 339, 330, 1040, 712, 478, 600, 2278, 562, 322, 551, 465, 522, 673, 839, 1

## Partitionnement en clusters & construction des BOWs

In [88]:
def build_clusters_list(sifts, k = 50):
    
    """
    Partitionne les SIFTs en k clusters et retourne le cluster auquel appartient chaque SIFT.
    
    Arguments:
        sifts: le numpy array de SIFTs à partitionner.
        k: le nombre de clusters souhaité.
    """
    
    for _ in tqdm(range(1), desc="Partitionnement en clusters"):
        kMeans = MiniBatchKMeans(n_clusters = k)
        kMeans.fit(sifts)
        return kMeans.labels_


def build_images_bows(labels, sifts_per_image, k = 50):
    
    """
    Construit un numpy array contenant les BOWs de chaque image.
    
    Arguments:
        labels: les labels de chaque SIFTs, CAD à quel cluster appartient chaque SIFT.
        sifts_per_image: le tableau décrivant le nombre de SIFT dans chaque image.
        k: le nombre de clusters.
    """
    
    all_bows = np.zeros(shape = (len(sifts_per_image), k))
    
    parcours = 0
    parcours_bis = 0
    for sift_amount in tqdm(sifts_per_image, desc="Construction des BOWs des images"):
        bow = np.zeros(k)
        for i in range(parcours + sift_amount):
            bow[labels[i]] += 1/sift_amount
        parcours += sift_amount
        all_bows[parcours_bis] = bow
        parcours_bis += 1
        
    return all_bows

In [89]:
"""Création des clusters"""

labels = build_clusters_list(sifts)

Partitionnement en clusters:   0%|          | 0/1 [02:02<?, ?it/s]


In [90]:
"""Construction des BOWs"""

all_bows = build_images_bows(labels, sifts_per_image)

Construction des BOWs des images: 100%|██████████| 4248/4248 [55:13<00:00,  1.28it/s]


In [91]:
"""Tests"""

print(all_bows.shape)
print(all_bows[0])
print(sum(all_bows[0]))

(4248, 50)
[0.0236998  0.02929559 0.02929559 0.01250823 0.01942067 0.01579987
 0.0236998  0.01974984 0.02501646 0.02073733 0.01876234 0.02797893
 0.00493746 0.0154707  0.01876234 0.02797893 0.01086241 0.03357472
 0.01119157 0.00954575 0.02699144 0.01349572 0.00888743 0.02435813
 0.02501646 0.00954575 0.00493746 0.02633311 0.03291639 0.01152074
 0.01152074 0.03982883 0.01974984 0.03291639 0.01020408 0.01053325
 0.02205398 0.01612903 0.02238315 0.02106649 0.0118499  0.0236998
 0.01974984 0.01810402 0.01711652 0.02501646 0.03028308 0.02633311
 0.020079   0.01909151]
0.9999999999999993


## Classification

In [10]:
"""Régression logistique puis classification"""

log_reg = LogisticRegression(max_iter = 100000)

truth = [0]*train_amount[0] + [1]*train_amount[1]
for _ in tqdm(range(1), desc="Classification"):
    log_reg.fit(all_bows, (truth))

Classification: 100%|██████████| 1/1 [00:00<00:00,  3.03it/s]


## Test de l'efficacité

In [121]:
"""Chargement des images de tests"""

test_images = {}

i = 0
for cl in classes:
    test_images[cl] = load_images(cl, test_amount[i], True, "test")
    i += 1

lecture des images de la classe pecora: 100%|██████████| 364/364 [00:01<00:00, 255.05it/s]
lecture des images de la classe ragno: 100%|██████████| 965/965 [00:01<00:00, 483.41it/s]


In [122]:
"""Construction des SIFTs des images de tests"""

test_sifts = np.empty(shape = (0, 128))
test_sifts_per_image = []

for cl in classes:
    test_sifts = build_sifts_from_image_list(test_images[cl], test_sifts, test_sifts_per_image)

construction des SIFTs des images: 100%|██████████| 364/364 [00:29<00:00, 12.22it/s]
construction des SIFTs des images: 100%|██████████| 965/965 [06:21<00:00,  2.53it/s]


In [123]:
"""Partitionnement en clusters"""

test_labels = build_clusters_list(test_sifts)

Partitionnement en clusters:   0%|          | 0/1 [01:20<?, ?it/s]


In [124]:
"""Construction des BoWs"""

test_all_bows = build_images_bows(test_labels, test_sifts_per_image)

Construction des BOWs des images: 100%|██████████| 1329/1329 [06:12<00:00,  3.57it/s]


In [11]:
"""Test du modèle pour les données de test (score correct attendu)"""

test_truth = [0]*test_amount[0] + [1]*test_amount[1]
resTrain = log_reg.predict(test_all_bows)
scoreTrain = f1_score((test_truth), resTrain)
print('score sur le set de validation: {:1.2f}%'.format(scoreTrain*100))

score sur le set de validation: 85.29%


In [12]:
"""Test du modèle pour les données d'apprentissage (score élevé attendu)"""

res = log_reg.predict(all_bows)
score = f1_score((truth), res)
print('score sur le set d\'entraînement: {:1.2f}%'.format(score*100))

score sur le set d'entraînement: 99.82%


## Classification polynomiale

In [166]:
def build_bows_second_degree(bows, k = 50):
    
    """
    Construit la liste des BOWs de degré 2 à partir d'une liste de BOWs.
    Argument:
        bows: la liste de BOWs.
        k: la taille de chaque BOW.
    """
    
    bows_second_degree = np.empty(shape = (0, 2*k + int(math.factorial(k) / (2 * math.factorial(k-2))) ))
    for bow in tqdm(bows, desc="Construction des BOWs de degré 2"):
        bow_second_degree = build_bow_second_degree(bow, k)
        bows_second_degree = np.append(bows_second_degree, [bow_second_degree], axis=0)
        
    return bows_second_degree


def build_bow_second_degree(bow, k = 50):
    bow_second_degree = np.empty(shape = 0)
    for i in range(k):
        bow_second_degree = np.append(bow_second_degree, bow[i])
        for j in range(i, k):
            bow_second_degree = np.append(bow_second_degree, bow[i] * bow[j])
            
    return bow_second_degree

In [167]:
"""Construction"""

all_bows_second_degree = build_bows_second_degree(all_bows)

Construction des BOWs de degré 2: 100%|██████████| 4248/4248 [01:12<00:00, 58.72it/s]


In [168]:
"""Tests"""

print(all_bows.shape)
print(all_bows_second_degree.shape)
print(all_bows_second_degree[0])

(4248, 50)
(4248, 1325)
[0.0236998  0.00056168 0.0006943  ... 0.00038334 0.01909151 0.00036449]


In [13]:
"""Régression logistique puis classification"""

log_reg_second_degree = LogisticRegression(max_iter = 100000)

truth_second_degre = [0]*train_amount[0] + [1]*train_amount[1]
for _ in tqdm(range(1), desc="Classification"):
    log_reg_second_degree.fit(all_bows_second_degree, (truth_second_degre))

Classification: 100%|██████████| 1/1 [00:17<00:00, 17.28s/it]


## Test du modèle

In [175]:
"""Construction"""

test_all_bows_second_degree = build_bows_second_degree(test_all_bows)

Construction des BOWs de degré 2: 100%|██████████| 1329/1329 [00:16<00:00, 83.04it/s]


In [14]:
"""Test du modèle pour les données de test"""

test_truth_second_degre = [0]*test_amount[0] + [1]*test_amount[1]

test_res_second_degre = log_reg_second_degree.predict(test_all_bows_second_degree)
test_score_second_degre = f1_score((test_truth_second_degre), test_res_second_degre)
print('score sur le set de tests: {:1.2f}%'.format(test_score_second_degre*100))

score sur le set de tests: 86.94%


In [15]:
"""Test du modèle pour les données d'apprentissage (score élevé attendu)"""

res_second_degre = log_reg_second_degree.predict(all_bows_second_degree)
score_second_degre = f1_score((truth_second_degre), res_second_degre)
print(f'score sur le set d\'entraînement: {score_second_degre*100}%')

score sur le set d'entraînement: 100.0%


## Chargement du set de validation

In [193]:
"""Chargement des images de validation"""

validation_images = {}

i = 0
for cl in classes:
    validation_images[cl] = load_images(cl, validation_amount[i], True)
    i += 1

lecture des images de la classe pecora: 100%|██████████| 291/291 [00:01<00:00, 288.57it/s]
lecture des images de la classe ragno: 100%|██████████| 771/771 [00:02<00:00, 345.49it/s]


In [194]:
"""Construction des SIFTs des images de validation"""

validation_sifts = np.empty(shape = (0, 128))
validation_sifts_per_image = []

for cl in classes:
    validation_sifts = build_sifts_from_image_list(validation_images[cl], validation_sifts, validation_sifts_per_image)

construction des SIFTs des images: 100%|██████████| 291/291 [00:58<00:00,  4.95it/s]
construction des SIFTs des images: 100%|██████████| 771/771 [06:40<00:00,  1.93it/s]


In [195]:
"""Partitionnement en clusters"""

validation_labels = build_clusters_list(validation_sifts)

Partitionnement en clusters:   0%|          | 0/1 [02:03<?, ?it/s]


In [196]:
"""Construction des BoWs"""

validation_all_bows = build_images_bows(validation_labels, validation_sifts_per_image)

Construction des BOWs des images: 100%|██████████| 1062/1062 [03:28<00:00,  5.09it/s]


In [197]:
"""Construction"""

validation_all_bows_second_degree = build_bows_second_degree(validation_all_bows)

Construction des BOWs de degré 2: 100%|██████████| 1062/1062 [00:12<00:00, 86.17it/s]


In [16]:
"""Test du modèle pour les données de validation"""

validation_truth_second_degre = [0]*validation_amount[0] + [1]*validation_amount[1]

validation_res_second_degre = log_reg_second_degree.predict(validation_all_bows_second_degree)
validation_score_second_degre = f1_score((validation_truth_second_degre), validation_res_second_degre)
print('score sur le set de validation: {:1.2f}%'.format(validation_score_second_degre*100))

score sur le set de validation: 90.07%


## Normalisation des BOWs

In [13]:
def build_normalized_bows(bows, k = 50):
    
    """
    Construit la liste des BOWs de degré 2 à partir d'une liste de BOWs.
    Argument:
        bows: la liste de BOWs.
        k: la taille de chaque BOW.
    """
    
    normalized_bows = np.empty(shape = (0, 50))
    for bow in tqdm(bows, desc="Construction des BOWs normalisés"):
        normalized_bow = build_normalized_bow(bow, k)
        normalized_bows = np.append(normalized_bows, [normalized_bow], axis=0)
        
    return normalized_bows


def build_normalized_bow(bow, k = 50):
    bow_second_degree = np.empty(shape = 0)
    
    somme = 0
    for i in range(k):
        somme += bow[i]
    moyenne = somme / k
    
    somme_bis = 0
    for i in range(k):
        somme_bis += (bow[i] - moyenne) ** 2
    variance = somme_bis / k
    
    for i in range(k):
        bow_second_degree = np.append(bow_second_degree, (bow[i] - moyenne) / variance)
            
    return bow_second_degree

In [14]:
"""Construction"""

all_normalized_bows = build_normalized_bows(all_bows)

Construction des BOWs normalisés: 100%|██████████| 4248/4248 [00:02<00:00, 2104.44it/s]


In [15]:
"""Tests"""

print(all_bows.shape)
print(all_normalized_bows.shape)
print(all_normalized_bows[0])

(4248, 50)
(4248, 50)
[  59.408778    149.26191199  149.26191199 -120.29748997   -9.30244211
  -67.44270527   59.408778     -4.01696364   80.55069188   11.83947177
  -19.87339905  128.11999811 -241.86349478  -72.72818374  -19.87339905
  128.11999811 -146.72488232  217.97313209 -141.43940385 -167.8667962
  112.2635627  -104.44105456 -178.43775314   69.97973494   80.55069188
 -167.8667962  -241.86349478  101.69260576  207.40217515 -136.15392538
 -136.15392538  318.39722302   -4.01696364  207.40217515 -157.29583926
 -152.01036079   32.98138565  -62.1572268    38.26686412   17.12495024
 -130.86844691   59.408778     -4.01696364  -30.44435599  -46.3007914
   80.55069188  165.1183474   101.69260576    1.26851483  -14.58792058]


In [16]:
"""Régression logistique puis classification"""

normalized_log_reg = LogisticRegression(max_iter = 100000)

normalized_truth = [0]*train_amount[0] + [1]*train_amount[1]
for _ in tqdm(range(1), desc="Classification"):
    normalized_log_reg.fit(all_normalized_bows, (normalized_truth))

Classification: 100%|██████████| 1/1 [00:00<00:00,  6.77it/s]


## Test du modèle

In [17]:
"""Construction"""

test_all_normalized_bows = build_normalized_bows(test_all_bows)

Construction des BOWs normalisés: 100%|██████████| 1329/1329 [00:00<00:00, 2379.13it/s]


In [19]:
"""Test du modèle pour les données de test"""

normalized_test_truth = [0]*test_amount[0] + [1]*test_amount[1]

normalized_test_res = normalized_log_reg.predict(test_all_normalized_bows)
normalized_test_score = f1_score((normalized_test_truth), normalized_test_res)
print('score sur le set de tests: {:1.2f}%'.format(normalized_test_score*100))

score sur le set de tests: 84.13%


In [20]:
"""Test du modèle pour les données d'apprentissage (score élevé attendu)"""

normalized_res = normalized_log_reg.predict(all_normalized_bows)
normalized_score = f1_score((normalized_truth), normalized_res)
print(f'score sur le set d\'entraînement: {normalized_score*100}%')

score sur le set d'entraînement: 96.22520793346129%


In [21]:
"""Construction"""

normalized_validation_all_bows = build_normalized_bows(validation_all_bows)

Construction des BOWs normalisés: 100%|██████████| 1062/1062 [00:00<00:00, 2376.19it/s]


In [22]:
"""Test du modèle pour les données de validation"""

normalized_validation_truth = [0]*validation_amount[0] + [1]*validation_amount[1]

normalized_validation_res = normalized_log_reg.predict(normalized_validation_all_bows)
normalized_validation_score_second_degre = f1_score((normalized_validation_truth), normalized_validation_res)
print('score sur le set de validation: {:1.2f}%'.format(normalized_validation_score_second_degre*100))

score sur le set de validation: 84.12%


## Sauvegarde des BOWs dans des fichiers

In [200]:
"""Sauvegarde"""

file = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_entrainement", "wb")
np.save(file, all_bows)
file.close()

file2 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_validation", "wb")
np.save(file2, validation_all_bows)
file2.close()

file3 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_test", "wb")
np.save(file3, test_all_bows)
file3.close()

file4 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_entrainement_second_degré", "wb")
np.save(file4, all_bows_second_degree)
file4.close()

file5 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_validation_second_degré", "wb")
np.save(file5, validation_all_bows_second_degree)
file5.close()

file6 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_test_second_degré", "wb")
np.save(file6, test_all_bows_second_degree)
file6.close()

In [6]:
"""Chargement depuis un fichier"""

file = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_entrainement", "rb")
all_bows_from_file = np.load(file)
file.close()

file2 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_validation", "rb")
validation_all_bows_from_file = np.load(file2)
file2.close()

file3 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_test", "rb")
test_all_bows_from_file = np.load(file3)
file3.close()

file4 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_entrainement_second_degré", "rb")
all_bows_second_degree_from_file = np.load(file4)
file4.close()

file5 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_validation_second_degré", "rb")
validation_all_bows_second_degree_from_file = np.load(file5)
file5.close()

file6 = open(f"{savedir}/BoWs/bows_{classes[0]}_{classes[1]}_test_second_degré", "rb")
test_all_bows_second_degree_from_file = np.load(file6)
file6.close()

In [8]:
"""Remplacer les variables de base par celles des fichiers"""

all_bows = all_bows_from_file
validation_all_bows = validation_all_bows_from_file
test_all_bows = test_all_bows_from_file
all_bows_second_degree = all_bows_second_degree_from_file
validation_all_bows_second_degree = validation_all_bows_second_degree_from_file
test_all_bows_second_degree = test_all_bows_second_degree_from_file

## Bonus : test du modèle sur des images aléatoires (work in progress)