In [2]:
import os
import cv2
import pandas as pd

# Analyse de la base de données d'entraînement

In [3]:
# Directories holding the training images: one sub-directory of 'train/' per object class
train_class_names = [
    'aeroplane', 'bicycle', 'bus', 'car', 'horse', 'knife',
    'motorcycle', 'person', 'plant', 'skateboard', 'train', 'truck',
]
image_dirs = ['train/' + class_name for class_name in train_class_names]

In [4]:
# Scan every training directory and record, for each image, its file name,
# its pixel dimensions and the directory it was found in.
image_data = []
unreadable_paths = []  # files OpenCV could not decode
for image_dir in image_dirs:
    # Keep regular files only (sub-directories are ignored).
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]

    for image_file in image_files:
        image_path = os.path.join(image_dir, image_file)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning None
            # rather than raising; skip it instead of crashing on `.shape`.
            unreadable_paths.append(image_path)
            continue
        # The first two entries of `shape` are (rows, columns) = (height, width).
        height, width = img.shape[:2]
        image_data.append({'file': image_file, 'width': width, 'height': height, 'dir': image_dir})

if unreadable_paths:
    print(f"{len(unreadable_paths)} unreadable file(s) skipped, e.g. {unreadable_paths[:5]}")

# Explicit columns keep the schema stable even when no image was found.
df = pd.DataFrame(image_data, columns=['file', 'width', 'height', 'dir'])

In [7]:
# Keep only the class name: remove the literal 'train/' text (plain match, not a regex)
class_labels = df['dir'].str.replace(pat='train/', repl='', regex=False)
df['dir'] = class_labels

In [11]:
# The directory name is the object class, so expose it under a more explicit column name.
# Reassigning (instead of inplace=True) keeps the cell chainable and avoids hidden in-place mutation.
df = df.rename(columns={'dir': 'object_type'})

In [12]:
# Preview the first five rows of the training metadata (head() defaults to n=5)
df.head()

Unnamed: 0,file,width,height,object_type
0,src_2_02691156_1d6afc44b053ab07941d71475449eb2...,384,216,aeroplane
1,src_1_02691156_7f09b3b11ae3f22dbe13ce34aa7c0c1...,384,216,aeroplane
2,src_2_02691156_7aac10eb5b285098774a720fca15e35...,384,216,aeroplane
3,src_2_02691156_1b0b1d2cb9f9d5c0575bd26acccafab...,384,216,aeroplane
4,src_2_02691156_4def53f149137451b0009f08a96f38a...,384,216,aeroplane


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,width,height
count,152397.0,152397.0
mean,383.998898,216.001102
std,0.430349,0.430349
min,216.0,216.0
25%,384.0,216.0
50%,384.0,216.0
75%,384.0,216.0
max,384.0,384.0


 Le dataframe comporte 152 397 observations, soit autant d'images disponibles pour l'entraînement du modèle. La largeur et la hauteur
 des images varient toutes deux entre 216 et 384 pixels ; la quasi-totalité des images mesure 384 × 216 pixels (écart-type de 0,43).

# Analyse de la base de données de test

In [26]:
# Test images are spread over 20 folders named test/trunk01 ... test/trunk20
# (two-digit, zero-padded index).
trunk_list = [f'test/trunk{trunk_number:02d}' for trunk_number in range(1, 21)]

print(trunk_list)

['test/trunk01', 'test/trunk02', 'test/trunk03', 'test/trunk04', 'test/trunk05', 'test/trunk06', 'test/trunk07', 'test/trunk08', 'test/trunk09', 'test/trunk10', 'test/trunk11', 'test/trunk12', 'test/trunk13', 'test/trunk14', 'test/trunk15', 'test/trunk16', 'test/trunk17', 'test/trunk18', 'test/trunk19', 'test/trunk20']


In [27]:
# Scan every test folder and record, for each image, its file name,
# its pixel dimensions and the folder it was found in.
image_data = []
unreadable_paths = []  # files OpenCV could not decode
for image_dir in trunk_list:
    # Keep regular files only (sub-directories are ignored).
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]

    for image_file in image_files:
        image_path = os.path.join(image_dir, image_file)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning None
            # rather than raising; skip it instead of crashing on `.shape`.
            unreadable_paths.append(image_path)
            continue
        # The first two entries of `shape` are (rows, columns) = (height, width).
        height, width = img.shape[:2]
        image_data.append({'file': image_file, 'width': width, 'height': height, 'dir': image_dir})

if unreadable_paths:
    print(f"{len(unreadable_paths)} unreadable file(s) skipped, e.g. {unreadable_paths[:5]}")

# Explicit columns keep the schema stable even when no image was found.
df_test = pd.DataFrame(image_data, columns=['file', 'width', 'height', 'dir'])

In [28]:
# Preview the first five rows of the test metadata (head() defaults to n=5)
df_test.head()

Unnamed: 0,file,width,height,dir
0,aee27c09dc17c298fb661b36d3387511.jpg,479,359,test/trunk01
1,77bd5c0a0ee6095a1ef6b75a8e0cc713.jpg,94,148,test/trunk01
2,cd4427fb3c7b2a782a7dd1539d5f0f8a.jpg,1279,719,test/trunk01
3,24d1d1368947149b25e7d6003fd6f846.jpg,273,239,test/trunk01
4,6d269505fc9f6583cb6b4e2d96046821.jpg,1279,226,test/trunk01


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_test.describe()

Unnamed: 0,width,height
count,72372.0,72372.0
mean,406.713232,287.794009
std,293.804057,158.228341
min,70.0,70.0
25%,201.0,175.0
50%,338.0,258.0
75%,479.0,359.0
max,1279.0,719.0


Le jeu de données de test comporte 72 372 images. La taille des images est beaucoup plus variée que pour le jeu de données d'entraînement. En effet, on remarque que la largeur des images va de 70 à 1279 pixels et la hauteur de 70 à 719 pixels. De plus, l'écart-type est de 293,8 pour la largeur et de 158,2 pour la hauteur, ce qui est beaucoup plus que les 0,43 observés précédemment.

# Analyse du jeu de validation

# Liste des répertoires contenant les images

In [33]:
# Directories holding the validation images: one sub-directory of 'validation/' per object class
val_class_names = [
    'aeroplane', 'bicycle', 'bus', 'car', 'horse', 'knife',
    'motorcycle', 'person', 'plant', 'skateboard', 'train', 'truck',
]
image_dirs_val = ['validation/' + class_name for class_name in val_class_names]

In [35]:
# Scan every validation directory and record, for each image, its file name,
# its pixel dimensions and the directory it was found in.
image_data = []
unreadable_paths = []  # files OpenCV could not decode
for image_dir in image_dirs_val:
    # Keep regular files only (sub-directories are ignored).
    image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]

    for image_file in image_files:
        image_path = os.path.join(image_dir, image_file)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning None
            # rather than raising; skip it instead of crashing on `.shape`.
            unreadable_paths.append(image_path)
            continue
        # The first two entries of `shape` are (rows, columns) = (height, width).
        height, width = img.shape[:2]
        image_data.append({'file': image_file, 'width': width, 'height': height, 'dir': image_dir})

if unreadable_paths:
    print(f"{len(unreadable_paths)} unreadable file(s) skipped, e.g. {unreadable_paths[:5]}")

# Explicit columns keep the schema stable even when no image was found.
df_val = pd.DataFrame(image_data, columns=['file', 'width', 'height', 'dir'])

In [36]:
# Preview the first five rows of the validation metadata (head() defaults to n=5)
df_val.head()

Unnamed: 0,file,width,height,dir
0,aeroplane_157679.jpg,640,318,validation/aeroplane
1,aeroplane_156567.jpg,640,349,validation/aeroplane
2,aeroplane_159898.jpg,586,225,validation/aeroplane
3,aeroplane_156573.jpg,151,98,validation/aeroplane
4,aeroplane_1793775.jpg,143,150,validation/aeroplane


In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_val.describe()

Unnamed: 0,width,height
count,55388.0,55388.0
mean,275.599913,228.240431
std,180.958418,132.049515
min,71.0,71.0
25%,127.0,118.0
50%,210.0,191.0
75%,396.0,315.0
max,640.0,640.0


Dans le jeu de validation, nous avons 55 388 images. L'écart-type est plus faible que pour le jeu de test, pour la largeur comme pour la hauteur des images (181,0 et 132,0), et les images sont en moyenne plus petites (largeur moyenne de 275,6 pixels et hauteur moyenne de 228,2 pixels).