In [27]:
import os
import shutil

import pandas as pd
from PIL import Image

### Load data into a dataframe

In [28]:
df = pd.read_csv("dog_breed_photos.csv")
df.head()

Unnamed: 0,DogId,Breed,Image,ImageNumber
0,6533,AM STAFF CROSS,65db56cd-18fd-425e-a329-61587d753e81.jpg,1
1,6533,AM STAFF CROSS,745544ba-e959-4307-b556-b68f0b7fb292.jpg,2
2,6533,AM STAFF CROSS,bfa602dc-076b-41fc-9894-65565d571ee6.jpg,3
3,6533,AM STAFF CROSS,918a84e7-24c9-4081-a112-d36ca7105998.jpg,4
4,6533,AM STAFF CROSS,0e84b274-c943-492f-ac67-4ce23df8b33a.jpg,5


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117671 entries, 0 to 117670
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DogId        117671 non-null  int64 
 1   Breed        117671 non-null  object
 2   Image        117671 non-null  object
 3   ImageNumber  117671 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.6+ MB


In [30]:
df.Breed.value_counts()

STAFFORDSHIRE BULL TERRIER CROSS    11084
KELPIE CROSS                         8903
GREYHOUND                            7310
BULL ARAB CROSS                      7150
AUSTRALIAN CATTLE DOG CROSS          6352
                                    ...  
ENGLISH FOXHOUND CROSS                  1
GORDON SETTER                           1
BASSET HOUND CROSS                      1
WATER SPANIEL CROSS                     1
GOLDENDOODLE CROSS                      1
Name: Breed, Length: 279, dtype: int64

### Cleaning dataset
narrow down to only pure breeds

In [31]:
# https://stackoverflow.com/a/26577689
search_for = ['CROSS', 'MIXED']
pure_breed = df[df["Breed"].str.contains('|'.join(search_for)) == False]
pure_breed_count = pure_breed["Breed"].value_counts()
pure_breed_count[pure_breed_count > 500]

GREYHOUND                              7310
STAFFORDSHIRE BULL TERRIER             1859
KELPIE                                 1410
AUSTRALIAN CATTLE DOG                   989
AMERICAN STAFFORDSHIRE BULL TERRIER     859
BULL ARAB                               857
GERMAN SHEPHERD                         839
SIBERIAN HUSKY                          805
BORDER COLLIE                           802
BOXER                                   796
JACK RUSSELL TERRIER                    691
BULL TERRIER                            558
AMERICAN BULLDOG                        506
Name: Breed, dtype: int64

taking only the breeds from "AMERICAN STAFFORDSHIRE BULL TERRIER" to "BORDER COLLIE"
("AMERICAN STAFFORDSHIRE BULL TERRIER", "BULL ARAB", "GERMAN SHEPHERD", "SIBERIAN HUSKY", "BORDER COLLIE")

In [32]:
search_for = ["AMERICAN STAFFORDSHIRE BULL TERRIER", "BULL ARAB", "GERMAN SHEPHERD", "SIBERIAN HUSKY", "BORDER COLLIE"]
dataset = pure_breed[pure_breed["Breed"].str.contains('|'.join(search_for)) == True]
dataset.head()

Unnamed: 0,DogId,Breed,Image,ImageNumber
328,6673,SIBERIAN HUSKY,4d924d95-de4c-43c0-a586-9fb19cadf905.jpg,1
329,6673,SIBERIAN HUSKY,851cc9ee-fc4d-430c-99f6-6425b8bbbe6b.jpg,2
330,6673,SIBERIAN HUSKY,7d055dfa-dc6f-4f2b-a7be-dc8bc666836f.jpg,3
331,6673,SIBERIAN HUSKY,f1f55f0a-c3d6-442b-b686-71b83b510dc5.jpg,4
332,6671,SIBERIAN HUSKY,a2e49a61-7c6a-46f0-a80e-9f32e596c1df.jpg,1


In [33]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4162 entries, 328 to 117183
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DogId        4162 non-null   int64 
 1   Breed        4162 non-null   object
 2   Image        4162 non-null   object
 3   ImageNumber  4162 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 162.6+ KB


### Check if a file is a valid image file

In [34]:
location = r"dog_breed_photos"
broken_images = []
for image in dataset['Image']:
    try:
        Image.open(os.path.join(location, image))
    except IOError:
        broken_images.append(image)
len(broken_images)

12

In [35]:
broken_images_df = dataset[dataset['Image'].str.contains('|'.join(broken_images))]
broken_images_df

Unnamed: 0,DogId,Breed,Image,ImageNumber
108766,36586,BULL ARAB,c1a143ae-02fa-48e3-91e8-b5e78ff5069b.jpg,1
108767,36586,BULL ARAB,0d2dc25d-bdda-4c30-b739-c72ef3307deb.jpg,2
108768,36586,BULL ARAB,0dbf2135-96d1-4149-bae9-675f7f1aa360.jpg,3
108769,36586,BULL ARAB,e5a3e529-1cd0-41bd-abc4-4b8a5f30e1e9.jpg,4
108770,36586,BULL ARAB,c900d764-2227-4c7c-bc65-677334a1dc11.jpg,5
108771,36586,BULL ARAB,97f79ead-4ad5-46d8-9351-2b6b0158878a.jpg,6
114961,37869,BULL ARAB,2eb22090-51a3-44f0-a35a-32bf24f55f25.jpg,1
116613,38428,BULL ARAB,a65e1622-e75d-492d-b783-33ec2a953de2.jpg,1
116614,38428,BULL ARAB,81db1ca5-557c-4533-a469-d19dc7494672.jpg,2
116615,38428,BULL ARAB,3e749e27-1a9a-411f-a66b-0417f82ec791.jpg,3


## Delete those 12 broken images from dataset

In [36]:
dataset.drop(labels=broken_images_df.index, axis=0, inplace=True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4150 entries, 328 to 117183
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DogId        4150 non-null   int64 
 1   Breed        4150 non-null   object
 2   Image        4150 non-null   object
 3   ImageNumber  4150 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 162.1+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.drop(labels=broken_images_df.index, axis=0, inplace=True)


## Delete wrongly labeled Images

In [37]:
wrong_label = ['0b6d18df-b3e5-410c-89bd-20082809d669.jpg',
               '1b39e0ef-d49b-476d-9516-ba3eec6ef5d7.jpg',
               '1de403a2-db84-4f33-9974-bbfdbb4a10ea.jpg',
               '1f2b3dc0-1945-491d-a21a-dd9bd07ad0a6.jpg',
               '2b814eb2-3327-464c-b481-697a9000deec.jpg',
               '2d4c0bdd-ea93-40e4-a249-f22c7745a83b.jpg',
               '3fbc03c1-f535-4661-ae51-7ea93eeeee2a.jpg',
               '04e91b7c-efdc-4bb5-816d-542669847e8c.jpg',
               '4a696f81-9d85-478e-815d-7f91517dc6eb.jpg',
               '4cee3786-ebda-474a-8aee-32c0191b979c.jpg',
               '5d911b5b-5f82-4c9c-a74d-a7bd21de3f16.jpg',
               '5fb0618f-4919-4aae-b085-1cb712821f2e.jpg',
               '035ca613-d8a4-4cef-8325-5ed6e0c29da4.jpg',
               '61f631d0-0d65-4468-ac3e-3b614d214b21.jpg',
               '87f8cff2-a139-48b1-89b8-dbde357faf0b.jpg',
               '0412b6b2-d606-47de-b827-542c81821d72.jpg',
               '532fd80d-0ef9-4db8-a654-26da7cce5800.jpg',
               '564d2813-0bea-4d0e-b634-c58b77414480.jpg',
               '564d2813-0bea-4d0e-b634-c58b77414480.jpg',
               '840bee44-2d46-4c03-9ebb-5fc94d0f4ccf.jpg',
               '3513c486-0976-42c3-9a7e-9fc4ce8a0b8b.jpg',
               '0857594e-6308-4670-bb37-d7fefead418c.jpg',
               '2376923e-bc15-4889-949d-46d99264256b.jpg',
               '4704636a-c0d1-41a8-af7a-8ff89814b599.jpg',
               '66773738-5e80-460f-8a03-44d892c5b291.jpg',
               'a0afa062-8559-4c38-bd0c-1aa027aa18f9.jpg',
               'a3da77db-bf5c-4d43-aa85-ab779d633f67.jpg',
               'd23f2bb2-da39-48b2-b5e0-dfa839d8840a.jpg',
               'f81dc592-4d8f-4223-9a79-a534903389d6.jpg',
               'f600feee-ff0b-49ec-9e69-97b3090190f1.jpg',
               'f8372af6-93a2-4462-8f52-9b4c3c6f4af2.jpg',
               'ffebd60e-d8ba-406e-b946-d4af1916a0c5.jpg']

wrong_label_df = dataset.loc[dataset['Image'].isin(wrong_label)]
wrong_label_df

Unnamed: 0,DogId,Breed,Image,ImageNumber
58545,23606,BORDER COLLIE,1b39e0ef-d49b-476d-9516-ba3eec6ef5d7.jpg,3
58548,23606,BORDER COLLIE,04e91b7c-efdc-4bb5-816d-542669847e8c.jpg,6
63328,24815,GERMAN SHEPHERD,2b814eb2-3327-464c-b481-697a9000deec.jpg,1
63364,24844,BORDER COLLIE,4cee3786-ebda-474a-8aee-32c0191b979c.jpg,2
91231,32285,BULL ARAB,3fbc03c1-f535-4661-ae51-7ea93eeeee2a.jpg,2
115849,38233,BORDER COLLIE,87f8cff2-a139-48b1-89b8-dbde357faf0b.jpg,1


Since not all images in the folder are in the csv file only 6 where deleted.

In [38]:
dataset.drop(labels=wrong_label_df.index, axis=0, inplace=True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4144 entries, 328 to 117183
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DogId        4144 non-null   int64 
 1   Breed        4144 non-null   object
 2   Image        4144 non-null   object
 3   ImageNumber  4144 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.drop(labels=wrong_label_df.index, axis=0, inplace=True)


### Convert clean dataset back to csv

In [39]:
dataset.to_csv("dataset_CV.csv", index=False)

### Delete all images that are not in the dataset

In [40]:
location = r"dog_breed_photos"
to_be_deleted_images = []
for entry in os.listdir(location):
    if entry not in set(dataset["Image"]):
        to_be_deleted_images.append(entry)
to_be_deleted_images

['04e91b7c-efdc-4bb5-816d-542669847e8c.jpg',
 '0d2dc25d-bdda-4c30-b739-c72ef3307deb.jpg',
 '0dbf2135-96d1-4149-bae9-675f7f1aa360.jpg',
 '1b39e0ef-d49b-476d-9516-ba3eec6ef5d7.jpg',
 '2b814eb2-3327-464c-b481-697a9000deec.jpg',
 '2eb22090-51a3-44f0-a35a-32bf24f55f25.jpg',
 '3e749e27-1a9a-411f-a66b-0417f82ec791.jpg',
 '3fbc03c1-f535-4661-ae51-7ea93eeeee2a.jpg',
 '4cee3786-ebda-474a-8aee-32c0191b979c.jpg',
 '55b005cb-ab7e-4b19-a3d1-7ce5c10634dc.jpg',
 '81db1ca5-557c-4533-a469-d19dc7494672.jpg',
 '85347b8f-e76b-4dce-8b8e-ac5d4aacbb38.jpg',
 '87f8cff2-a139-48b1-89b8-dbde357faf0b.jpg',
 '97f79ead-4ad5-46d8-9351-2b6b0158878a.jpg',
 'a65e1622-e75d-492d-b783-33ec2a953de2.jpg',
 'c1a143ae-02fa-48e3-91e8-b5e78ff5069b.jpg',
 'c900d764-2227-4c7c-bc65-677334a1dc11.jpg',
 'e5a3e529-1cd0-41bd-abc4-4b8a5f30e1e9.jpg']

In [41]:
for file in to_be_deleted_images:
    path = os.path.join(location, file)
    if os.path.exists(path):
        os.remove(path)

# Create new directory with subdirectories of the breeds

In [None]:
directory = "image_classes"
parent_dir = ""
subdirectories = ["AMERICAN STAFFORDSHIRE BULL TERRIER", "BULL ARAB", "GERMAN SHEPHERD", "SIBERIAN HUSKY",
                  "BORDER COLLIE"]

# create parent directory
try:
    # Path
    path = os.path.join(parent_dir, directory)
    os.mkdir(path)
except OSError as error:
    print(error)

# create child directories
parent_dir = directory
for d in subdirectories:
    try:
        path = os.path.join(parent_dir, d)
        os.mkdir(path)
    except OSError as error:
        print(error)

In [None]:
def CopyImagesToCorrespondingDir(image, breed):
    from_dir = "dog_breed_photos"
    to_dir = "image_classes"
    from_path = os.path.join(from_dir, image)
    to_sub_dir = os.path.join(to_dir, breed)
    to_path = os.path.join(to_sub_dir, image)
    shutil.copy(from_path, to_path)

In [None]:
for i, row in dataset.iterrows():
    CopyImagesToCorrespondingDir(row['Image'], row['Breed'])