In [1]:
import os
import pandas as pd
from PIL import Image

### Load data into a dataframe

In [68]:
df = pd.read_csv("dataset_CV.csv")
df

Unnamed: 0,DogId,Breed,Image,ImageNumber
0,9473,AMERICAN STAFFORDSHIRE BULL TERRIER,89f3c062-8928-487b-b7c0-8ab50f44641e.jpg,1
1,9473,AMERICAN STAFFORDSHIRE BULL TERRIER,e9e33b00-5ec4-4742-ac02-b9ee2a88314f.jpg,2
2,9473,AMERICAN STAFFORDSHIRE BULL TERRIER,2a81e50c-efdb-4e98-9220-d6945753731d.jpg,3
3,9473,AMERICAN STAFFORDSHIRE BULL TERRIER,0609e883-2c18-40b2-a4a5-8a90754a94f6.jpg,4
4,9548,AMERICAN STAFFORDSHIRE BULL TERRIER,10fb3092-842a-466b-a3db-3c88f624c6f4.jpg,1
...,...,...,...,...
4139,38513,BORDER COLLIE,e3957a7c-9289-4441-bdbd-d2289bef745d.jpg,5
4140,38513,BORDER COLLIE,48afd0ab-baf2-4ee8-9584-9563cec529e7.jpg,6
4141,38513,BORDER COLLIE,8f8c4ea0-7426-4188-bb6c-94fa1ecb1e33.jpg,7
4142,38513,BORDER COLLIE,cce5499f-eb04-4d8c-8d3f-c9c9bee16c5c.jpg,8


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4144 entries, 0 to 4143
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DogId        4144 non-null   int64 
 1   Breed        4144 non-null   object
 2   Image        4144 non-null   object
 3   ImageNumber  4144 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 129.6+ KB


In [70]:
df.Breed.value_counts()

AMERICAN STAFFORDSHIRE BULL TERRIER    859
BULL ARAB                              844
GERMAN SHEPHERD                        838
SIBERIAN HUSKY                         805
BORDER COLLIE                          798
Name: Breed, dtype: int64

### Cleaning dataset
narrow down to only pure breeds

In [None]:
# https://stackoverflow.com/a/26577689
search_for = ['CROSS', 'MIXED']
pure_breed = df[df["Breed"].str.contains('|'.join(search_for)) == False]
pure_breed_count = pure_breed["Breed"].value_counts()
pure_breed_count[pure_breed_count > 500]

taking only the breeds from "AMERICAN STAFFORDSHIRE BULL TERRIER" to "BORDER COLLIE"
("AMERICAN STAFFORDSHIRE BULL TERRIER", "BULL ARAB", "GERMAN SHEPHERD", "SIBERIAN HUSKY", "BORDER COLLIE")

In [None]:
search_for = ["AMERICAN STAFFORDSHIRE BULL TERRIER", "BULL ARAB", "GERMAN SHEPHERD", "SIBERIAN HUSKY", "BORDER COLLIE"]
dataset = pure_breed[pure_breed["Breed"].str.contains('|'.join(search_for)) == True]
dataset

### Delete all images that are not in the dataset

In [None]:
location = r"dog_breed_photos"
to_be_deleted_images = []
for entry in os.listdir(location):
    if entry not in set(dataset["Image"]):
        to_be_deleted_images.append(entry)
to_be_deleted_images

In [None]:
for file in to_be_deleted_images:
    path = os.path.join(location, file)
    if os.path.exists(path):
        os.remove(path)

### Check if a file is a valid image file

In [6]:
location = r"dog_breed_photos"
broken_images = []
for image in dataset['Image']:
    try:
        Image.open(os.path.join(location, image))
    except IOError:
        broken_images.append(image)
len(broken_images)

12

In [7]:
broken_images_df = df[df['Image'].str.contains('|'.join(broken_images))]
broken_images_df

Unnamed: 0.1,Unnamed: 0,DogId,Breed,Image,ImageNumber
1604,108766,36586,BULL ARAB,c1a143ae-02fa-48e3-91e8-b5e78ff5069b.jpg,1
1605,108767,36586,BULL ARAB,0d2dc25d-bdda-4c30-b739-c72ef3307deb.jpg,2
1606,108768,36586,BULL ARAB,0dbf2135-96d1-4149-bae9-675f7f1aa360.jpg,3
1607,108769,36586,BULL ARAB,e5a3e529-1cd0-41bd-abc4-4b8a5f30e1e9.jpg,4
1608,108770,36586,BULL ARAB,c900d764-2227-4c7c-bc65-677334a1dc11.jpg,5
1609,108771,36586,BULL ARAB,97f79ead-4ad5-46d8-9351-2b6b0158878a.jpg,6
1700,114961,37869,BULL ARAB,2eb22090-51a3-44f0-a35a-32bf24f55f25.jpg,1
1703,116613,38428,BULL ARAB,a65e1622-e75d-492d-b783-33ec2a953de2.jpg,1
1704,116614,38428,BULL ARAB,81db1ca5-557c-4533-a469-d19dc7494672.jpg,2
1705,116615,38428,BULL ARAB,3e749e27-1a9a-411f-a66b-0417f82ec791.jpg,3


## Delete those 12 broken images from dataset

In [8]:
df.drop(labels=broken_images_df.index, axis=0, inplace=True)

In [71]:
df.drop('Unnamed: 0', inplace=True, axis=1)
df

KeyError: "['Unnamed: 0'] not found in axis"

## Delete wrongly labeled Images

In [73]:
wrong_label = ['0b6d18df-b3e5-410c-89bd-20082809d669.jpg', 
               '1b39e0ef-d49b-476d-9516-ba3eec6ef5d7.jpg',
              '1de403a2-db84-4f33-9974-bbfdbb4a10ea.jpg',
              '1f2b3dc0-1945-491d-a21a-dd9bd07ad0a6.jpg',
              '2b814eb2-3327-464c-b481-697a9000deec.jpg',
              '2d4c0bdd-ea93-40e4-a249-f22c7745a83b.jpg',
              '3fbc03c1-f535-4661-ae51-7ea93eeeee2a.jpg',
              '04e91b7c-efdc-4bb5-816d-542669847e8c.jpg',
              '4a696f81-9d85-478e-815d-7f91517dc6eb.jpg',
              '4cee3786-ebda-474a-8aee-32c0191b979c.jpg',
              '5d911b5b-5f82-4c9c-a74d-a7bd21de3f16.jpg',
              '5fb0618f-4919-4aae-b085-1cb712821f2e.jpg',
              '035ca613-d8a4-4cef-8325-5ed6e0c29da4.jpg',
              '61f631d0-0d65-4468-ac3e-3b614d214b21.jpg',
              '87f8cff2-a139-48b1-89b8-dbde357faf0b.jpg',
              '0412b6b2-d606-47de-b827-542c81821d72.jpg',
              '532fd80d-0ef9-4db8-a654-26da7cce5800.jpg',
              '564d2813-0bea-4d0e-b634-c58b77414480.jpg',
              '564d2813-0bea-4d0e-b634-c58b77414480.jpg',
              '840bee44-2d46-4c03-9ebb-5fc94d0f4ccf.jpg',
              '3513c486-0976-42c3-9a7e-9fc4ce8a0b8b.jpg',
              '0857594e-6308-4670-bb37-d7fefead418c.jpg',
              '2376923e-bc15-4889-949d-46d99264256b.jpg',
              '4704636a-c0d1-41a8-af7a-8ff89814b599.jpg',
              '66773738-5e80-460f-8a03-44d892c5b291.jpg',
              'a0afa062-8559-4c38-bd0c-1aa027aa18f9.jpg',
              'a3da77db-bf5c-4d43-aa85-ab779d633f67.jpg',
              'd23f2bb2-da39-48b2-b5e0-dfa839d8840a.jpg',
              'f81dc592-4d8f-4223-9a79-a534903389d6.jpg',
              'f600feee-ff0b-49ec-9e69-97b3090190f1.jpg',
              'f8372af6-93a2-4462-8f52-9b4c3c6f4af2.jpg',
              'ffebd60e-d8ba-406e-b946-d4af1916a0c5.jpg']

dataset = df.loc[df['Image'].isin(wrong_label)]
df.drop(labels=dataset.index, axis=0, inplace=True)        
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DogId        0 non-null      int64 
 1   Breed        0 non-null      object
 2   Image        0 non-null      object
 3   ImageNumber  0 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 108.0+ bytes


Since not all images in the folder are in the csv file only 6 where deleted.

### Convert clean dataset back to csv

In [66]:
dataset.to_csv("dataset_CV.csv", index=False)