**Business Understanding**

In [None]:
#Load data 
#scale, resize the images
#Inspect some of the images
#display the Image classes

# Loading the Data

In [5]:
#Importing libraries
import numpy as np
import pandas as pd
from PIL import Image
import os
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import array_to_img, img_to_array,load_img
import matplotlib.pyplot as plt

In [6]:
#Loading the data

#Defining the main folder with subfolders for each class
folder_path='vehicles'

#creating an image data generator with rescaling
datagen=ImageDataGenerator(rescale=1./255)

#Loading the images from the different folders

image_data=datagen.flow_from_directory(folder_path,target_size=(224,224),batch_size=32,class_mode='categorical')

#Displaying the class labels and their indices

print(image_data.class_indices)


Found 5587 images belonging to 7 classes.
{'Auto Rickshaws': 0, 'Bikes': 1, 'Cars': 2, 'Motorcycles': 3, 'Planes': 4, 'Ships': 5, 'Trains': 6}


The dataset comprises a total of 5,587 images distributed across 7 classes. Each class has been assigned a unique label and corresponding index: Auto Rickshaws (0), Bikes (1), Cars (2), Motorcycles (3), Planes (4), Ships (5), and Trains (6).

# Data Cleaning

1. Checking for Duplicates

2. Checking for corrupt images

In [10]:
#Checking for corrupt images

# Dictionary to store counts per class
corrupt_images = {}  

# Counter for total corrupt images
total_corrupt = 0  

# Looping through class subfolders
for category in os.listdir(folder_path):  
    class_path = os.path.join(folder_path, category)
    
    if not os.path.isdir(class_path):  # Skip non-folder files
        continue

    corrupt_count = 0  # Counter for this class

    for img_name in os.listdir(class_path):  # Loop through images in class
        img_path = os.path.join(class_path, img_name)

        try:
            img = Image.open(img_path)  # Try opening the image
            img.verify()  # Check if it's valid
        except Exception as e:
            corrupt_count += 1  # Increment class counter
            total_corrupt += 1  # Increment total counter
            print(f"Corrupt image found in {category}: {img_path} - {e}")

    if corrupt_count > 0:
        corrupt_images[category] = corrupt_count  # Store in dictionary

# Print results
print("\nSummary of Corrupt Images:")
for category, count in corrupt_images.items():
    corrupt_images= print(f"{category}: {count} corrupt images")

print(f"\nTotal corrupt images found: {total_corrupt}")


Corrupt image found in Auto Rickshaws: vehicles\Auto Rickshaws\Auto Rickshaw (262).png - cannot identify image file 'vehicles\\Auto Rickshaws\\Auto Rickshaw (262).png'
Corrupt image found in Auto Rickshaws: vehicles\Auto Rickshaws\Auto Rickshaw (276).jpg - cannot identify image file 'vehicles\\Auto Rickshaws\\Auto Rickshaw (276).jpg'
Corrupt image found in Auto Rickshaws: vehicles\Auto Rickshaws\Auto Rickshaw (313).jpg - cannot identify image file 'vehicles\\Auto Rickshaws\\Auto Rickshaw (313).jpg'
Corrupt image found in Auto Rickshaws: vehicles\Auto Rickshaws\Auto Rickshaw (315).jpg - cannot identify image file 'vehicles\\Auto Rickshaws\\Auto Rickshaw (315).jpg'
Corrupt image found in Auto Rickshaws: vehicles\Auto Rickshaws\Auto Rickshaw (385).png - cannot identify image file 'vehicles\\Auto Rickshaws\\Auto Rickshaw (385).png'
Corrupt image found in Auto Rickshaws: vehicles\Auto Rickshaws\Auto Rickshaw (418).jpg - cannot identify image file 'vehicles\\Auto Rickshaws\\Auto Rickshaw (41

**Interpretation**

The dataset contains some corrupted images, including 20 from the Auto Rickshaws class and 2 from the Planes class, resulting in a total of 22 corrupted images.

In [12]:
# Displaying the corrupted images
for img_path in corrupt_images:
    try:
        img = Image.open(img_path)
        plt.figure()
        plt.imshow(img)
        plt.axis("off")
        plt.title(f"Corrupt Image: {os.path.basename(img_path)}")
        plt.show()
    except Exception as e:
        print(f"Cannot display {img_path}: {e}")


Cannot display path_to_corrupt_image1.jpg: [Errno 2] No such file or directory: 'path_to_corrupt_image1.jpg'
Cannot display path_to_corrupt_image2.jpg: [Errno 2] No such file or directory: 'path_to_corrupt_image2.jpg'
Cannot display path_to_corrupt_image3.jpg: [Errno 2] No such file or directory: 'path_to_corrupt_image3.jpg'


**Interpretation**

Displaying the corrupted images was not possible as the error displays since them being corrupted means that they are incomplete, damaged, or in an unrecognizable format.

In [13]:
#Deleting the corrupted images
for img_path in corrupt_images:
    try:
        os.remove(img_path)  # Delete the file
        print(f"Deleted: {img_path}")
    except Exception as e:
        print(f"Error deleting {img_path}: {e}")

print(f"\nTotal deleted corrupt images: {len(corrupt_images)}")

Error deleting path_to_corrupt_image1.jpg: [WinError 2] The system cannot find the file specified: 'path_to_corrupt_image1.jpg'
Error deleting path_to_corrupt_image2.jpg: [WinError 2] The system cannot find the file specified: 'path_to_corrupt_image2.jpg'
Error deleting path_to_corrupt_image3.jpg: [WinError 2] The system cannot find the file specified: 'path_to_corrupt_image3.jpg'

Total deleted corrupt images: 3
